-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_replies.js
153 lines (123 loc) · 4.53 KB
/
twitter_replies.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// Phantombuster configuration {
"phantombuster command: nodejs"
"phantombuster package: 5"
"phantombuster flags: save-folder"
"phantombuster dependencies: lib-Mattr-Helper.js"
const Buster = require("phantombuster")
const buster = new Buster()
const Nick = require("nickjs")
const nick = new Nick()
const MattrHelper = require('./lib-Mattr-Helper');
// }
nick.newTab().then(async (tab) => {
// Agent setup: build the project helper, validate the input and prepare the page for scraping.
// NOTE(review): MattrHelper comes from lib-Mattr-Helper.js, which is not visible here — its
// exact behavior (including what openTab does beyond taking the url) should be confirmed there.
const mattrHelper = new MattrHelper(buster, nick, tab);
// Fail fast when the agent was launched without a `url` argument; the thrown error is
// reported by the .catch() handler at the end of the promise chain.
if (!buster.argument.url) {
throw new Error('Missing url argument to agent');
}
await mattrHelper.openTab(buster.argument.url);
await tab.untilVisible('div[role="main"]'); // Make sure we have loaded the right page
// The in-page functions below (scrapePage, triggerNextPage, checkForHasMoreLink) all use `$`.
await tab.inject("../injectables/jquery-3.0.0.min.js"); // We're going to use jQuery to scrape
/**
 * Collects the replies currently rendered in the page that have not been scraped yet.
 *
 * Handed to tab.evaluate, so it relies on the page-level jQuery `$` injected during setup.
 * NOTE(review): keep this function self-contained — presumably only its own source is
 * shipped to the page, so outer-scope helpers would not be available; confirm with NickJS docs.
 *
 * @param {*} arg - Unused evaluate argument.
 * @param {function} done - Callback: done(null, { results, minPosition }) on success,
 *   done(message) when anything throws.
 */
const scrapePage = (arg, done) => {
  try {
    // Only pick up items that were not marked by an earlier pass.
    const pending = $('.replies-to li.stream-item:not(.tracked)');
    const minPosition = $('.stream-container[data-min-position]').attr('data-min-position');
    const results = [];

    if (pending.length) {
      // Mark the items so the next pass skips them.
      pending.addClass('tracked');
      // Fade them out so processed items are easy to spot in debugging screenshots.
      pending.css('opacity', 0.25);

      pending.each((index, element) => {
        const reply = $(element);
        const id = reply.data('itemId');

        // Work on a copy of the tweet text node and drop its hidden children before reading it.
        const textCopy = reply.find('.tweet-text').clone();
        textCopy.children('.u-hidden').remove();
        const text = textCopy.text();

        const username = reply.find('.account-group .username').text().replace('@', '');
        const userId = reply.find('.account-group').data('userId');
        const createdAt = reply.find('.time [data-time]').data('time');

        // Items without any text are skipped.
        if (text) {
          results.push({ id, text, username, userId, createdAt });
        }
      });
    }

    done(null, { results, minPosition });
  } catch (e) {
    done('Something went wrong while scraping the page');
  }
};
/**
 * Asks the page to load the next batch of replies.
 *
 * Handed to tab.evaluate, so it relies on the page-level jQuery `$` injected during setup.
 * If a "show more threads" button is present it is clicked; otherwise the page is scrolled
 * to trigger loading.
 *
 * @param {*} arg - Unused evaluate argument.
 * @param {function} done - Callback: done(null) once the trigger has been issued,
 *   done(message) when anything throws synchronously.
 */
const triggerNextPage = (arg, done) => {
  try {
    const nextLink = $('.ThreadedConversation-showMoreThreadsButton');
    if (nextLink.length) {
      nextLink.click();
      done(null);
      return;
    }

    // Trigger the next page by scrolling to the bottom.
    // Here we are jumping back to the top first because sometimes doing a direct jump to
    // the bottom is not triggering twitter's javascript to fetch the next page.
    const targetHeight = $('div[role="main"]').height();

    // Resolve the scroll target up front: a lookup failure inside a timer callback would be
    // thrown outside this try/catch. When the overlay element is absent, fall back to
    // scrolling the window instead of throwing.
    // NOTE(review): the window fallback is a best-effort assumption — confirm it suits the
    // page layouts this agent is pointed at.
    const scroller = $('#permalink-overlay')[0] || window;

    setTimeout(() => {
      scroller.scrollTo(0, 1);
    }, 0);
    setTimeout(() => {
      scroller.scrollTo(0, targetHeight);
    }, 100);
    // Report completion only after both scroll jumps have been scheduled and run.
    setTimeout(() => {
      done(null);
    }, 200);
  } catch (e) {
    done('Something went wrong while triggering the next page');
  }
};
/**
 * Reports whether the page still has a timeline-end marker flagged as having more items.
 *
 * Handed to tab.evaluate, so it relies on the page-level jQuery `$` injected during setup.
 *
 * @param {*} arg - Unused evaluate argument.
 * @param {function} done - Callback: done(null, hasNextPage) with a boolean.
 */
const checkForHasMoreLink = (arg, done) => {
  const moreMarkers = $('div.timeline-end.has-items.has-more-items');
  done(null, moreMarkers.length > 0);
};
// Ids of the results collected so far. A Set is used (rather than a plain object) so that
// ids matching Object.prototype keys such as "constructor" are not mistaken for
// already-tracked entries. Keys are stringified so 1 and '1' count as the same id.
const tracked = new Set();
// All unique results, in the order they were first seen.
const data = [];

/**
 * Adds one scraped result unless a result with the same id was added before.
 *
 * @param {{id: *}} result - Scraped item; only `id` is inspected here.
 * @returns {boolean} true when the result was added, false when it was a duplicate.
 */
const addResult = (result) => {
  const key = String(result.id);
  if (tracked.has(key)) {
    return false;
  }
  tracked.add(key);
  data.push(result);
  return true;
};

/**
 * Adds every result of one page, skipping duplicates.
 *
 * @param {Array<{id: *}>} results - Items scraped from a single page.
 */
const addResults = (results) => results.forEach((result) => addResult(result));
// Pagination loop: scrape the visible replies, keep them, trigger the next page and repeat.
// The loop stops when a scrape returns no results or, for pages without a minPosition,
// when the in-page check reports that no more items are available.
for (let page = 1; ; page += 1) {
  console.log('Scraping page', page);
  const scraped = await tab.evaluate(scrapePage);
  console.log('Scraped page', page);

  const replies = scraped.results;
  if (!replies.length) {
    console.log('No replies found on page', page);
    break;
  }

  addResults(replies);
  const previousPosition = scraped.minPosition;
  console.log('Preserving', replies.length, 'results from page', page, 'with minPosition', previousPosition);

  // Debugging hook: await tab.screenshot(`${page}-page-before-triggered.jpg`);
  await tab.evaluate(triggerNextPage);
  console.log('Next page triggered from page', page);
  // Debugging hook: await tab.screenshot(`${page}-page-after-triggered.jpg`);

  if (previousPosition) {
    // Wait until the container no longer carries the position we just scraped.
    await tab.waitWhileVisible(`.stream-container[data-min-position="${previousPosition}"]`);
    continue;
  }

  const hasMore = await tab.evaluate(checkForHasMoreLink);
  if (!hasMore) {
    break;
  }
}
console.log('Scraping finished with', data.length, 'result(s)');
// Order newest first. The comparator must return a negative/zero/positive number; a boolean
// (as `b.createdAt > a.createdAt` yields) never signals "a comes first", so the resulting
// order is not reliable. Missing or non-numeric timestamps are treated as 0 and sort last.
const toTimestamp = (value) => Number(value) || 0;
// Array.prototype.sort works in place, so sortedData is the same array instance as data.
const sortedData = data.sort((a, b) => toTimestamp(b.createdAt) - toTimestamp(a.createdAt));
await buster.setResultObject(sortedData);
})
.then(() => {
console.log("Job done!")
nick.exit()
})
.catch((err) => {
console.log(`Something went wrong: ${err}`)
nick.exit(1)
})