-
Notifications
You must be signed in to change notification settings - Fork 0
/
pageScraper.js
147 lines (126 loc) · 4.75 KB
/
pageScraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/** Pause, in milliseconds, inserted after each scraped page to throttle requests. */
const REQUEST_DELAY_MS = 5000;

/**
 * Concatenates the text of every <p> inside the element with the given id,
 * appending a newline after each paragraph.
 *
 * This function is handed to `page.evaluate`, so it runs inside the browser
 * page: it must stay self-contained and may only use its argument and page
 * globals such as `document`.
 *
 * @param {string} containerId - id (without the leading "#") of the container.
 * @returns {string} the paragraph texts, each followed by "\n".
 */
function extractParagraphText(containerId) {
  const container = document.getElementById(containerId);
  const paragraphs = Array.from(container.getElementsByTagName("p"));
  return paragraphs.map((paragraph) => `${paragraph.textContent}\n`).join("");
}

/**
 * Opens a new tab, navigates to `link`, waits for `selector`, runs `task` on
 * the tab and returns its result. The tab is closed whether or not any step
 * fails, and failures are propagated to the caller.
 *
 * @param {object} browser - object exposing `newPage()`.
 * @param {string} link - URL to open.
 * @param {string} selector - selector that must be present before `task` runs.
 * @param {function(object): Promise<*>} task - work to perform on the page.
 * @returns {Promise<*>} whatever `task` resolves to.
 */
async function withPage(browser, link, selector, task) {
  const page = await browser.newPage();
  try {
    await page.goto(link);
    await page.waitForSelector(selector);
    return await task(page);
  } finally {
    await page.close();
  }
}

const scraperObject = {
  /**
   * Walks the paginated listing and collects the link targets found on each
   * listing page.
   *
   * Listing URLs are built as `url + phpNavigator + offset`, with `offset`
   * going from 0 up to and including `phpNavAmount` in steps of
   * `phpNavIncrement`. Links are de-duplicated per listing page only.
   *
   * @param {object} globals - reads `url`, `linkClass`, `phpNavigator`,
   *   `phpNavAmount` and `phpNavIncrement`.
   * @param {object} browser - object exposing `newPage()`.
   * @returns {Promise<string[]>} hrefs of all elements matching `linkClass`.
   */
  async linkScraper(globals, browser) {
    const { url, linkClass, phpNavigator, phpNavAmount, phpNavIncrement } = globals;
    console.log(`Navigating to ${url}...`);
    const landingPage = await browser.newPage();
    const allPages = [];
    try {
      await landingPage.goto(url);
      console.log('Globals Link Scrapper', globals);
      for (let offset = 0; offset <= phpNavAmount; offset += phpNavIncrement) {
        const listingUrl = `${url}${phpNavigator}${offset}`;
        const pageUrls = await withPage(browser, listingUrl, linkClass, (page) =>
          page.evaluate((selector) => {
            console.log('pageUrls LinkClass', selector);
            const hrefs = Array.from(document.querySelectorAll(selector)).map((anchor) => anchor.href);
            return [...new Set(hrefs)];
          }, linkClass));
        allPages.push(...pageUrls);
        // Awaited so the delay really happens between consecutive requests.
        await sleep(REQUEST_DELAY_MS);
      }
    } finally {
      // The landing page is only navigated to, never read; do not leak the tab.
      await landingPage.close();
    }
    return allPages;
  },

  /**
   * Scrapes the paragraph text of a single page.
   *
   * @param {object} globals - reads `idContainingContent` (element id without "#").
   * @param {object} browser - object exposing `newPage()`.
   * @param {string} pageUrl - URL of the page to scrape.
   * @returns {Promise<string[]>} one-element array holding the page text.
   */
  async pageScraper(globals, browser, pageUrl) {
    const containerId = globals.idContainingContent;
    const text = await withPage(browser, pageUrl, `#${containerId}`, (page) =>
      page.evaluate(extractParagraphText, containerId));
    return [text];
  },

  /**
   * Scrapes every story in `pageUrls`, one after another.
   *
   * If an element matching `chapterSelector` exists on the story page, its
   * `options.length` is taken as the chapter count and each chapter is loaded
   * from `link + chapterOffset + n` for n = 1..count. Otherwise the story
   * page itself is scraped as the only entry.
   *
   * @param {object} globals - reads `idContainingContent`, `chapterSelector`
   *   and `chapterOffset`.
   * @param {object} browser - object exposing `newPage()`.
   * @param {string[]} pageUrls - story URLs.
   * @returns {Promise<string[][]>} per story, the list of chapter texts.
   */
  async pageChapterScraper(globals, browser, pageUrls) {
    console.log('Page Urls:', pageUrls);
    const { idContainingContent: containerId, chapterSelector, chapterOffset } = globals;
    const contentSelector = `#${containerId}`;
    const readText = (page) => page.evaluate(extractParagraphText, containerId);

    const scrapeStory = (link) => withPage(browser, link, contentSelector, async (storyPage) => {
      // One round-trip: null means "no chapter selector on this page".
      const chapterCount = await storyPage.evaluate((selector) => {
        const chapterSelect = document.querySelector(selector);
        return chapterSelect ? chapterSelect.options.length : null;
      }, chapterSelector);

      const story = [];
      if (chapterCount === null) {
        story.push(await readText(storyPage));
        await sleep(REQUEST_DELAY_MS);
        return story;
      }
      for (let chapter = 1; chapter <= chapterCount; chapter++) {
        const chapterUrl = `${link}${chapterOffset}${chapter}`;
        story.push(await withPage(browser, chapterUrl, contentSelector, readText));
        await sleep(REQUEST_DELAY_MS);
      }
      return story;
    });

    const stories = [];
    for (const pageUrl of pageUrls) {
      stories.push(await scrapeStory(pageUrl));
    }
    return stories;
  },
};
/**
 * Returns a promise that settles (with `undefined`) once a `setTimeout`
 * of `ms` milliseconds has fired.
 *
 * @param {number} ms - delay passed to `setTimeout`.
 * @returns {Promise<undefined>}
 */
async function sleep(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
module.exports = scraperObject;