-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
150 lines (116 loc) · 4.72 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
const puppeteer = require('puppeteer');
const ExcelJS = require('exceljs');
const fs = require('fs');
const readline = require('readline');
const chalk = require('chalk');
var log = require('loglevel');
const prefix = require('loglevel-plugin-prefix');
// Chalk style used for the level label of each log line, keyed by the
// upper-cased level name.
const colors = {
  TRACE: chalk.magenta,
  DEBUG: chalk.cyan,
  INFO: chalk.blue,
  WARN: chalk.yellow,
  ERROR: chalk.red,
};

// Register the prefix plugin on the root logger and enable every level while
// the prefixes are installed; the level is set to INFO at the end of this section.
prefix.reg(log);
log.enableAll();

// Root logger prefix: grey timestamp, level in its own colour, green logger name.
prefix.apply(log, {
  format: (level, name, timestamp) => {
    const stamp = chalk.gray(`[${timestamp}]`);
    const paint = colors[level.toUpperCase()];
    const label = paint(level);
    const source = chalk.green(`${name}:`);
    return `${stamp} ${label} ${source}`;
  },
});

// The logger named 'critical' gets the whole prefix in bold red.
const criticalLogger = log.getLogger('critical');
prefix.apply(criticalLogger, {
  format: (level, name, timestamp) => {
    const text = `[${timestamp}] ${level} ${name}:`;
    return chalk.red.bold(text);
  },
});

log.setLevel("INFO");
console.log("====== ALLMUSIC-SCRAPER ======");
// Read the album identifiers from urllist.txt (one per line) into urlList.
//
// The file is read synchronously on purpose: the list has to be complete
// before anything else in the script looks at it. Filling it from
// asynchronous readline 'line' events left the array empty at the debug log
// below and made the scraper depend on the browser start-up being slower than
// the file read.
//
// Each line is trimmed (this also removes a leading byte-order mark and a
// stray '\r'), and blank lines are skipped so they are not turned into
// requests for the bare base URL.
const urlList = fs
  .readFileSync('urllist.txt', 'utf8')
  .split(/\r\n|\n|\r/)
  .map((line) => line.trim())
  .filter((line) => line.length > 0)
  .map((url) => ({ url }));
log.debug(urlList);
// Prefix that every entry of the URL list is appended to when a page is requested.
const baseUrl = "https://www.allmusic.com/album/";

// Output workbook with a single sheet; addWorksheet hands back the sheet it creates.
const workbook = new ExcelJS.Workbook();
const worksheet = workbook.addWorksheet('My Sheet');

// Column layout as [header, key, width]; the keys are the property names used
// when rows are added to the sheet.
const columnLayout = [
  ['PublishedDate', 'PublishedDate', 15],
  ['AMG ID', 'AlbumId', 15],
  ['Artist', 'Artist', 20],
  ['Album', 'Album', 20],
  ['Genre', 'Genre', 30],
  ['Styles', 'Styles', 50],
];
worksheet.columns = columnLayout.map(([header, key, width]) => ({ header, key, width }));

// Running tally of albums that failed, and the list entries they came from.
let errorCount = 0;
const errorUrls = [];
/**
 * Main routine: launches a headless browser, visits baseUrl + entry.url for
 * every entry of urlList, reads the page's JSON-LD metadata and its ".styles"
 * element, and appends one row per album to the worksheet. The workbook is
 * then written to ./album-info.xlsx and a summary of failed albums is logged.
 *
 * Relies on module-level values defined earlier in the file: puppeteer, log,
 * urlList, baseUrl, workbook, worksheet, errorCount and errorUrls.
 *
 * A failure on a single album is logged and recorded in errorCount/errorUrls
 * and the loop continues. Any other failure (browser launch, writing the
 * workbook) is logged by the trailing catch handler, which also sets a
 * non-zero exit code.
 */
(async () => {
  const pageTimeout = 0; // passed to page.goto as `timeout`; 0 turns the navigation timeout off
  const userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1)';
  const browserOptions = {
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--no-first-run',
      '--no-zygote',
    ],
  };
  const pageOptions = {
    timeout: pageTimeout,
    waitUntil: 'domcontentloaded',
  };
  const STYLES_SELECTOR = '.styles';
  const JSON_SELECTOR = 'script[type="application/ld+json"]';
  const delayBetweenPagesMs = 600;

  // Plain timer-based delay. page.waitForTimeout is deprecated and missing
  // from current Puppeteer releases; calling it there would throw after the
  // row had already been added and mark every album as failed.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch(browserOptions);
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1024, height: 1280 });
    await page.setUserAgent(userAgent);
    log.info(`Albums to retrieve: ${urlList.length}`);

    // Loop through the list of albums and retrieve the metadata for each.
    for (let i = 0; i < urlList.length; i++) {
      const albumId = urlList[i].url;
      log.info(`Retrieving metadata for ${albumId} | ${i + 1}/${urlList.length}`);
      try {
        await page.goto(baseUrl + albumId, pageOptions);

        const json = await page.$(JSON_SELECTOR);
        if (json === null) {
          // Fail with a readable message instead of a TypeError on null.
          throw new Error('no JSON-LD metadata element found on the page');
        }
        const jsondata = await json.evaluate((element) => element.innerText);

        // The styles element is optional; styledata stays undefined without it.
        const styles = await page.$(STYLES_SELECTOR);
        const styledata = await styles?.evaluate((element) => element.innerText);

        const metadata = JSON.parse(jsondata);
        log.debug(metadata);

        const published = metadata?.datePublished;
        const genre = metadata?.genre;

        // Add a new row
        worksheet.addRow({
          // Optional chaining on byArtist too, so an album without artist
          // data still produces a row instead of throwing.
          Artist: metadata?.byArtist?.[0]?.name,
          AlbumId: albumId,
          // null as well as undefined yields an empty cell (new Date(null)
          // would be the 1970 epoch).
          PublishedDate: published == null ? "" : new Date(published),
          Album: metadata?.name,
          // genre may be a single string rather than an array.
          Genre: Array.isArray(genre) ? genre.join(',') : genre,
          // Drops the first 7 characters — presumably a "Styles" heading plus
          // newline; verify against the page markup — then joins the
          // remaining lines with ", ".
          Styles: styledata?.substring(7).split('\n').join(', '),
        });

        await pause(delayBetweenPagesMs);
      } catch (e) {
        log.error(`${albumId} | main program error:${e}`);
        errorCount++;
        errorUrls.push(albumId);
      }
    }

    await workbook.xlsx.writeFile("./album-info.xlsx");
  } finally {
    // Always shut the browser down (this also closes its pages), even when
    // something above threw, so no headless process is left behind.
    await browser.close();
  }

  log.info("Scraping completed.");
  if (errorCount > 0) {
    log.warn(`${errorCount} album(s) could not be retrieved:`);
    log.warn(errorUrls.join(','));
  }
})().catch((err) => {
  // Do not leave the top-level promise rejection unhandled.
  log.error(`Fatal error: ${err}`);
  process.exitCode = 1;
});