-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper-post-process.ts
99 lines (87 loc) · 3.97 KB
/
scraper-post-process.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import { cheerio } from "https://deno.land/x/[email protected]/mod.ts";
/** One forum post as extracted from a scraped topic page. */
interface Post {
  /** Author of the post: display name and the link taken from the author anchor. */
  creator?: {
    name: string;
    url: string;
  };
  /** Address of the post (topic base URL followed by the post number). */
  url: string;
  /** When the post was published, if the page exposed it. */
  publicationDate?: Date;
  /** When the post was last modified, if the page exposed it. */
  lastModificationDate?: Date;
  /** Inner HTML of the post body. */
  content?: string;
}
/** Base URL of the scraped forum topic; a post number is appended to address a post. */
const FORUM_BASE_URL = 'https://forum.obsidian.md/t/dataview-plugin-snippet-showcase/13673/';

// The filename is the first invocation argument (same name as downloaded_filename).
// NOTE(review): nothing in the visible code reads it yet.
const filename = Deno.args[0];
//const data = await readJSON(filename)

// Post number read from maxLimit.txt; used to request the page from which the
// highest post number is determined.
const maxLimit = Number(await Deno.readTextFile("./maxLimit.txt"));

// Getting the last post id: fetch the page addressed by maxLimit and take the
// highest post number found on it.
const lastPostResponse = await fetch(`${FORUM_BASE_URL}${maxLimit}`, { redirect: 'follow' });
const lastPostHtml = await lastPostResponse.text();
const maxPostNumber = await getMaxPostNumber(lastPostHtml);

// Watermark stored in lastPostId.txt (written at the end of this script).
const lastPostId = Number(await Deno.readTextFile("./lastPostId.txt"));

// Walk backwards from the newest post down to (but excluding) the watermark.
for (let i = maxPostNumber; i > lastPostId; --i) {
  const textResponse = await fetch(`${FORUM_BASE_URL}${i}`, { redirect: 'follow' });
  const textData = await textResponse.text();
  // await Deno.writeTextFile(`./raw_content_${i}.html`, textData);
  await processPost(textData, i);
}

// Only advance the watermark when a usable post number was scraped; otherwise a
// failed or empty page would reset it to 0 (or NaN). writeTextFile expects a
// string, so the number is converted explicitly.
if (Number.isFinite(maxPostNumber) && maxPostNumber > 0) {
  await Deno.writeTextFile("./lastPostId.txt", String(maxPostNumber));
}
/**
 * Finds the highest post number among the posts contained in a topic page.
 *
 * The declared return type is left as `Promise<any>` so existing call sites
 * keep type-checking; the resolved value is always a number.
 *
 * @param html Raw HTML of a topic page.
 * @returns The largest post number found, or 0 when the page has no usable posts.
 */
async function getMaxPostNumber(html: string): Promise<any> {
  const $ = cheerio.load(html);
  let maxPost = 0;
  // `each` iterates synchronously, so there is nothing to await here.
  $('div[class="topic-body crawler-post"]').each(function (_index, element) {
    const postNumber = getPostNumber(cheerio.load(element));
    // Skip unparsable numbers: a single NaN would turn Math.max's result into NaN.
    if (Number.isFinite(postNumber)) {
      maxPost = Math.max(maxPost, postNumber);
    }
  });
  return maxPost;
}
/**
 * Reads the post number from a post's position marker.
 *
 * @param data Selector function scoped to a single post.
 * @returns The text of `span[itemprop=position]` with its first `#` removed,
 *          converted with `Number` (so it may be NaN for non-numeric text).
 */
function getPostNumber(data: any): number {
  const positionLabel: string = data('span[itemprop=position]').text();
  const digits = positionLabel.replace('#', '');
  return Number(digits);
}
/**
 * Extracts every post found in a topic page and writes each post whose content
 * contains a dataview code fence to `./scripts/` as a Markdown file.
 *
 * @param html Raw HTML of a topic page.
 * @param postIndex Post number the page was requested for.
 *                  NOTE(review): currently unused; every post on the page is processed.
 */
async function processPost(html: string, postIndex: number) {
  const $ = cheerio.load(html);
  const posts = $('div[class="topic-body crawler-post"]');
  const rawData: Array<Post> = [];

  // `each` iterates synchronously, so there is nothing to await here.
  posts.each(function (_index, element) {
    const post = cheerio.load(element);
    const creatorName = post('a[itemprop=url] > span[itemprop=name]');
    const creatorUrl = post('span[class=creator] > a[itemprop=url]');
    const publishDate = post('span[class=crawler-post-infos] > time')
      .first().attr('datetime')?.toString();
    const lastModDate = post('span[class=crawler-post-infos] > meta')
      .first().attr('content')?.toString();
    const postId = getPostNumber(post);
    const content = post('div[class=post]')?.html() || '';
    rawData.push({
      creator: {
        url: creatorUrl.first().attr('href')?.toString() || '',
        name: creatorName.text(),
      },
      url: `${FORUM_BASE_URL}${postId}`,
      publicationDate: publishDate ? new Date(publishDate) : undefined,
      lastModificationDate: lastModDate ? new Date(lastModDate) : undefined,
      content,
    });
  });

  const dataviewCodes = rawData.filter(post => post.content?.includes('```dataview'));
  if (dataviewCodes.length === 0) {
    return;
  }

  // Make sure the output directory exists before the first write.
  await Deno.mkdir('./scripts', { recursive: true });

  for (const codePost of dataviewCodes) {
    const content = generateContent(codePost);
    const rawName = `${codePost.creator?.name || ''}_${codePost.publicationDate?.getTime()}.md`;
    // The author name is scraped text: neutralise path separators and characters
    // that are illegal in file names so the write cannot leave ./scripts/ or fail.
    const fileName = rawName.replace(/[\\/:*?"<>|\x00-\x1f]/g, '_');
    await Deno.writeTextFile(`./scripts/${fileName}`, content);
  }
}
/**
 * Renders a post as a Markdown document: a front-matter header (empty title,
 * author, publication date, last modification date) followed by the post content.
 *
 * @param codePost Post to render.
 * @returns The complete file content.
 */
function generateContent(codePost: Post): string {
  const { creator, publicationDate, lastModificationDate, content } = codePost;
  const frontMatter = [
    '---\n',
    'title:\n',
    generateMetaItem('author', generateName(creator?.name, creator?.url)),
    generateMetaItem('Publication Date', publicationDate?.toDateString()),
    generateMetaItem('Last modification Date', lastModificationDate?.toDateString()),
    '---\n',
  ].join('');
  return `${frontMatter}\n${content}`;
}
/**
 * Formats an author label as `name (url)`.
 *
 * Missing or empty parts are left out instead of being rendered as the text
 * "undefined" or as empty parentheses.
 *
 * @param name Author display name, if known.
 * @param url Author profile URL, if known.
 * @returns `name (url)`, just `name`, just `(url)`, or an empty string when
 *          neither part is available.
 */
function generateName(name: string | undefined, url: string | undefined): string {
  const parts = [name, url ? `(${url})` : undefined];
  return parts.filter((part): part is string => Boolean(part)).join(' ');
}
/**
 * Builds one `label: value` line for the front-matter header.
 *
 * @param label Key to emit.
 * @param item Value to emit.
 * @returns The line terminated by a newline, or an empty string when `item`
 *          is undefined or empty.
 */
function generateMetaItem(label: string, item: string | undefined): string {
  if (!item) {
    return '';
  }
  return label + ': ' + item + '\n';
}