diff --git a/sql-dump-to-arango/index.ts b/sql-dump-to-arango/index.ts
index c06aaca..eb65aff 100644
--- a/sql-dump-to-arango/index.ts
+++ b/sql-dump-to-arango/index.ts
@@ -2,6 +2,7 @@ import { parseDumpContent, sqlDumpStream, sqlDumpStreamFromWeb } from "./parser/
 import { initLangDatabase, insertLinks, insertPages, insertRedirects } from "./arango/connection.js";
 import { createDumpProgressLogger } from "./logger/dumpProgress.js";
 import { CollectionType } from "arangojs/collection.js";
+import { env } from "./env.js";
 
 export type WikiPage = {
   title: string,
@@ -32,12 +33,12 @@ async function parseAndLoadPage() {
   let nextBatch:WikiPage[] = [];
   const { log } = createDumpProgressLogger(info.size, "Page");
 
-  for await (const values of parseDumpContent(stream, ["page_id","page_title", "page_namespace","page_is_redirect"] as const)) {
-    const isRedirect = values[3] == "1";
-    if (values[2] != "0") continue;
+  for await (const [page_id, page_title, page_namespace, page_is_redirect] of parseDumpContent(stream, ["page_id","page_title", "page_namespace","page_is_redirect"] as const)) {
+    const isRedirect = page_is_redirect == "1";
+    if (page_namespace != "0") continue;
     // pageMap.set(values[1],values[0]);
-    pageMap.set(values[1],[values[0],isRedirect]);
-    nextBatch.push({_key: values[0], title:values[1], isRedirect});
+    pageMap.set(page_title,[page_id,isRedirect]);
+    nextBatch.push({_key: page_id, title:page_title, isRedirect});
     count++;
 
     if (count % 32_768 == 0) {
@@ -84,20 +85,20 @@ async function parseAndLoadRedirect() {
   let count = 0;
   let nextBatch:{_from:string,_to:string}[] = [];
   const { log } = createDumpProgressLogger(info.size, "Redirect ");
-  for await (const values of parseDumpContent(stream, ["rd_from","rd_namespace","rd_title","rd_interwiki","rd_fragment"] as const)) {
-    if (values[1] != "0") continue;
-    const _toIsRedirect = pageMap.get(values[2]);
+  for await (const [rd_from, rd_namespace, rd_title, rd_interwiki, rd_fragment] of parseDumpContent(stream, ["rd_from","rd_namespace","rd_title","rd_interwiki","rd_fragment"] as const)) {
+    if (rd_namespace != "0") continue;
+    const _toIsRedirect = pageMap.get(rd_title);
     // the redirect lead to a page that does not exist ( most likely it's not in the article namespace)
     if (_toIsRedirect == null) {
       continue;
     }
     // The redirect lead to another redirect that need to be resolved
     if (_toIsRedirect[1]) {
-      toResolve.push([values[0],values[2]]);
+      toResolve.push([rd_from,rd_title]);
       continue;
     // The redirect lead to a valid page
     } else {
-      nextBatch.push({_from:values[0],_to:_toIsRedirect[0]});
+      nextBatch.push({_from:rd_from,_to:_toIsRedirect[0]});
       count++;
     }
 
@@ -151,6 +152,26 @@ async function parseAndLoadRedirect() {
 
 }
 
+const linkTargetMap = new Map<string, string>();
+
+async function parseLinkTarget() {
+  const { info, stream } = await sqlDumpStream("linktarget");
+  let count = 0;
+  const { log } = createDumpProgressLogger(info.size, "Linktarget");
+
+  for await (const [lt_id,lt_namespace, lt_title] of parseDumpContent(stream, ["lt_id","lt_namespace", "lt_title" ] as const)) {
+    if (lt_namespace != "0") continue;
+    linkTargetMap.set(lt_id, lt_title);
+    count++;
+
+    if (count % (32_768 * 8) == 0) {
+      log(info.bytesRead, count);
+    }
+
+  }
+}
+
+
 async function parseAndLoadPageLinks() {
   const link = langDb.collection<{}>("link");
 
@@ -165,9 +186,69 @@ async function parseAndLoadPageLinks() {
   let count = 0;
   let nextBatch:{_from:string,_to:string}[] = [];
   const { log } = createDumpProgressLogger(info.size, "PageLinks ");
-  for await (const values of parseDumpContent(stream, ["pl_from","pl_namespace","pl_title","pl_from_namespace"] as const)) {
parseDumpContent(stream, ["pl_from","pl_namespace","pl_title","pl_from_namespace"] as const)) { - if (values[1] != "0" || values[3] != "0") continue; - const _toIsRedirect = pageMap.get(values[2]); + for await (const [pl_from, pl_namespace, pl_title, pl_from_namespace] of parseDumpContent(stream, ["pl_from","pl_namespace","pl_title","pl_from_namespace"] as const)) { + if (pl_namespace != "0" || pl_from_namespace != "0") continue; + const _toIsRedirect = pageMap.get(pl_title); + if (_toIsRedirect == null) { + continue; + } + // The link lead to a redirect that need to be resolve; + if (_toIsRedirect[1]) { + const resolvedId = resolveRedirect(redirectMap.get(_toIsRedirect[0])); + if (resolvedId == null) { + continue; + } + nextBatch.push({_from: pl_from, _to: resolvedId}); + count++; + // The link resolve to a valid page + } else { + nextBatch.push({_from: pl_from, _to: _toIsRedirect[0]}); + count++; + } + + + if (count % 4096 == 0) { + await previousBatchPromise; + + const batch = nextBatch; + nextBatch = []; + + previousBatchPromise = insertLinks(batch, link); + // previousBatchPromise = Promise.resolve() as Promise; + if (count % 16_384 == 0) { + log(info.bytesRead, count); + } + } + + } + await previousBatchPromise; + + const batch = nextBatch; + nextBatch = []; + + await insertLinks(batch, link); + log(info.bytesRead, count); + nextBatch = []; +} + +async function parseAndLoadPageLinksWithLinkTarget() { + await parseLinkTarget(); + const link = langDb.collection<{}>("link"); + + try { + await link.create({type: CollectionType.EDGE_COLLECTION}).catch(); + await link.ensureIndex({ type: "persistent", fields: [ "_from", "_to" ], unique: true, inBackground: true}).catch(); + } catch(e) {} + + const { info, stream } = await sqlDumpStream("pagelinks"); + let previousBatchPromise = Promise.resolve() as Promise; + let count = 0; + let nextBatch:{_from:string,_to:string}[] = []; + const { log } = createDumpProgressLogger(info.size, "PageLinks "); + for await (const [pl_from, pl_from_namespace, pl_target_id] of parseDumpContent(stream, ["pl_from", "pl_from_namespace", "pl_target_id"] as const)) { + const toTitle = linkTargetMap.get(pl_target_id); + if (toTitle == null) continue; + const _toIsRedirect = pageMap.get(toTitle); if (_toIsRedirect == null) { continue; } @@ -177,11 +258,11 @@ async function parseAndLoadPageLinks() { if (resolvedId == null) { continue; } - nextBatch.push({_from: values[0], _to: resolvedId}); + nextBatch.push({_from: pl_from, _to: resolvedId}); count++; // The link resolve to a valid page } else { - nextBatch.push({_from: values[0], _to: _toIsRedirect[0]}); + nextBatch.push({_from: pl_from, _to: _toIsRedirect[0]}); count++; } @@ -228,10 +309,14 @@ function resolveRedirect(pageTitle):null|string { } return null; } - await parseAndLoadPage(); await parseAndLoadRedirect(); -await parseAndLoadPageLinks(); +if (["fr"].includes(env.WIKI_LANG)) { + await parseAndLoadPageLinksWithLinkTarget(); +} else { + await parseAndLoadPageLinks(); +} + console.log("finished"); process.exit(0); diff --git a/sql-dump-to-arango/parser/dumpParser.ts b/sql-dump-to-arango/parser/dumpParser.ts index 4559c99..54c301e 100644 --- a/sql-dump-to-arango/parser/dumpParser.ts +++ b/sql-dump-to-arango/parser/dumpParser.ts @@ -5,7 +5,7 @@ import { stat } from "node:fs/promises"; import { env } from "../env.js"; import { mkdir } from "fs/promises"; -export type FileType = "page" | "redirect" | "pagelinks"; +export type FileType = "page" | "redirect" | "pagelinks" | "linktarget"; export async function 
   const path = `./cache/${env.WIKI_LANG}/${env.WIKI_LANG}wiki-latest-${fileType}.sql.gz`;
   if (!existsSync(path)) {