Handle linktarget for frwiki
Sacramentix committed May 26, 2024
1 parent fc11620 commit c11e38e
Showing 2 changed files with 103 additions and 18 deletions.
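
Background for the change: newer MediaWiki dumps normalize link targets. The pagelinks table no longer carries a title; each row stores a pl_target_id that references the new linktarget table, and frwiki's dumps already ship this schema (hence the fr-only gate at the bottom of the first file). The commit therefore loads the linktarget dump into an in-memory lt_id to lt_title map and resolves pagelinks rows through it, keeping the old code path for wikis still on the legacy schema. A minimal sketch of the resulting lookup chain, with names taken from the diff but the module-scope maps passed as parameters and the redirect step reduced to a callback:

// Sketch only: the real code keeps these maps at module scope and
// resolves redirects through redirectMap and resolveRedirect.
function resolveLinkTargetSketch(
    pl_target_id: string,
    linkTargetMap: Map<string, string>,                   // lt_id -> lt_title
    pageMap: Map<string, [string, boolean]>,              // title -> [page id, isRedirect]
    resolveRedirectById: (pageId: string) => string | null
): string | null {
    const title = linkTargetMap.get(pl_target_id);        // step 1: id -> title via linktarget
    if (title == null) return null;                       // unknown or non-article target
    const entry = pageMap.get(title);                     // step 2: title -> page entry
    if (entry == null) return null;                       // no article page with that title
    const [pageId, isRedirect] = entry;
    return isRedirect ? resolveRedirectById(pageId) : pageId; // step 3: follow redirects
}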
119 changes: 102 additions & 17 deletions sql-dump-to-arango/index.ts
@@ -2,6 +2,7 @@ import { parseDumpContent, sqlDumpStream, sqlDumpStreamFromWeb } from "./parser/
 import { initLangDatabase, insertLinks, insertPages, insertRedirects } from "./arango/connection.js";
 import { createDumpProgressLogger } from "./logger/dumpProgress.js";
 import { CollectionType } from "arangojs/collection.js";
+import { env } from "./env.js";
 
 export type WikiPage = {
     title: string,
@@ -32,12 +33,12 @@ async function parseAndLoadPage() {
     let nextBatch:WikiPage[] = [];
     const { log } = createDumpProgressLogger(info.size, "Page");
 
-    for await (const values of parseDumpContent(stream, ["page_id","page_title", "page_namespace","page_is_redirect"] as const)) {
-        const isRedirect = values[3] == "1";
-        if (values[2] != "0") continue;
+    for await (const [page_id, page_title, page_namespace, page_is_redirect] of parseDumpContent(stream, ["page_id","page_title", "page_namespace","page_is_redirect"] as const)) {
+        const isRedirect = page_is_redirect == "1";
+        if (page_namespace != "0") continue;
         // pageMap.set(values[1],values[0]);
-        pageMap.set(values[1],[values[0],isRedirect]);
-        nextBatch.push({_key: values[0], title:values[1], isRedirect});
+        pageMap.set(page_title,[page_id,isRedirect]);
+        nextBatch.push({_key: page_id, title:page_title, isRedirect});
         count++;
 
         if (count % 32_768 == 0) {
@@ -84,20 +85,20 @@ async function parseAndLoadRedirect() {
     let count = 0;
     let nextBatch:{_from:string,_to:string}[] = [];
     const { log } = createDumpProgressLogger(info.size, "Redirect ");
-    for await (const values of parseDumpContent(stream, ["rd_from","rd_namespace","rd_title","rd_interwiki","rd_fragment"] as const)) {
-        if (values[1] != "0") continue;
-        const _toIsRedirect = pageMap.get(values[2]);
+    for await (const [rd_from, rd_namespace, rd_title, rd_interwiki, rd_fragment] of parseDumpContent(stream, ["rd_from","rd_namespace","rd_title","rd_interwiki","rd_fragment"] as const)) {
+        if (rd_namespace != "0") continue;
+        const _toIsRedirect = pageMap.get(rd_title);
         // The redirect leads to a page that does not exist (most likely it's not in the article namespace)
         if (_toIsRedirect == null) {
            continue;
         }
         // The redirect leads to another redirect that needs to be resolved
         if (_toIsRedirect[1]) {
-            toResolve.push([values[0],values[2]]);
+            toResolve.push([rd_from,rd_title]);
            continue;
            // The redirect leads to a valid page
         } else {
-            nextBatch.push({_from:values[0],_to:_toIsRedirect[0]});
+            nextBatch.push({_from:rd_from,_to:_toIsRedirect[0]});
            count++;
         }
 
@@ -151,6 +152,26 @@
 }
 
 
+const linkTargetMap = new Map<string, string>();
+
+async function parseLinkTarget() {
+    const { info, stream } = await sqlDumpStream("linktarget");
+    let count = 0;
+    const { log } = createDumpProgressLogger(info.size, "Linktarget");
+
+    for await (const [lt_id, lt_namespace, lt_title] of parseDumpContent(stream, ["lt_id","lt_namespace", "lt_title" ] as const)) {
+        if (lt_namespace != "0") continue;
+        linkTargetMap.set(lt_id, lt_title);
+        count++;
+
+        if (count % (32_768 * 8) == 0) {
+            log(info.bytesRead, count);
+        }
+
+    }
+}
+
+
 async function parseAndLoadPageLinks() {
 
     const link = langDb.collection<{}>("link");
@@ -165,9 +186,69 @@ async function parseAndLoadPageLinks() {
     let count = 0;
     let nextBatch:{_from:string,_to:string}[] = [];
     const { log } = createDumpProgressLogger(info.size, "PageLinks ");
-    for await (const values of parseDumpContent(stream, ["pl_from","pl_namespace","pl_title","pl_from_namespace"] as const)) {
-        if (values[1] != "0" || values[3] != "0") continue;
-        const _toIsRedirect = pageMap.get(values[2]);
+    for await (const [pl_from, pl_namespace, pl_title, pl_from_namespace] of parseDumpContent(stream, ["pl_from","pl_namespace","pl_title","pl_from_namespace"] as const)) {
+        if (pl_namespace != "0" || pl_from_namespace != "0") continue;
+        const _toIsRedirect = pageMap.get(pl_title);
+        if (_toIsRedirect == null) {
+            continue;
+        }
+        // The link leads to a redirect that needs to be resolved
+        if (_toIsRedirect[1]) {
+            const resolvedId = resolveRedirect(redirectMap.get(_toIsRedirect[0]));
+            if (resolvedId == null) {
+                continue;
+            }
+            nextBatch.push({_from: pl_from, _to: resolvedId});
+            count++;
+            // The link resolves to a valid page
+        } else {
+            nextBatch.push({_from: pl_from, _to: _toIsRedirect[0]});
+            count++;
+        }
+
+
+        if (count % 4096 == 0) {
+            await previousBatchPromise;
+
+            const batch = nextBatch;
+            nextBatch = [];
+
+            previousBatchPromise = insertLinks(batch, link);
+            // previousBatchPromise = Promise.resolve() as Promise<any>;
+            if (count % 16_384 == 0) {
+                log(info.bytesRead, count);
+            }
+        }
+
+    }
+    await previousBatchPromise;
+
+    const batch = nextBatch;
+    nextBatch = [];
+
+    await insertLinks(batch, link);
+    log(info.bytesRead, count);
+    nextBatch = [];
+}
+
+async function parseAndLoadPageLinksWithLinkTarget() {
+    await parseLinkTarget();
+    const link = langDb.collection<{}>("link");
+
+    try {
+        await link.create({type: CollectionType.EDGE_COLLECTION}).catch();
+        await link.ensureIndex({ type: "persistent", fields: [ "_from", "_to" ], unique: true, inBackground: true}).catch();
+    } catch(e) {}
+
+    const { info, stream } = await sqlDumpStream("pagelinks");
+    let previousBatchPromise = Promise.resolve() as Promise<any>;
+    let count = 0;
+    let nextBatch:{_from:string,_to:string}[] = [];
+    const { log } = createDumpProgressLogger(info.size, "PageLinks ");
+    for await (const [pl_from, pl_from_namespace, pl_target_id] of parseDumpContent(stream, ["pl_from", "pl_from_namespace", "pl_target_id"] as const)) {
+        const toTitle = linkTargetMap.get(pl_target_id);
+        if (toTitle == null) continue;
+        const _toIsRedirect = pageMap.get(toTitle);
         if (_toIsRedirect == null) {
             continue;
         }
@@ -177,11 +258,11 @@
             if (resolvedId == null) {
                 continue;
             }
-            nextBatch.push({_from: values[0], _to: resolvedId});
+            nextBatch.push({_from: pl_from, _to: resolvedId});
             count++;
             // The link resolves to a valid page
         } else {
-            nextBatch.push({_from: values[0], _to: _toIsRedirect[0]});
+            nextBatch.push({_from: pl_from, _to: _toIsRedirect[0]});
             count++;
         }
 
@@ -228,10 +309,14 @@ function resolveRedirect(pageTitle):null|string {
     }
     return null;
 }
 
 await parseAndLoadPage();
 await parseAndLoadRedirect();
-await parseAndLoadPageLinks();
+if (["fr"].includes(env.WIKI_LANG)) {
+    await parseAndLoadPageLinksWithLinkTarget();
+} else {
+    await parseAndLoadPageLinks();
+}
 
 console.log("finished");
 
 process.exit(0);
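
Both pagelinks loaders use the same single-slot pipelining: the loop keeps filling nextBatch while the previous insertLinks call is still in flight, and awaits that promise only once a fresh batch of 4096 edges is ready, so parsing and ArangoDB writes overlap. A stripped-down sketch of the pattern; pipelineInserts and insertBatch are illustrative stand-ins, not names from the repo:

// Illustrative only: at most one insert is in flight while the next
// batch accumulates.
async function pipelineInserts<T>(
    items: AsyncIterable<T>,
    insertBatch: (batch: T[]) => Promise<void>,
    batchSize = 4096
) {
    let inFlight: Promise<void> = Promise.resolve();
    let nextBatch: T[] = [];
    for await (const item of items) {
        nextBatch.push(item);
        if (nextBatch.length >= batchSize) {
            await inFlight;            // wait for the previous write to land
            const batch = nextBatch;   // hand the filled batch to the writer
            nextBatch = [];
            inFlight = insertBatch(batch);
        }
    }
    await inFlight;                    // drain the in-flight write,
    if (nextBatch.length > 0) {
        await insertBatch(nextBatch);  // then flush the final partial batch
    }
}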
2 changes: 1 addition & 1 deletion sql-dump-to-arango/parser/dumpParser.ts
@@ -5,7 +5,7 @@ import { stat } from "node:fs/promises";
 import { env } from "../env.js";
 import { mkdir } from "fs/promises";
 
-export type FileType = "page" | "redirect" | "pagelinks";
+export type FileType = "page" | "redirect" | "pagelinks" | "linktarget";
 export async function sqlDumpStreamFromCache(fileType:FileType) {
     const path = `./cache/${env.WIKI_LANG}/${env.WIKI_LANG}wiki-latest-${fileType}.sql.gz`;
     if (!existsSync(path)) {
