From 89e471b601a528b90575596054c27c0412e2d7ef Mon Sep 17 00:00:00 2001
From: aidanpscott <158211280+aidanpscott@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:18:33 -0500
Subject: [PATCH] Feature/convert reversal index/685 (#747)

* Add convert reversal index

* Implement index task for dictionary reversals

* Format for lint

* Make missing reversal index throw an error

* Implement alphabet-based chunking for reversal

* Format for lint

* Remove unnecessary comments

* Remove index.json metadata from the reversal

* Format for lint

* Use configured filename for reversal

* Change indexFilePath to lexicon file

* Remove unnecessary comments

* Include ligatures with alphabet letters

* return files to write (makes it easier to test)

* use reversal/[lang] instead of reversal/language/[lang]

* TODO (another PR)

* need to consistently create directories for files

* allow clean of directory before writing files

---------

Co-authored-by: Carson Kramer
Co-authored-by: Chris Hubbard
---
 .gitignore                     |   1 +
 convert/Task.ts                |  10 ++-
 convert/convertReverseIndex.ts | 204 +++++++++++++++++++++++++++++++++
 convert/index.ts               |   6 +-
 4 files changed, 213 insertions(+), 8 deletions(-)
 create mode 100644 convert/convertReverseIndex.ts

diff --git a/.gitignore b/.gitignore
index f097ec045..70fbe353e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,7 @@ static/songs
 static/plans
 static/quiz
 static/data.sqlite
+static/reversal
 src/lib/data/catalog.js
 src/lib/data/firebase-config.js
 src/lib/data/config.js
diff --git a/convert/Task.ts b/convert/Task.ts
index 55cd5799f..95e33f160 100644
--- a/convert/Task.ts
+++ b/convert/Task.ts
@@ -1,9 +1,11 @@
+export type FileContent = {
+    path: string;
+    content: string;
+};
+
 export interface TaskOutput {
     taskName: string;
-    files: {
-        path: string;
-        content: string;
-    }[];
+    files: FileContent[];
 }
 
 export type Promisable<T> = T | Promise<T>;
diff --git a/convert/convertReverseIndex.ts b/convert/convertReverseIndex.ts
new file mode 100644
index 000000000..041e074de
--- /dev/null
+++ b/convert/convertReverseIndex.ts
@@ -0,0 +1,204 @@
+import { readFileSync, mkdirSync, existsSync } from 'fs';
+import path from 'path';
+import { FileContent, Task, TaskOutput } from './Task';
+import type { DictionaryConfig } from '$config';
+
+/** One dictionary entry that a reversal gloss points at. */
+interface ReversalEntry {
+    index: number;
+    name: string;
+    homonym_index?: number;
+}
+
+/** Maximum number of glosses written to one chunk file. */
+const ENTRIES_PER_CHUNK = 100;
+
+/** One id of an index line: `<digits>`, optionally followed by `^<digits>`. */
+const ENTRY_ID_PATTERN = /^(\d+)(?:\^(\d+))?$/;
+
+/** Upper-cases an alphabet letter to form the key glosses are grouped under. */
+function makeEntryLetter(char: string): string {
+    return char.toUpperCase();
+}
+
+/**
+ * Finds the alphabet letter a gloss is filed under.
+ *
+ * Only the first UTF-16 code unit of the NFD-normalized gloss is compared, so
+ * a leading accented letter matches its unaccented alphabet entry.
+ * NOTE(review): alphabet entries longer than one code unit can never match a
+ * single code unit - confirm that this is intended.
+ *
+ * @param char - The gloss; only its first character is inspected.
+ * @param alphabet - Alphabet entries of the reversal language.
+ * @returns The upper-cased first character of the matching alphabet entry, or
+ *   `null` when the gloss is empty or no entry matches.
+ */
+function getBaseLetter(char: string, alphabet: string[]): string | null {
+    const firstChar = char.normalize('NFD')[0]?.toLowerCase();
+    if (!firstChar) {
+        return null;
+    }
+    const alphabetEntry = alphabet.find((entry) => firstChar.startsWith(entry.toLowerCase()));
+    if (!alphabetEntry) {
+        return null;
+    }
+    return makeEntryLetter(alphabetEntry[0]);
+}
+
+/**
+ * Parses the comma-separated id list of one index line.
+ * Ids that are not `<digits>` or `<digits>^<digits>` are dropped.
+ */
+function parseEntryIds(gloss: string, ids: string): ReversalEntry[] {
+    const parsed: ReversalEntry[] = [];
+    for (const id of ids.split(',')) {
+        const match = ENTRY_ID_PATTERN.exec(id.trim());
+        if (!match) continue;
+        const entry: ReversalEntry = { index: parseInt(match[1], 10), name: gloss };
+        if (match[2]) {
+            entry.homonym_index = parseInt(match[2], 10);
+        }
+        parsed.push(entry);
+    }
+    return parsed;
+}
+
+/**
+ * Converts the reversal index of one language into chunked JSON files.
+ *
+ * Reads `<dataDir>/reversal/lexicon-<language>.idx`, whose lines are
+ * `gloss<TAB>id[,id...]`, groups the glosses by alphabet letter and splits
+ * every group into chunks of at most ENTRIES_PER_CHUNK glosses. The chunks are
+ * returned rather than written; only the output directory is created here.
+ *
+ * @param dataDir - Directory that contains the `reversal` folder.
+ * @param language - Selects the index file, the output folder and the sort locale.
+ * @param alphabet - Alphabet of the reversal language, used to group glosses.
+ * @returns One entry per chunk, at `static/reversal/<language>/<letter>-NNN.json`.
+ * @throws Error if the index file is missing or the alphabet is empty.
+ */
+export function convertReverseIndex(
+    dataDir: string,
+    language: string,
+    alphabet: string[]
+): FileContent[] {
+    const indexFilePath = path.join(dataDir, 'reversal', `lexicon-${language}.idx`);
+    const outputDir = path.join('static', 'reversal', language);
+    const files: FileContent[] = [];
+
+    if (!existsSync(indexFilePath)) {
+        throw new Error(`Required reversal index not found: ${indexFilePath}`);
+    }
+    if (alphabet.length === 0) {
+        throw new Error(`Alphabet for reversal language '${language}' is empty`);
+    }
+
+    // recursive: true makes this a no-op when the directory already exists.
+    mkdirSync(outputDir, { recursive: true });
+
+    const indexEntries = readFileSync(indexFilePath, 'utf-8')
+        .split('\n')
+        .map((line) => line.trim().split('\t'))
+        .filter(([gloss]) => Boolean(gloss));
+
+    // A gloss whose first character is not in the alphabet stays with the letter
+    // of the line before it.
+    const entriesByLetter: { [letter: string]: [string, string][] } = {};
+    let latestLetter = makeEntryLetter(alphabet[0]);
+    for (const [gloss, ids] of indexEntries) {
+        const entryLetter = getBaseLetter(gloss, alphabet) ?? latestLetter;
+        if (!entriesByLetter[entryLetter]) {
+            entriesByLetter[entryLetter] = [];
+        }
+        entriesByLetter[entryLetter].push([gloss, ids]);
+        latestLetter = entryLetter;
+    }
+
+    for (const [letter, entries] of Object.entries(entriesByLetter)) {
+        entries.sort(([a], [b]) => a.localeCompare(b, language));
+
+        let currentChunk: { [gloss: string]: ReversalEntry[] } = {};
+        let currentCount = 0;
+        let chunkIndex = 0;
+
+        const flushChunk = (): void => {
+            if (currentCount === 0) return;
+            const chunkNumber = String(chunkIndex + 1).padStart(3, '0');
+            files.push({
+                path: path.join(outputDir, `${letter.toLowerCase()}-${chunkNumber}.json`),
+                content: JSON.stringify(currentChunk, null, 2)
+            });
+            currentChunk = {};
+            currentCount = 0;
+            chunkIndex++;
+        };
+
+        for (const [gloss, ids] of entries) {
+            if (!gloss || !ids) continue;
+            const idList = parseEntryIds(gloss, ids);
+            if (idList.length === 0) continue;
+
+            currentChunk[gloss] = idList;
+            currentCount++;
+            if (currentCount >= ENTRIES_PER_CHUNK) {
+                flushChunk();
+            }
+        }
+        // Flush after the loop, not on the last index: the last entry may be
+        // skipped above and the partial chunk must still be emitted.
+        flushChunk();
+    }
+
+    return files;
+}
+
+/** Conversion task that produces the reversal index chunks. */
+export class ConvertReverseIndex extends Task {
+    public triggerFiles: string[] = ['reversal'];
+
+    constructor(dataDir: string) {
+        super(dataDir);
+    }
+
+    /**
+     * Converts the reversal index of every writing system that has both a
+     * `reversalFilename` and an `alphabet` configured.
+     *
+     * @param verbose - Any truthy value logs each processed language.
+     * @param outputs - Outputs of earlier tasks; must contain `ConvertConfig`.
+     * @returns The chunk files of all converted languages.
+     * @throws Error if the config output or its writing systems are missing.
+     */
+    public async run(verbose: number, outputs: Map<string, TaskOutput>): Promise<TaskOutput> {
+        const configOutput = outputs.get('ConvertConfig') as { data: DictionaryConfig } | undefined;
+        if (!configOutput || !configOutput.data) {
+            throw new Error('Config data not found in outputs');
+        }
+
+        if (!configOutput.data.writingSystems) {
+            throw new Error('No writing systems found in config data');
+        }
+
+        const files: FileContent[] = [];
+        for (const lang in configOutput.data.writingSystems) {
+            const writingSystem = configOutput.data.writingSystems[lang];
+
+            // NOTE(review): reversalFilename only enables the conversion; the index
+            // is always read from lexicon-<lang>.idx - confirm this is intended.
+            if (writingSystem.reversalFilename && writingSystem.alphabet) {
+                if (verbose) {
+                    console.log(`Processing reversal index for language: ${lang}`);
+                }
+
+                const langFiles = convertReverseIndex(this.dataDir, lang, writingSystem.alphabet);
+                files.push(...langFiles);
+            }
+        }
+
+        return {
+            taskName: this.constructor.name,
+            files
+        };
+    }
+}
diff --git a/convert/index.ts b/convert/index.ts
index 130aba672..1e1a13baa 100644
--- a/convert/index.ts
+++ b/convert/index.ts
@@ -9,6 +9,7 @@ import { ConvertStyles } from './convertStyles';
 import { ConvertBadges } from './convertBadges';
 import { ConvertPlans } from './convertPlans';
 import { ConvertSQLite } from './convertSQLite';
+import { ConvertReverseIndex } from './convertReverseIndex';
 import { watch } from 'chokidar';
 import { Task, TaskOutput } from './Task';
 import { writeFile } from 'fs';
@@ -51,10 +52,7 @@ const commonStepClasses = [
 //Classes only necessary for SAB
 const SABStepClasses = [ConvertPlans, ConvertBooks];
 
-const DABStepClasses = [
-    // ConvertReversalIndex,
-    ConvertSQLite
-];
+const DABStepClasses = [ConvertReverseIndex, ConvertSQLite];
 
 const stepClasses: Task[] = [
     ...commonStepClasses,