Skip to content

Commit

Permalink
Feature/convert reversal index/685 (#747)
Browse files Browse the repository at this point in the history
* Add convert reversal index

* Implement index task for dictionary reversals

* Format for lint

* Make missing reversal index throw an error

* Implement alphabet-based chunking for reversal

* Format for lint

* Remove unnecessary comments

* Remove index.json metadata from the reversal

* Format for lint

* Use configured filename for reversal

* Change indexFilePath to lexicon file

* Remove unnecessary comments

* Include ligatures with alphabet letters

* return files to write (makes it easier to test)
* use reversal/[lang] instead of reversal/language/[lang]
* TODO (another PR)
  * need to consistently create directories for files
  * allow clean of directory before writing files

---------

Co-authored-by: Carson Kramer <[email protected]>
Co-authored-by: Chris Hubbard <[email protected]>
  • Loading branch information
3 people authored Dec 9, 2024
1 parent 4bb247b commit 89e471b
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ static/songs
static/plans
static/quiz
static/data.sqlite
static/reversal
src/lib/data/catalog.js
src/lib/data/firebase-config.js
src/lib/data/config.js
Expand Down
10 changes: 6 additions & 4 deletions convert/Task.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/** A single file returned by a conversion step: its path and its text content. */
export type FileContent = {
path: string;
content: string;
};

/** Result object of a task: the task's name plus the files it returned. */
export interface TaskOutput {
taskName: string;
// NOTE(review): this is a diff view whose +/- markers were lost. Per the hunk header
// ("6 additions & 4 deletions"), the inline `files: { ... }[]` shape on the next four
// lines is the removed version; `files: FileContent[]` after it is the replacement line.
files: {
path: string;
content: string;
}[];
files: FileContent[];
}

/** Either a plain value of type `T` or a Promise of one. */
export type Promisable<T> = T | Promise<T>;
Expand Down
157 changes: 157 additions & 0 deletions convert/convertReverseIndex.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import { readFileSync, mkdirSync, existsSync } from 'fs';
import path from 'path';
import { FileContent, Task, TaskOutput } from './Task';
import type { DictionaryConfig } from '$config';

/** One entry reference listed under a gloss in a reversal chunk file. */
interface ReversalEntry {
// Number parsed from the leading digits of an id token in the index file.
index: number;
// The gloss this reference was listed under.
name: string;
// Number parsed from the digits after `^` in the id token; absent when the token has no `^` part.
homonym_index?: number;
}

// Number of glosses collected into one chunk file before the next chunk is started.
const ENTRIES_PER_CHUNK = 100;

/**
 * Returns the upper-case form of `char`; this is the form used to key groups of
 * reversal entries by letter.
 */
function makeEntryLetter(char: string) {
    const upperCased = char.toUpperCase();
    return upperCased;
}

/**
 * Finds the alphabet letter that a gloss should be filed under.
 *
 * The text is canonically decomposed (NFD) so that an accented first letter is reduced
 * to its base character; that first character is then compared, case-insensitively,
 * against the alphabet entries in order.
 *
 * @param char - The gloss (or single character) whose first letter is classified.
 * @param alphabet - Alphabet entries to match against; the first match wins.
 * @returns The upper-cased first character of the matching alphabet entry, or `null`
 *   when `char` is empty or no entry matches.
 */
function getBaseLetter(char: string, alphabet: string[]): string | null {
    // Array.from walks the string by code point, so a first character outside the
    // Basic Multilingual Plane is kept whole instead of being cut into a lone surrogate.
    const [firstChar] = Array.from(char.normalize('NFD'));
    if (firstChar === undefined) {
        // Empty input has no first letter to classify.
        return null;
    }

    const needle = firstChar.toLowerCase();
    const alphabetEntry = alphabet.find(
        // '' is a prefix of every string; skipping it keeps a blank alphabet entry
        // from shadowing the real letters that follow it.
        (entry) => entry.length > 0 && needle.startsWith(entry.toLowerCase())
    );
    if (!alphabetEntry) {
        return null;
    }

    const [entryLetter = alphabetEntry] = Array.from(alphabetEntry);
    return makeEntryLetter(entryLetter);
}

/** Matches one id token: an entry index, optionally followed by `^` and a homonym index. */
const REVERSAL_ID_PATTERN = /^(\d+)(?:\^(\d+))?$/;

/** Parses a comma-separated id list into entries for `gloss`, dropping malformed tokens. */
function parseReversalIds(gloss: string, ids: string): ReversalEntry[] {
    const parsed: ReversalEntry[] = [];
    for (const token of ids.split(',')) {
        const match = REVERSAL_ID_PATTERN.exec(token.trim());
        if (!match) continue;

        const entry: ReversalEntry = { index: parseInt(match[1], 10), name: gloss };
        if (match[2]) {
            entry.homonym_index = parseInt(match[2], 10);
        }
        parsed.push(entry);
    }
    return parsed;
}

/**
 * Converts a reversal index file into per-letter JSON chunks.
 *
 * Reads `<dataDir>/reversal/lexicon-<language>.idx`, where each line is a gloss, a tab,
 * and a comma-separated list of ids (`N` or `N^M`). Glosses are grouped by the alphabet
 * letter of their first character; a gloss whose first character matches no alphabet
 * entry joins the group of the gloss before it (initially the first alphabet letter).
 * Each group is sorted with `localeCompare` for `language` and split into chunks of at
 * most `ENTRIES_PER_CHUNK` glosses.
 *
 * Side effect: creates `static/reversal/<language>` when it does not exist. The chunk
 * files themselves are not written here; they are returned.
 *
 * @param dataDir - Directory containing the `reversal` folder with the index file.
 * @param language - Language code; used in the index filename, the output directory,
 *   and as the collation locale.
 * @param alphabet - Alphabet entries used to pick the letter group of each gloss.
 * @returns One `FileContent` per chunk, with path
 *   `static/reversal/<language>/<letter>-NNN.json` and pretty-printed JSON content
 *   mapping each gloss to its parsed entries.
 * @throws Error when the index file is missing or `alphabet` is empty.
 */
export function convertReverseIndex(
    dataDir: string,
    language: string,
    alphabet: string[]
): FileContent[] {
    const indexFilePath = path.join(dataDir, 'reversal', `lexicon-${language}.idx`);
    const outputDir = path.join('static', 'reversal', language);
    const files: FileContent[] = [];

    if (!existsSync(indexFilePath)) {
        throw new Error(`Required reversal index not found: ${indexFilePath}`);
    }

    // The first alphabet letter is the fallback group, so an empty alphabet cannot work;
    // fail with a clear message before touching the file system.
    if (alphabet.length === 0) {
        throw new Error(`Cannot build reversal index for "${language}": alphabet is empty`);
    }

    if (!existsSync(outputDir)) {
        mkdirSync(outputDir, { recursive: true });
    }

    const indexEntries = readFileSync(indexFilePath, 'utf-8')
        .split('\n')
        .map((line) => line.trim().split('\t'))
        .filter(([gloss]) => Boolean(gloss));

    // A line without a tab has no id field, hence `string | undefined`.
    const entriesByLetter: { [letter: string]: [string, string | undefined][] } = {};

    let latestLetter = makeEntryLetter(alphabet[0]);
    for (const [gloss, ids] of indexEntries) {
        // Glosses starting with a character outside the alphabet stay with the
        // previous gloss's letter group.
        const entryLetter = getBaseLetter(gloss, alphabet) ?? latestLetter;
        if (!entriesByLetter[entryLetter]) {
            entriesByLetter[entryLetter] = [];
        }
        entriesByLetter[entryLetter].push([gloss, ids]);
        latestLetter = entryLetter;
    }

    for (const [letter, entries] of Object.entries(entriesByLetter)) {
        entries.sort(([a], [b]) => a.localeCompare(b, language));

        let currentChunk: { [key: string]: ReversalEntry[] } = {};
        let currentCount = 0;
        let chunkIndex = 0;

        const flushChunk = (): void => {
            const chunkNumber = String(chunkIndex + 1).padStart(3, '0');
            const chunkFileName = `${letter.toLowerCase()}-${chunkNumber}.json`;
            files.push({
                path: path.join(outputDir, chunkFileName),
                content: JSON.stringify(currentChunk, null, 2)
            });
            currentChunk = {};
            currentCount = 0;
            chunkIndex++;
        };

        for (const [gloss, ids] of entries) {
            if (!gloss || !ids) continue;

            const idList = parseReversalIds(gloss, ids);
            if (idList.length === 0) continue;

            currentChunk[gloss] = idList;
            currentCount++;
            if (currentCount >= ENTRIES_PER_CHUNK) {
                flushChunk();
            }
        }

        // Flush the trailing partial chunk here rather than on the last loop index:
        // the last sorted entry may be skipped above (missing or malformed ids), and
        // the glosses collected before it must still be emitted.
        if (currentCount > 0) {
            flushChunk();
        }
    }

    return files;
}

/**
 * Conversion task that builds reversal-index chunk files for every writing system in
 * the dictionary config that has both a reversal filename and an alphabet.
 */
export class ConvertReverseIndex extends Task {
    public triggerFiles: string[] = ['reversal'];

    constructor(dataDir: string) {
        super(dataDir);
    }

    /**
     * Reads the `ConvertConfig` output and converts the reversal index of each
     * qualifying writing system.
     *
     * @param verbose - When truthy, logs each language as it is processed.
     * @param outputs - Outputs of earlier tasks, keyed by task name; must contain
     *   `ConvertConfig` with a `data` object.
     * @returns The task name and the chunk files of all processed languages, combined.
     * @throws Error when the config output, its data, or its writing systems are missing.
     */
    public async run(verbose: number, outputs: Map<string, TaskOutput>): Promise<TaskOutput> {
        const configOutput = outputs.get('ConvertConfig') as { data: DictionaryConfig } | undefined;
        const config = configOutput?.data;
        if (!config) {
            throw new Error('Config data not found in outputs');
        }

        const { writingSystems } = config;
        if (!writingSystems) {
            throw new Error('No writing systems found in config data');
        }

        const files: FileContent[] = [];
        for (const lang in writingSystems) {
            // NOTE(review): reversalFilename only gates processing here; the index file
            // name is derived from the language code inside convertReverseIndex — confirm
            // that this is intended.
            const { reversalFilename, alphabet } = writingSystems[lang];
            if (!reversalFilename || !alphabet) {
                continue;
            }

            if (verbose) {
                console.log(`Processing reversal index for language: ${lang}`);
            }

            files.push(...convertReverseIndex(this.dataDir, lang, alphabet));
        }

        return { taskName: this.constructor.name, files };
    }
}
6 changes: 2 additions & 4 deletions convert/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { ConvertStyles } from './convertStyles';
import { ConvertBadges } from './convertBadges';
import { ConvertPlans } from './convertPlans';
import { ConvertSQLite } from './convertSQLite';
import { ConvertReverseIndex } from './convertReverseIndex';
import { watch } from 'chokidar';
import { Task, TaskOutput } from './Task';
import { writeFile } from 'fs';
Expand Down Expand Up @@ -51,10 +52,7 @@ const commonStepClasses = [
//Classes only necessary for SAB
const SABStepClasses = [ConvertPlans, ConvertBooks];

// NOTE(review): this is a diff view whose +/- markers were lost. Per the hunk header
// ("2 additions & 4 deletions"), the four-line array directly below is the removed
// version, in which the reversal step was still commented out.
const DABStepClasses = [
// ConvertReversalIndex,
ConvertSQLite
];
// Replacement line from this commit: ConvertReverseIndex is now listed ahead of ConvertSQLite.
const DABStepClasses = [ConvertReverseIndex, ConvertSQLite];

const stepClasses: Task[] = [
...commonStepClasses,
Expand Down

0 comments on commit 89e471b

Please sign in to comment.