Skip to content

Commit

Permalink
improve performance to handle Finnish (#24)
Browse files Browse the repository at this point in the history
* only memory left

* stalls in processing forms

* fix test
  • Loading branch information
StefanVukovic99 authored May 10, 2024
1 parent 572f05e commit f2aa2fc
Show file tree
Hide file tree
Showing 24 changed files with 6,921 additions and 3,506 deletions.
104 changes: 64 additions & 40 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ const {
tidy_folder: writeFolder
} = process.env;

const { sortTags, similarSort, mergePersonTags, consoleOverwrite, clearConsoleLine } = require('./util/util');
const { sortTags, similarSort, mergePersonTags, consoleOverwrite, clearConsoleLine, logProgress, mapJsonReplacer } = require('./util/util');

const lemmaDict = {};
const formDict = {};
const automatedForms = {};
const formsMap = new Map();
const automatedForms = new Map();

function escapeRegExp(string) {
return string.replace(/[.*+\-?^${}()|[\]\\]/g, '\\$&');
Expand Down Expand Up @@ -73,22 +73,24 @@ function handleNest(nestedGlossObj, sense) {
}
}

function addDeinflections(word, pos, lemma, inflections) {
function addDeinflections(form, pos, lemma, inflections) {
if (targetIso === 'fr') {
word = word.replace(/(qu\')?(ils\/elles|il\/elle\/on)\s*/, '');
form = form.replace(/(qu\')?(ils\/elles|il\/elle\/on)\s*/, '');
}

formDict[word] ??= {};
formDict[word][lemma] ??= {};
formDict[word][lemma][pos] ??= [];
const lemmaForms = formsMap.get(lemma) || new Map();
formsMap.set(lemma, lemmaForms);
const formPOSs = lemmaForms.get(form) || new Map();
lemmaForms.set(form, formPOSs);
formPOSs.get(pos) || formPOSs.set(pos, []);

try {
const inflectionsSet = new Set(formDict[word][lemma][pos]);
const inflectionsSet = new Set(formPOSs.get(pos));
for (const inflection of inflections) {
inflectionsSet.add(inflection);
}

formDict[word][lemma][pos] = Array.from(inflectionsSet);
formPOSs.set(pos, Array.from(inflectionsSet));
} catch(e) {
console.log(e);
}
Expand All @@ -110,20 +112,14 @@ const blacklistedTags = [
];

let lineCount = 0;
const printInterval = 1000;

consoleOverwrite(`3-tidy-up.js started...`);

const lr = new LineByLineReader(kaikkiFile);

lr.on('line', (line) => {
if (line) {
lineCount += 1;

if (lineCount % printInterval === 0) {
consoleOverwrite(`3-tidy-up.js: Processed ${lineCount} lines...`);
}

logProgress("Processing lines", lineCount);
handleLine(line);
}
});
Expand All @@ -140,15 +136,17 @@ function handleLine(line) {
const { form, tags } = formData;

if (form && tags && !tags.some(value => blacklistedTags.includes(value)) && form !== '-') {
automatedForms[form] ??= {};
automatedForms[form][word] ??= {};
automatedForms[form][word][pos] ??= new Set();

const tagsSet = new Set(automatedForms[form][word][pos]);

const wordMap = automatedForms.get(word) || new Map();
const formMap = wordMap.get(form) || new Map();
formMap.get(pos) || formMap.set(pos, new Set());
wordMap.set(form, formMap);
automatedForms.set(word, wordMap);

const tagsSet = new Set((formMap.get(pos)));

tagsSet.add(sortTags(targetIso, tags).join(' '));

automatedForms[form][word][pos] = similarSort(mergePersonTags(targetIso, Array.from(tagsSet)));
formMap.set(pos, similarSort(mergePersonTags(targetIso, Array.from(tagsSet))));
}
});
}
Expand Down Expand Up @@ -345,23 +343,29 @@ function getPersianReading(word, line){
}

function handleAutomatedForms() {
let missingForms = 0;
consoleOverwrite('3-tidy-up.js: Handling automated forms...');

for (const [form, info] of Object.entries(automatedForms)) {
if (!(form in formDict)) {
missingForms += 1;
let counter = 0;
let total = [...automatedForms.entries()].reduce((acc, [_, formInfo]) => acc + formInfo.size, 0);
let missingForms = 0;

if (Object.keys(info).length < 5) {
for (const [lemma, parts] of Object.entries(info)) {
for (const [pos, glosses] of Object.entries(parts)) {
if (form !== lemma) {
const inflections = glosses.map(gloss => `-automated- ${gloss}`);
addDeinflections(form, pos, lemma, inflections);
}
for (const [lemma, formInfo] of automatedForms.entries()) {
for (const [form, posInfo] of formInfo.entries()) {
counter += 1;
logProgress("Processing automated forms", counter, total);
if (!formsMap.get(lemma)?.get(form)) {
missingForms += 1;
for (const [pos, glosses] of posInfo.entries()) {

if (form !== lemma) {
addDeinflections(form, pos, lemma, glosses);
}
posInfo.delete(pos);
}
}
formInfo.delete(form);
}
automatedForms.delete(lemma);
}

console.log(`There were ${missingForms} missing forms that have now been automatically populated.`);
Expand All @@ -371,15 +375,35 @@ lr.on('end', () => {
clearConsoleLine();
process.stdout.write(`Processed ${lineCount} lines...\n`);

handleAutomatedForms();

const lemmasFilePath = `${writeFolder}/${sourceIso}-${targetIso}-lemmas.json`;
consoleOverwrite(`3-tidy-up.js: Writing lemma dict to ${lemmasFilePath}...`);
writeFileSync(lemmasFilePath, JSON.stringify(lemmaDict));

for (const prop of Object.getOwnPropertyNames(lemmaDict)) {
delete lemmaDict[prop];
}

handleAutomatedForms();

const formsFilePath = `${writeFolder}/${sourceIso}-${targetIso}-forms.json`;
consoleOverwrite(`3-tidy-up.js: Writing form dict to ${formsFilePath}...`);
writeFileSync(formsFilePath, JSON.stringify(formDict));

const mapChunks = Array.from(formsMap.entries()).reduce((acc, [key, value], index) => {
logProgress("Chunking form dict", index, formsMap.size);
const chunkIndex = Math.floor(index / 10000);
acc[chunkIndex] ??= new Map();
acc[chunkIndex].set(key, value);
return acc;
}, {});

if(!mapChunks['0']) {
mapChunks['0'] = new Map();
}

for (const [index, chunk] of Object.entries(mapChunks)) {
logProgress("Writing form dict chunks", index, Object.keys(mapChunks).length);
consoleOverwrite(`3-tidy-up.js: Writing form dict ${index} to ${formsFilePath}...`);
writeFileSync(`${formsFilePath.replace('.json', '')}-${index}.json`, JSON.stringify(chunk, mapJsonReplacer));
}

consoleOverwrite('3-tidy-up.js finished.\n');
});
4 changes: 2 additions & 2 deletions 3-tidy-up.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ for (const {iso: sourceIso} of languages){
}
);

const testForms = JSON.parse(readFileSync(`data/test/temp/${sourceIso}-${targetIso}-forms.json`, 'utf8'));
const testForms = JSON.parse(readFileSync(`data/test/temp/${sourceIso}-${targetIso}-forms-0.json`, 'utf8'));
const testLemmas = JSON.parse(readFileSync(`data/test/temp/${sourceIso}-${targetIso}-lemmas.json`, 'utf8'));

const validForms = JSON.parse(readFileSync(`data/test/tidy/${sourceIso}-${targetIso}-forms.json`, 'utf8'));
const validForms = JSON.parse(readFileSync(`data/test/tidy/${sourceIso}-${targetIso}-forms-0.json`, 'utf8'));
const validLemmas = JSON.parse(readFileSync(`data/test/tidy/${sourceIso}-${targetIso}-lemmas.json`, 'utf8'));

describe(`Tidying up ${sourceIso}-${targetIso}`, () => {
Expand Down
33 changes: 22 additions & 11 deletions 4-make-yomitan.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ const date = require('date-and-time');
const now = new Date();
const currentDate = date.format(now, 'YYYY.MM.DD');

const { sortTags, writeInBatches, consoleOverwrite } = require('./util/util');
const { sortTags, writeInBatches, consoleOverwrite, mapJsonReviver } = require('./util/util');

const {
source_iso,
Expand All @@ -19,8 +19,15 @@ consoleOverwrite(`4-make-yomitan.js: reading lemmas...`);
const lemmasFile = `${readFolder}/${source_iso}-${target_iso}-lemmas.json`;
const lemmaDict = JSON.parse(readFileSync(path.resolve(__dirname, lemmasFile)));
consoleOverwrite(`4-make-yomitan.js: reading forms...`);
const formsFile = `${readFolder}/${source_iso}-${target_iso}-forms.json`;
const formDict = JSON.parse(readFileSync(path.resolve(__dirname, formsFile)));

const formsFiles = readdirSync(readFolder).filter((file) => file.startsWith(`${source_iso}-${target_iso}-forms-`));
const formsMap = new Map();
for (const file of formsFiles) {
const formsPart = JSON.parse(readFileSync(path.resolve(__dirname, readFolder, file)), mapJsonReviver);
for (const [lemma, forms] of formsPart.entries()) {
formsMap.set(lemma, forms);
}
}

if (!existsSync(`data/language/${source_iso}/${target_iso}`)) {
mkdirSync(`data/language/${source_iso}/${target_iso}`, {recursive: true});
Expand Down Expand Up @@ -127,12 +134,16 @@ for (const [lemma, readings] of Object.entries(lemmaDict)) {

if(lemma !== normalizedLemma && lemma !== reading){
term = lemma;
formDict[normalizedLemma] ??= {};
formDict[normalizedLemma][lemma] ??= {};
formDict[normalizedLemma][lemma]["any"] ??= [];
const lemmaForms = formsMap.get(lemma) || new Map();
const formPOSs = lemmaForms.get(normalizedLemma) || new Map();
const anyForms = formPOSs.get("any") || [];
formPOSs.set("any", anyForms);
lemmaForms.set(normalizedLemma, formPOSs);
formsMap.set(lemma, lemmaForms);

const message = `${normalizedLemma}\u00A0≈\u00A0${lemma}`;
if (!formDict[normalizedLemma][lemma]["any"].includes(message)){
formDict[normalizedLemma][lemma]["any"].push(message);
if (!anyForms.includes(message)){
anyForms.push(message);
}
}

Expand Down Expand Up @@ -254,9 +265,9 @@ const multiwordInflections = [ // TODO: switch on source_iso
];

consoleOverwrite('4-make-yomitan.js: Processing forms...');
for (const [form, allInfo] of Object.entries(formDict)) {
for (const [lemma, info] of Object.entries(allInfo)) {
for (const [pos, glosses] of Object.entries(info)) {
for (const [lemma, forms] of formsMap.entries()) {
for (const [form, POSs] of forms.entries()) {
for (const [pos, glosses] of POSs.entries()) {
const inflectionHypotheses = glosses.flatMap((gloss) => {
if (!gloss) { return []; }

Expand Down
2 changes: 1 addition & 1 deletion 4-make-yomitan.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const languages = JSON.parse(readFileSync('languages.json', 'utf8'));
for (const {iso: sourceIso} of languages){
for (const {iso: targetIso} of languages){
const tidyLemmas = `data/test/tidy/${sourceIso}-${targetIso}-lemmas.json`;
const tidyForms = `data/test/tidy/${sourceIso}-${targetIso}-forms.json`;
const tidyForms = `data/test/tidy/${sourceIso}-${targetIso}-forms-0.json`;

if(!existsSync(tidyLemmas) || !existsSync(tidyForms)){
continue;
Expand Down
Loading

0 comments on commit f2aa2fc

Please sign in to comment.