fix duplicate first line on some nested entries (#72)
* filter glossless

* refactor glossesArray

* refactorings

* move inflection gloss processing

* move tag preprocessing

* refactor, prepare glossTree

* fix

* [la] add baseline test for domus

* fix domus

* cleanup
StefanVukovic99 authored Jun 17, 2024
1 parent 39add54 commit 9a20fc0
Showing 8 changed files with 216 additions and 190 deletions.
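
The substance of the change is in 3-tidy-up.js: senses are first merged into a glossTree keyed by gloss text, so a top-level gloss shared by several senses is emitted once instead of once per sense (the duplicated first line reported in #72). A minimal sketch of that grouping step, with an illustrative buildGlossTree helper that does not exist in the repo (the loop lives inline in handleLine in the actual commit):

// Sketch of the grouping idea behind the new glossTree.
function buildGlossTree(senses) {
    const glossTree = {};
    for (const { glossesArray, tags } of senses) {
        let node = glossTree;
        for (const [levelIndex, levelGloss] of glossesArray.entries()) {
            if (!node[levelGloss]) {
                node[levelGloss] = {};
                if (levelIndex === 0) node[levelGloss]._tags = tags;
            } else if (levelIndex === 0) {
                // keep only the tags shared by every sense under this top-level gloss
                node[levelGloss]._tags = tags.filter(t => node[levelGloss]._tags.includes(t));
            }
            node = node[levelGloss];
        }
    }
    return glossTree;
}
// Two senses ["A building", "A dwelling"] and ["A building", "A household"] collapse to
// { "A building": { _tags: [...], "A dwelling": {}, "A household": {} } },
// so "A building" is written once rather than once per sense.
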
224 changes: 115 additions & 109 deletions 3-tidy-up.js
@@ -19,6 +19,10 @@ function escapeRegExp(string) {
return string.replace(/[.*+\-?^${}()|[\]\\]/g, '\\$&');
}

function isEmpty(obj) {
return Object.keys(obj).length === 0;
}

function isInflectionGloss(glosses, formOf) {
glossesString = JSON.stringify(glosses);
switch (targetIso) {
@@ -63,8 +67,8 @@ function handleLevel(nest, level) {
return nestDefs;
}

function handleNest(nestedGlossObj, sense) {
const nestedGloss = handleLevel(nestedGlossObj, 1);
function handleNest(glossTree, sense) {
const nestedGloss = handleLevel(glossTree, 1);

if (nestedGloss.length > 0) {
for (const entry of nestedGloss) {
@@ -188,99 +192,71 @@ function handleLine(line) {

const {senses} = parsedLine;
if (!senses) return;
let nestedGlossObj = {};

for (const [senseIndex, sense] of senses.entries()) {
const sensesWithGlosses = senses.filter(sense => sense.glosses || sense.raw_glosses || sense.raw_gloss);
sensesWithGlosses.map(sense => {
const glosses = sense.raw_glosses || sense.raw_gloss || sense.glosses;
const glossesArray = glosses
? Array.isArray(glosses) ? glosses : [glosses]
: [];
const glossesArray = Array.isArray(glosses) ? glosses : [glosses];

const formOf = sense.form_of;
const tags = sense.tags || [];
if(sense.raw_tags && Array.isArray(sense.raw_tags)) {
tags.push(...sense.raw_tags);
}

if(glossesArray.length === 0) {
continue;
}

if(isInflectionGloss(glossesArray, formOf)) {
switch (targetIso) {
case 'en':
processEnglishInflectionGlosses(sense, word, pos);
break;
case 'fr':
let inflection, lemma;

const match1 = sense.glosses[0].match(/(.*)du verbe\s+((?:(?!\bdu\b).)*)$/);
const match2 = sense.glosses[0].match(/^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$/);

if (match1) {
inflection = match1[1];
lemma = match1[2];
} else if (match2) {
inflection = match2[1];
lemma = match2[2];
}
sense.glossesArray = glossesArray;
sense.tags = tags;
});

if (inflection && lemma) {
inflection = inflection.trim();
lemma = lemma.replace(/\.$/, '').trim();
const sensesWithoutInflectionGlosses = sensesWithGlosses.filter(sense => {
const {glossesArray, form_of, glosses} = sense;
if(!isInflectionGloss(glossesArray, form_of)) return true;
processInflectionGlosses(glosses, word, pos);
return false;
});

if (inflection && word !== lemma) {
addDeinflections(word, pos, lemma, [inflection]);
}
}
break;
}
continue;
if (sensesWithoutInflectionGlosses.length === 0) return;

lemmaDict[word] ??= {};
lemmaDict[word][reading] ??= {};
lemmaDict[word][reading][pos] ??= {};
lemmaDict[word][reading][pos].ipa ??= [];

for (const ipaObj of ipa) {
if (!lemmaDict[word][reading][pos].ipa.some(obj => obj.ipa === ipaObj.ipa)) {
lemmaDict[word][reading][pos].ipa.push(ipaObj);
}
}

lemmaDict[word] ??= {};
lemmaDict[word][reading] ??= {};
lemmaDict[word][reading][pos] ??= {};
lemmaDict[word][reading][pos].ipa ??= [];

for (const ipaObj of ipa) {
if (!lemmaDict[word][reading][pos].ipa.some(obj => obj.ipa === ipaObj.ipa)) {
lemmaDict[word][reading][pos].ipa.push(ipaObj);
lemmaDict[word][reading][pos].senses ??= [];

const glossTree = {};
for (const sense of sensesWithoutInflectionGlosses) {
const { glossesArray, tags } = sense;
let temp = glossTree;
for (const [levelIndex, levelGloss] of glossesArray.entries()) {
if(!temp[levelGloss]) {
temp[levelGloss] = {};
if(levelIndex === 0) {
temp[levelGloss]['_tags'] = tags;
}
} else if (levelIndex === 0) {
temp[levelGloss]['_tags'] = tags.filter(value => temp[levelGloss]['_tags'].includes(value));
}
temp = temp[levelGloss];
}

lemmaDict[word][reading][pos].senses ??= [];
}

for (const [gloss, children] of Object.entries(glossTree)) {
const tags = children._tags;
delete children['_tags'];

const currSense = { glosses: [], tags };

if (glossesArray.length > 1) {
let nestedObj = nestedGlossObj;

for (const level of glossesArray) {
nestedObj[level] ??= {};
nestedObj = nestedObj[level];
}

if (senseIndex === senses.length - 1 && nestedGlossObj) {
try {
handleNest(nestedGlossObj, currSense);
} catch (error) {
console.log(`Recursion error on word '${word}', pos '${pos}'`);
continue;
}
nestedGlossObj = {};
}
} else if (glossesArray.length === 1) {
if (nestedGlossObj) {
handleNest(nestedGlossObj, currSense);
nestedGlossObj = {};
}

const gloss = glossesArray[0];

if (!JSON.stringify(currSense.glosses).includes(gloss)) {
currSense.glosses.push(gloss);
}
if(isEmpty(children)) {
currSense.glosses.push(gloss);
} else {
const branch = {};
branch[gloss] = children;
handleNest(branch, currSense);
}

if (currSense.glosses.length > 0) {
@@ -289,44 +265,74 @@ function handleLine(line) {
}
}

function processEnglishInflectionGlosses(sense, word, pos) {
if (sense.glosses) {
glossPieces = sense.glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim()));
const lemmas = new Set();
const inflections = new Set();
for (const piece of glossPieces) {
const lemmaMatch = piece.match(/of ([^\s]+)\s*$/);
if (lemmaMatch) {
lemmas.add(lemmaMatch[1].replace(/:/g, '').trim());
}
function processInflectionGlosses(glosses, word, pos) {
switch (targetIso) {
case 'en':
processEnglishInflectionGlosses(glosses, word, pos);
break;
case 'fr':
let inflection, lemma;

if (lemmas.size > 1) {
// console.warn(`Multiple lemmas in inflection glosses for word '${word}'`, lemmas);
return;
}
const match1 = glosses[0].match(/(.*)du verbe\s+((?:(?!\bdu\b).)*)$/);
const match2 = glosses[0].match(/^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$/);

const lemma = lemmas.values().next().value;
if (match1) {
inflection = match1[1];
lemma = match1[2];
} else if (match2) {
inflection = match2[1];
lemma = match2[2];
}

if(!lemma) continue;
if (inflection && lemma) {
inflection = inflection.trim();
lemma = lemma.replace(/\.$/, '').trim();

const escapedLemma = escapeRegExp(lemma);
if (inflection && word !== lemma) {
addDeinflections(word, pos, lemma, [inflection]);
}
}
break;
}
}

const inflection = piece
.replace(/inflection of /, '')
.replace(new RegExp(`of ${escapedLemma}`), '')
.replace(new RegExp(`${escapedLemma}`), '')
.replace(new RegExp(`\\s+`), ' ')
.replace(/:/g, '')
.trim();
function processEnglishInflectionGlosses(glosses, word, pos) {
if(!glosses) return;
glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim()));
const lemmas = new Set();
const inflections = new Set();
for (const piece of glossPieces) {
const lemmaMatch = piece.match(/of ([^\s]+)\s*$/);
if (lemmaMatch) {
lemmas.add(lemmaMatch[1].replace(/:/g, '').trim());
}

inflections.add(inflection);
if (lemmas.size > 1) {
// console.warn(`Multiple lemmas in inflection glosses for word '${word}'`, lemmas);
return;
}

const lemma = lemmas.values().next().value;
if (word !== lemma) {
for (const inflection of [...inflections].filter(Boolean)) {
addDeinflections(word, pos, lemma, [inflection]);
}

if(!lemma) continue;

const escapedLemma = escapeRegExp(lemma);

const inflection = piece
.replace(/inflection of /, '')
.replace(new RegExp(`of ${escapedLemma}`), '')
.replace(new RegExp(`${escapedLemma}`), '')
.replace(new RegExp(`\\s+`), ' ')
.replace(/:/g, '')
.trim();

inflections.add(inflection);
}

const lemma = lemmas.values().next().value;
if (word !== lemma) {
for (const inflection of [...inflections].filter(Boolean)) {
addDeinflections(word, pos, lemma, [inflection]);
}
}
}
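For context, the extracted processEnglishInflectionGlosses splits each gloss on '##', pulls the lemma from the piece ending in "of <lemma>", and strips the lemma and boilerplate from the remaining pieces to get the inflection names. A rough illustration with a made-up gloss (the input text and the word "runs" are examples only, not from the test data):

// Hypothetical input for word "runs" (pos "v"):
const glosses = ['inflection of run: ## third-person singular simple present'];
const pieces = glosses.flatMap(g => g.split('##').map(p => p.trim()));
// pieces[0] = 'inflection of run:' -> /of ([^\s]+)\s*$/ captures 'run:' -> lemma 'run'
// pieces[1] = 'third-person singular simple present' survives the stripping and becomes
//             the inflection, ending in
//             addDeinflections('runs', 'v', 'run', ['third-person singular simple present'])
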
4 changes: 2 additions & 2 deletions data/test/dict/de/en/tag_bank_1.json
@@ -21,10 +21,10 @@
0
],
[
"vi",
"vt",
"partOfSpeech",
-1,
"intransitive verb",
"transitive verb",
1
],
[
7 changes: 3 additions & 4 deletions data/test/dict/de/en/term_bank_1.json
@@ -2,7 +2,7 @@
[
"pflegen",
"",
"v c-4 strong",
"v c-4 strong vt",
"v",
0,
[
@@ -89,16 +89,15 @@
]
}
]
},
"To improve or care for something in an intellectual sense. [weak or strong verb]"
}
],
0,
""
],
[
"pflegen",
"",
"v c-4 vi strong",
"v c-4 strong",
"v",
0,
[