diff --git a/.gitignore b/.gitignore index aef1c85..e177713 100755 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *.json *.jsonl +!jsconfig.json +!.vscode/settings.json !tag_bank_term.json !tag_bank_ipa.json !parts_of_speech.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..be944f5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "javascript.validate.enable": true +} \ No newline at end of file diff --git a/3-tidy-up.js b/3-tidy-up.js index 51a1664..045c663 100644 --- a/3-tidy-up.js +++ b/3-tidy-up.js @@ -7,20 +7,31 @@ const { target_iso: targetIso, kaikki_file: kaikkiFile, tidy_folder: writeFolder -} = process.env; +} = /** @type {TidyEnv} */ (process.env); const { sortTags, similarSort, mergePersonTags, consoleOverwrite, clearConsoleLine, logProgress, mapJsonReplacer } = require('./util/util'); +/** @type {LemmaDict} */ const lemmaDict = {}; + const formsMap = new Map(); const automatedForms = new Map(); +/** + * @param {string} string + * @returns {string} +*/ function escapeRegExp(string) { return string.replace(/[.*+\-?^${}()|[\]\\]/g, '\\$&'); } +/** + * @param {string[]} glosses + * @param {FormOf[]|undefined} formOf + * @returns {boolean} + */ function isInflectionGloss(glosses, formOf) { - glossesString = JSON.stringify(glosses); + const glossesString = JSON.stringify(glosses); switch (targetIso) { case 'de': if (glosses.some(gloss => /des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens)/.test(gloss))) return true; @@ -28,6 +39,7 @@ function isInflectionGloss(glosses, formOf) { if (glosses.some(gloss => /.*inflection of.*/.test(gloss))) return true; if(!Array.isArray(formOf)) return false; for (const {word: lemma} of formOf) { + if(!lemma) continue; if (glosses.some(gloss => new RegExp(`of ${escapeRegExp(lemma)}$`).test(gloss))) return true; } @@ -38,12 +50,16 @@ function isInflectionGloss(glosses, formOf) { return false; } - -function handleLevel(nest, level) { +/** + * @param {GlossTree} glossTree + * @param {number} level + * @returns {*} + */ +function handleLevel(glossTree, level) { const nestDefs = []; let defIndex = 0; - for (const [def, children] of nest) { + for (const [def, children] of glossTree) { defIndex += 1; if(children.size > 0) { @@ -51,6 +67,7 @@ function handleLevel(nest, level) { const childDefs = handleLevel(children, nextLevel); const listType = level === 1 ? "li" : "number"; + /** @type {StructuredContent[]} */ const content = level === 1 ? def : [{ "tag": "span", "data": { "listType": "number" }, "content": `${defIndex}. 
` }, def]; nestDefs.push([ @@ -65,6 +82,10 @@ function handleLevel(nest, level) { return nestDefs; } +/** + * @param {GlossTree} glossTree + * @param {SenseInfo} sense + */ function handleNest(glossTree, sense) { const nestedGloss = handleLevel(glossTree, 1); @@ -74,7 +95,12 @@ function handleNest(glossTree, sense) { } } } - +/** + * @param {*} form + * @param {string} pos + * @param {*} lemma + * @param {*} inflections + */ function addDeinflections(form, pos, lemma, inflections) { if (targetIso === 'fr') { form = form.replace(/(qu\')?(ils\/elles|il\/elle\/on)\s*/, ''); @@ -132,12 +158,14 @@ lr.on('line', (line) => { if (line) { lineCount += 1; logProgress("Processing lines", lineCount); - handleLine(line); + handleLine(JSON.parse(line)); } }); -function handleLine(line) { - const parsedLine = JSON.parse(line); +/** + * @param {KaikkiLine} parsedLine + */ +function handleLine(parsedLine) { const { pos, sounds, forms } = parsedLine; if(!pos) return; const word = getCanonicalWordForm(parsedLine); @@ -164,9 +192,11 @@ function handleLine(line) { .flatMap(ipaObj => typeof ipaObj.ipa === 'string' ? [ipaObj] : ipaObj.ipa.map(ipa => ({ ipa, tags: ipaObj.tags })) ) .filter(ipaObj => ipaObj.ipa) : []; - - const sensesWithGlosses = senses.filter(sense => sense.glosses || sense.raw_glosses || sense.raw_gloss); - sensesWithGlosses.map(sense => { + + /** @type {TidySense[]} */ + const sensesWithGlosses = senses + .filter(sense => sense.glosses || sense.raw_glosses || sense.raw_gloss) + .map(sense => { const glosses = sense.raw_glosses || sense.raw_gloss || sense.glosses; const glossesArray = Array.isArray(glosses) ? glosses : [glosses]; @@ -175,8 +205,7 @@ function handleLine(line) { tags.push(...sense.raw_tags); } - sense.glossesArray = glossesArray; - sense.tags = tags; + return {...sense, glossesArray, tags}; }); const sensesWithoutInflectionGlosses = sensesWithGlosses.filter(sense => { @@ -195,31 +224,36 @@ function handleLine(line) { saveIpaResult(word, readings, pos, ipaObj); } + /** @type {GlossTree} */ const glossTree = new Map(); for (const sense of sensesWithoutInflectionGlosses) { const { glossesArray, tags } = sense; let temp = glossTree; for (const [levelIndex, levelGloss] of glossesArray.entries()) { - if(!temp.get(levelGloss)) { - temp.set(levelGloss, new Map()); + let curr = temp.get(levelGloss); + if(!curr) { + curr = new Map(); + temp.set(levelGloss, curr); if(levelIndex === 0) { - temp.get(levelGloss).set('_tags', tags); + curr.set('_tags', tags); } } else if (levelIndex === 0) { - temp.get(levelGloss).set('_tags', tags.filter(value => temp.get(levelGloss).get('_tags').includes(value))); + curr.set('_tags', tags.filter(value => curr?.get('_tags')?.includes(value))); } - temp = temp.get(levelGloss); + temp = curr; } } for (const [gloss, children] of glossTree) { - const tags = children.get('_tags'); - children.delete('_tags'); + const tags = children.get('_tags') || []; + children.delete('_tags'); + /** @type {SenseInfo} */ const currSense = { glosses: [], tags }; if(children.size === 0) { currSense.glosses.push(gloss); } else { + /** @type {GlossTree} */ const branch = new Map(); branch.set(gloss, children); handleNest(branch, currSense); @@ -231,41 +265,57 @@ function handleLine(line) { } } +/** + * @param {Form[]|undefined} forms + * @param {string} word + * @param {string} pos + */ function processForms(forms, word, pos) { - if (forms) { - forms.forEach((formData) => { - const { form } = formData; - let { tags } = formData; - if (!form) return; - if (!tags) return; - if (form === 
'-') return; - tags = tags.filter(tag => !redundantTags.includes(tag)); - const isBlacklisted = tags.some(value => blacklistedTags.includes(value)); - if (isBlacklisted) return; - const isIdentity = !tags.some(value => !identityTags.includes(value)); - if (isIdentity) return; - - const wordMap = automatedForms.get(word) || new Map(); - const formMap = wordMap.get(form) || new Map(); - formMap.get(pos) || formMap.set(pos, new Set()); - wordMap.set(form, formMap); - automatedForms.set(word, wordMap); - - const tagsSet = new Set((formMap.get(pos))); - - tagsSet.add(sortTags(targetIso, tags).join(' ')); - - formMap.set(pos, similarSort(mergePersonTags(targetIso, Array.from(tagsSet)))); - }); - } + if(!forms) return; + forms.forEach((formData) => { + const { form } = formData; + let { tags } = formData; + if (!form) return; + if (!tags) return; + if (form === '-') return; + tags = tags.filter(tag => !redundantTags.includes(tag)); + const isBlacklisted = tags.some(value => blacklistedTags.includes(value)); + if (isBlacklisted) return; + const isIdentity = !tags.some(value => !identityTags.includes(value)); + if (isIdentity) return; + + const wordMap = automatedForms.get(word) || new Map(); + const formMap = wordMap.get(form) || new Map(); + formMap.get(pos) || formMap.set(pos, new Set()); + wordMap.set(form, formMap); + automatedForms.set(word, wordMap); + + const tagsSet = new Set((formMap.get(pos))); + + tagsSet.add(sortTags(targetIso, tags).join(' ')); + + formMap.set(pos, similarSort(mergePersonTags(targetIso, Array.from(tagsSet)))); + }); } +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + * @param {SenseInfo} currSense + */ function saveSenseResult(word, readings, pos, currSense) { for (const reading of readings) { lemmaDict[word][reading][pos].senses.push(currSense); } } +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + * @param {*} ipaObj + */ function saveIpaResult(word, readings, pos, ipaObj) { for (const reading of readings) { const result = lemmaDict[word][reading][pos]; @@ -275,6 +325,11 @@ function saveIpaResult(word, readings, pos, ipaObj) { } } +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + */ function initializeWordResult(word, readings, pos) { for (const reading of readings) { const result = ensureNestedObject(lemmaDict, [word, reading, pos]); @@ -283,6 +338,12 @@ function initializeWordResult(word, readings, pos) { } } +/** + * @param {Glosses|undefined} glosses + * @param {string} word + * @param {string} pos + * @returns + */ function processInflectionGlosses(glosses, word, pos) { switch (targetIso) { case 'de': @@ -290,6 +351,7 @@ function processInflectionGlosses(glosses, word, pos) { case 'en': return processEnglishInflectionGlosses(glosses, word, pos); case 'fr': + if(!glosses) return; let inflection, lemma; const match1 = glosses[0].match(/(.*)du verbe\s+((?:(?!\bdu\b).)*)$/); @@ -315,6 +377,12 @@ function processInflectionGlosses(glosses, word, pos) { } } +/** + * @param {*} glosses + * @param {string} word + * @param {string} pos + * @returns + */ function processGermanInflectionGlosses(glosses, word, pos) { const match1 = glosses[0].match(/(.*)des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens) (.*)$/); if (!match1 || match1.length < 3) return; @@ -325,6 +393,11 @@ function processGermanInflectionGlosses(glosses, word, pos) { } } +/** + * @param {NestedObject} obj + * @param {string[]} keys + * @returns {NestedObject} + 
*/ function ensureNestedObject(obj, keys) { for (const key of keys) { obj[key] ??= {}; @@ -333,9 +406,14 @@ function ensureNestedObject(obj, keys) { return obj; } +/** + * @param {Glosses|undefined} glosses + * @param {string} word + * @param {string} pos + */ function processEnglishInflectionGlosses(glosses, word, pos) { - if(!glosses) return; - glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim())); + if(!glosses || !Array.isArray(glosses)) return; + const glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim())); const lemmas = new Set(); const inflections = new Set(); for (const piece of glossPieces) { @@ -374,6 +452,10 @@ function processEnglishInflectionGlosses(glosses, word, pos) { } } +/** + * @param {KaikkiLine} line + * @returns {string|undefined} + */ function getCanonicalWordForm({word, forms}) { if(!forms) return word; @@ -392,6 +474,11 @@ function getCanonicalWordForm({word, forms}) { } } +/** + * @param {string|undefined} word + * @param {Form[]} forms + * @returns {string|undefined} + */ function getCanonicalForm(word, forms) { const canonicalForm = forms.find(form => form.tags && form.tags.includes('canonical') @@ -403,7 +490,7 @@ function getCanonicalForm(word, forms) { word = word.replace(/ {{#if:.+/, '').trim(); } - bracketsRegex = /\[.*\]$/; + const bracketsRegex = /\[.*\]$/; if (bracketsRegex.test(word)) { word = word.replace(bracketsRegex, '').trim(); } @@ -411,6 +498,11 @@ function getCanonicalForm(word, forms) { return word; } +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string[]} + */ function getReadings(word, line){ switch(sourceIso){ case 'fa': return [getPersianReading(word, line)]; @@ -420,13 +512,23 @@ function getReadings(word, line){ } } +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string} + */ function getPersianReading(word, line){ const {forms} = line; if(!forms) return word; const romanization = forms.find(({form, tags}) => tags && tags.includes('romanization') && tags.length === 1 && form); - return romanization ? 
romanization.form : word; +    return romanization?.form || word; +} +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string[]} + */ function getJapaneseReadings(word, line){ const {head_templates} = line; if(!head_templates) { diff --git a/jsconfig.json b/jsconfig.json new file mode 100644 index 0000000..866a5bb --- /dev/null +++ b/jsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "module": "ES2022", + "target": "ES2022", + "checkJs": true, + "strict": true, + "strictNullChecks": true, + "noImplicitAny": true, + "strictPropertyInitialization": true, + "suppressImplicitAnyIndexErrors": false + }, + "paths": { + "*": ["./types/*"], + "ext/json-schema": ["./types/ext/json-schema"] + }, + "exclude": ["node_modules", "**/node_modules/*"] +} + \ No newline at end of file diff --git a/types/types.ts b/types/types.ts new file mode 100644 index 0000000..40a33b9 --- /dev/null +++ b/types/types.ts @@ -0,0 +1,100 @@ +declare global { + type TidyEnv = { + source_iso: string, + target_iso: string, + kaikki_file: string, + tidy_folder: string, + } + + type KaikkiLine = { + head_templates?: HeadTemplate[]; + word?: string; + pos?: string; + sounds?: Sound[]; + forms?: Form[]; + senses?: KaikkiSense[]; + } + + type HeadTemplate = { + name?: string; + args?: string[]; + } + + type Sound = { + ipa?: string; + tags?: string[]; + note?: string; + } + + type Form = { + form?: string; + tags?: string[]; + } + + type KaikkiSense = { + glosses?: Glosses; + raw_glosses?: Glosses; + raw_gloss?: Glosses; + tags?: string[]; + raw_tags?: string[]; + form_of?: FormOf[]; + } + + type Glosses = string | string[]; + + type FormOf = { + word?: string; + } + + type GlossTree = Map<string, GlossTree> & { + get(key: '_tags'): string[] | undefined; + set(key: '_tags', value: string[]): GlossTree; + }; + + type TidySense = Omit<KaikkiSense, 'tags'> & { + tags: string[]; + glossesArray: string[]; + } + + type LemmaDict = { + [word: string]: { + [reading: string]: { + [pos: string]: LemmaInfo + } + } + } + + type LemmaInfo = { + ipa: IpaInfo[], + senses: SenseInfo[], + } + + type IpaInfo = { + ipa: string, + tags: string[], + } + + type SenseInfo = { + glosses: YomitanGloss[], + tags: string[], + } + + type YomitanGloss = string | StructuredGloss + + type StructuredGloss = { + type: "structured-content", + content: string | StructuredContent[], + } + + type StructuredContent = { + tag: string, + data: string, + content: StructuredContent, + } + + type NestedObject = { + [key: string]: NestedObject; + } +} + +export {} // This is needed to make this file a module \ No newline at end of file
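Note (not part of the patch): a minimal sketch of how "checkJs" in the new jsconfig.json interacts with the globals declared in types/types.ts. The file name example.js and the function describeLine are illustrative assumptions, not code from this repository.

    // example.js — illustrative only.
    // Under "checkJs": true + "strict": true, plain .js files are checked
    // against the ambient types from types/types.ts (declare global).

    // process.env is NodeJS.ProcessEnv (string | undefined values), so,
    // as in 3-tidy-up.js above, it is cast to the expected TidyEnv shape:
    const { source_iso: sourceIso, target_iso: targetIso } =
        /** @type {TidyEnv} */ (process.env);

    /**
     * @param {KaikkiLine} line
     * @returns {string}
     */
    function describeLine(line) {
        // line.pos is string | undefined, so strictNullChecks forces a guard:
        return line.pos ? `${line.word ?? '?'} (${line.pos})` : 'no pos';
    }

The diagnostics VS Code surfaces via "javascript.validate.enable" should be reproducible on the command line with something like `npx tsc -p jsconfig.json --noEmit`, though outside the editor tsc may additionally require "allowJs": true, which jsconfig files only imply inside the editor.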
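Note (not part of the patch): the GlossTree overloads encode a convention used in handleLine: '_tags' is a reserved key whose value is a tag list rather than a child tree. A minimal sketch, with made-up gloss strings:

    // Mirrors how 3-tidy-up.js builds its tree of nested glosses.
    /** @type {GlossTree} */
    const glossTree = new Map();

    /** @type {GlossTree} */
    const child = new Map();
    child.set('_tags', ['informal']); // set() overload: value is string[]
    glossTree.set('to run', child);   // ordinary entry: value is a child GlossTree

    // The get() overload narrows the reserved key, so tags can be read
    // back without a cast (cf. children.get('_tags') || [] in the patch):
    const tags = glossTree.get('to run')?.get('_tags') ?? [];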
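Note (not part of the patch): ensureNestedObject walks a key path, creating empty objects as needed, and returns the innermost one; that is what lets initializeWordResult assign into lemmaDict[word][reading][pos] safely. A small usage sketch with hypothetical keys:

    /** @type {NestedObject} */
    const dict = {};
    const leaf = ensureNestedObject(dict, ['run', 'ran', 'verb']);
    // dict is now { run: { ran: { verb: {} } } }, and leaf === dict.run.ran.verb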