Skip to content

Commit

Permalink
[ja] add readings (#129)
Browse files Browse the repository at this point in the history
* mvp

* mvp

* multiple readings

* initialize

* handle multiple readings better

* finish
  • Loading branch information
StefanVukovic99 authored Aug 11, 2024
1 parent 097927c commit d496ba0
Show file tree
Hide file tree
Showing 9 changed files with 7,337 additions and 15 deletions.
92 changes: 77 additions & 15 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ function handleLine(line) {
if(!pos) return;
const word = getCanonicalForm(parsedLine);
if (!word) return;
const reading = getReading(word, parsedLine);
const readings = getReadings(word, parsedLine);

if (forms) {
forms.forEach((formData) => {
Expand Down Expand Up @@ -214,19 +214,12 @@ function handleLine(line) {

if (sensesWithoutInflectionGlosses.length === 0) return;

lemmaDict[word] ??= {};
lemmaDict[word][reading] ??= {};
lemmaDict[word][reading][pos] ??= {};
lemmaDict[word][reading][pos].ipa ??= [];
initializeWordResult(word, readings, pos);

for (const ipaObj of ipa) {
if (!lemmaDict[word][reading][pos].ipa.some(obj => obj.ipa === ipaObj.ipa)) {
lemmaDict[word][reading][pos].ipa.push(ipaObj);
}
saveIpaResult(word, readings, pos, ipaObj);
}

lemmaDict[word][reading][pos].senses ??= [];

const glossTree = new Map();
for (const sense of sensesWithoutInflectionGlosses) {
const { glossesArray, tags } = sense;
Expand Down Expand Up @@ -258,11 +251,34 @@ function handleLine(line) {
}

if (currSense.glosses.length > 0) {
lemmaDict[word][reading][pos].senses.push(currSense);
saveSenseResult(word, readings, pos, currSense);
}
}
}

/**
 * Appends one sense to the `senses` list of every given reading of a word.
 *
 * The same `currSense` object (not a copy) is pushed for each reading.
 * The `lemmaDict[word][reading][pos].senses` arrays must already exist.
 *
 * @param {string} word - key into `lemmaDict`
 * @param {Iterable<string>} readings - readings to store the sense under
 * @param {string} pos - part-of-speech key below each reading
 * @param {object} currSense - sense object to append
 */
function saveSenseResult(word, readings, pos, currSense) {
    Array.from(readings).forEach((reading) => {
        const { senses } = lemmaDict[word][reading][pos];
        senses.push(currSense);
    });
}

/**
 * Records a pronunciation under every given reading of a word, skipping
 * readings that already hold an entry with the same `ipa` value.
 *
 * The `lemmaDict[word][reading][pos].ipa` arrays must already exist.
 *
 * @param {string} word - key into `lemmaDict`
 * @param {Iterable<string>} readings - readings to store the pronunciation under
 * @param {string} pos - part-of-speech key below each reading
 * @param {{ipa: *}} ipaObj - pronunciation object; compared by its `ipa` field
 */
function saveIpaResult(word, readings, pos, ipaObj) {
    for (const reading of readings) {
        const { ipa: knownIpa } = lemmaDict[word][reading][pos];
        const alreadyKnown = knownIpa.some((known) => known.ipa === ipaObj.ipa);
        if (alreadyKnown) {
            continue;
        }
        knownIpa.push(ipaObj);
    }
}

/**
 * Makes sure `lemmaDict[word][reading][pos]` exists for every given reading
 * and carries both an `ipa` and a `senses` array.
 *
 * Existing (non-nullish) `ipa` / `senses` values are left untouched.
 *
 * @param {string} word - key into `lemmaDict`
 * @param {Iterable<string>} readings - readings to initialise
 * @param {string} pos - part-of-speech key below each reading
 */
function initializeWordResult(word, readings, pos) {
    for (const reading of readings) {
        const entry = ensureNestedObject(lemmaDict, [word, reading, pos]);
        if (entry.ipa == null) {
            entry.ipa = [];
        }
        if (entry.senses == null) {
            entry.senses = [];
        }
    }
}

function processInflectionGlosses(glosses, word, pos) {
switch (targetIso) {
case 'de':
Expand Down Expand Up @@ -305,6 +321,14 @@ function processGermanInflectionGlosses(glosses, word, pos) {
}
}

/**
 * Walks `keys` down from `obj`, creating an empty object at every level that
 * is missing (or holds null/undefined), and returns the innermost object.
 *
 * Only OWN properties count as "already present". Keys come from dictionary
 * data, so a word such as "constructor", "toString" or "__proto__" must create
 * a normal entry instead of resolving to (and then writing into) something
 * inherited from Object.prototype.
 *
 * @param {object} obj - root object; mutated in place
 * @param {Iterable<string>} keys - path of property names to ensure
 * @returns {object} the object at the end of the path (`obj` itself for an empty path)
 */
function ensureNestedObject(obj, keys) {
    let current = obj;
    for (const key of keys) {
        const hasOwnValue =
            Object.prototype.hasOwnProperty.call(current, key) && current[key] != null;
        if (!hasOwnValue) {
            // defineProperty (rather than plain assignment) so that a "__proto__" key
            // becomes an ordinary own data property instead of replacing the prototype.
            Object.defineProperty(current, key, {
                value: {},
                writable: true,
                enumerable: true,
                configurable: true,
            });
        }
        current = current[key];
    }
    return current;
}

function processEnglishInflectionGlosses(glosses, word, pos) {
if(!glosses) return;
glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim()));
Expand Down Expand Up @@ -368,12 +392,12 @@ function getCanonicalForm({word, forms}) {
return word;
}

function getReading(word, line){
function getReadings(word, line){
switch(sourceIso){
case 'fa':
return getPersianReading(word, line);
case 'fa': return [getPersianReading(word, line)];
case 'ja': return getJapaneseReadings(word, line);
default:
return word;
return [word];
}
}

Expand All @@ -384,6 +408,44 @@ function getPersianReading(word, line){
return romanization ? romanization.form : word;
}

/**
 * Collects the reading(s) of a Japanese entry from its `head_templates`.
 *
 * The reading is taken from `args[1]` for the ja-noun / ja-adj / ja-verb /
 * ja-verb form / ja-verb-form / ja-phrase templates and from `args[2]` for
 * ja-pos. All other templates (including 'head', 'ja-def', 'ja-syllable' and
 * unknown names) contribute nothing. `^` characters and spaces are stripped
 * from each reading (presumably romanization/segmentation markup - confirm
 * against the template docs).
 *
 * Readings are returned in first-seen order without duplicates, so callers
 * that store one result per reading do not store the same data twice.
 *
 * @param {string} word - canonical form, used as the fallback reading
 * @param {{head_templates?: Array<{name?: string, args?: object}>}} line - parsed entry
 * @returns {string[]} at least one reading; `[word]` when none can be extracted
 */
function getJapaneseReadings(word, line){
    const {head_templates} = line;
    // Missing head_templates happens (among others) on kanji and alt forms.
    if(!Array.isArray(head_templates) || head_templates.length === 0) {
        return [word];
    }

    const readings = new Set(); // Set keeps insertion order and drops repeats
    for (const template of head_templates) {
        let reading;
        switch(template?.name) {
            case 'ja-noun':
            case 'ja-adj':
            case 'ja-verb':
            case 'ja-verb form':
            case 'ja-verb-form':
            case 'ja-phrase':
                reading = template.args?.[1];
                break;
            case 'ja-pos':
                reading = template.args?.[2];
                break;
            default:
                // 'head', 'ja-def', 'ja-syllable' and unknown templates carry no reading here.
                continue;
        }
        if(typeof reading !== 'string') continue;

        const cleaned = reading.replace(/\^| /g, '');
        // A reading made only of markup characters would otherwise become an empty key.
        if(cleaned) {
            readings.add(cleaned);
        }
    }

    return readings.size > 0 ? [...readings] : [word];
}

function handleAutomatedForms() {
consoleOverwrite('3-tidy-up.js: Handling automated forms...');

Expand Down
86 changes: 86 additions & 0 deletions data/test/dict/ja/en/tag_bank_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
[
[
"adj",
"partOfSpeech",
-1,
"adjective",
1
],
[
"n",
"partOfSpeech",
-1,
"noun",
1
],
[
"fig",
"",
0,
"figuratively",
0
],
[
"fig",
"",
0,
"figurative",
0
],
[
"abbv",
"",
0,
"abbreviation",
0
],
[
"rare",
"",
1,
"rare",
-1
],
[
"obs",
"archaism",
0,
"obsolete",
0
],
[
"v",
"partOfSpeech",
-1,
"verb",
1
],
[
"vt",
"partOfSpeech",
-1,
"transitive verb",
1
],
[
"euph",
"",
0,
"euphemistic",
0
],
[
"music",
"",
0,
"music",
0
],
[
"phrase",
"partOfSpeech",
-1,
"phrase",
1
]
]
Loading

0 comments on commit d496ba0

Please sign in to comment.