Skip to content

Commit

Permalink
vary canonical form behavior by language (#136)
Browse files Browse the repository at this point in the history
* [en-*] stop using canonical form

* fix for de

* finish
  • Loading branch information
StefanVukovic99 authored Aug 23, 2024
1 parent ea9d365 commit e6e919b
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 4 deletions.
23 changes: 19 additions & 4 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ function handleLine(line) {
const parsedLine = JSON.parse(line);
const { pos, sounds, forms } = parsedLine;
if(!pos) return;
const word = getCanonicalForm(parsedLine);
const word = getCanonicalWordForm(parsedLine);
if (!word) return;
const readings = getReadings(word, parsedLine);

Expand Down Expand Up @@ -371,11 +371,26 @@ function processEnglishInflectionGlosses(glosses, word, pos) {
}
}

function getCanonicalForm({word, forms}) {
function getCanonicalWordForm({word, forms}) {
if(!forms) return word;

const canonicalForm = forms.find(form =>
form.tags &&
switch(sourceIso) {
case 'ar':
case 'fa':
case 'la':
case 'ru':
return getCanonicalForm(word, forms); // canonical form is known to contain accent marks and such
case 'de':
// case 'fr': // the canonical form sometimes just prepends the definite article, but many canonical forms differ from the word in which apostrophe variant they use. It is unclear which variant is used in practice, so French is left on the default path until there is a Yomitan preprocessor for French apostrophe usage.
case 'en':
return word; // canonical form is redundant, e.g. just prepends the definite article
default:
return getCanonicalForm(word, forms); // default could go either way. keeping existing behavior for now
}
}

function getCanonicalForm(word, forms) {
const canonicalForm = forms.find(form => form.tags &&
form.tags.includes('canonical')
);
if (canonicalForm && canonicalForm.form) {
Expand Down
21 changes: 21 additions & 0 deletions data/test/dict/en/en/tag_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,26 @@
0,
"figuratively",
0
],
[
"n",
"partOfSpeech",
-1,
"noun",
1
],
[
"arch",
"archaism",
4,
"archaic",
-4
],
[
"ltrry",
"",
0,
"literary",
0
]
]
12 changes: 12 additions & 0 deletions data/test/dict/en/en/term_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,17 @@
],
0,
""
],
[
"wain",
"",
"n arch ltrry",
"n",
0,
[
"A wagon; a four-wheeled cart for hauling loads, usually pulled by horses or oxen."
],
0,
""
]
]
17 changes: 17 additions & 0 deletions data/test/dict/en/en/term_bank_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,22 @@
],
0,
""
],
[
"wains",
"",
"non-lemma",
"",
0,
[
[
"wain",
[
"plural"
]
]
],
0,
""
]
]
13 changes: 13 additions & 0 deletions data/test/ipa/en/en/term_meta_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,18 @@
}
]
}
],
[
"wain",
"ipa",
{
"reading": "wain",
"transcriptions": [
{
"ipa": "/weɪn/",
"tags": []
}
]
}
]
]
1 change: 1 addition & 0 deletions data/test/kaikki/en-en.json

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions data/test/tidy/en-en-forms-0.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,28 @@
]
]
}
],
[
"wain",
{
"_type": "map",
"map": [
[
"wains",
{
"_type": "map",
"map": [
[
"noun",
[
"plural"
]
]
]
}
]
]
}
]
]
}
23 changes: 23 additions & 0 deletions data/test/tidy/en-en-lemmas.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,28 @@
]
}
}
},
"wain": {
"wain": {
"noun": {
"ipa": [
{
"ipa": "/weɪn/",
"tags": []
}
],
"senses": [
{
"glosses": [
"(archaic or literary) A wagon; a four-wheeled cart for hauling loads, usually pulled by horses or oxen."
],
"tags": [
"archaic",
"literary"
]
}
]
}
}
}
}

0 comments on commit e6e919b

Please sign in to comment.