From c9ba3d1d3aefa5c24b188b307971622a66c00149 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 5 Jan 2024 10:34:52 +0100 Subject: [PATCH 1/3] Use linkage model in German extractor --- src/wiktextract/extractor/de/linkage.py | 9 +++--- src/wiktextract/extractor/de/models.py | 40 ++++++++++++++----------- tests/test_de_linkages.py | 22 ++++++++++---- 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/wiktextract/extractor/de/linkage.py b/src/wiktextract/extractor/de/linkage.py index 5805c35e..40d1d924 100644 --- a/src/wiktextract/extractor/de/linkage.py +++ b/src/wiktextract/extractor/de/linkage.py @@ -2,7 +2,8 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode -from wiktextract.extractor.de.models import WordEntry + +from wiktextract.extractor.de.models import Linkage, WordEntry from wiktextract.extractor.share import split_senseids from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -25,7 +26,7 @@ def extract_linkages( ) # Extract links - linkages: list[str] = [] + linkages: list[Linkage] = [] if linkage_type == "expressions": for child in list_item.children: if isinstance(child, str) and contains_dash(child): @@ -90,12 +91,12 @@ def extract_linkages( def process_link( - wxr: WiktextractContext, semantic_links: list[str], link: WikiNode + wxr: WiktextractContext, semantic_links: list[Linkage], link: WikiNode ): clean_link = clean_node(wxr, {}, link) if clean_link.startswith("Verzeichnis:"): return - semantic_links.append(clean_link) + semantic_links.append(Linkage(word=clean_link)) def contains_dash(text: str): diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py index 137442b3..bafba0eb 100644 --- a/src/wiktextract/extractor/de/models.py +++ b/src/wiktextract/extractor/de/models.py @@ -7,6 +7,10 @@ class BaseModelWrap(BaseModel): model_config = ConfigDict(validate_assignment=True, extra="forbid") +class Linkage(BaseModelWrap): + word: str + + class Translation(BaseModelWrap): sense: Optional[str] = Field( default=None, description="A gloss of the sense being translated" @@ -120,15 +124,15 @@ class Sense(BaseModelWrap): default=None, description="Sense number used in Wiktionary" ) translations: Optional[list[Translation]] = [] - antonyms: Optional[list[str]] = [] - derived: Optional[list[str]] = [] - hyponyms: Optional[list[str]] = [] - hypernyms: Optional[list[str]] = [] - holonyms: Optional[list[str]] = [] - expressions: Optional[list[str]] = [] - coordinate_terms: Optional[list[str]] = [] - proverbs: Optional[list[str]] = [] - synonyms: Optional[list[str]] = [] + antonyms: Optional[list[Linkage]] = [] + derived: Optional[list[Linkage]] = [] + hyponyms: Optional[list[Linkage]] = [] + hypernyms: Optional[list[Linkage]] = [] + holonyms: Optional[list[Linkage]] = [] + expressions: Optional[list[Linkage]] = [] + coordinate_terms: Optional[list[Linkage]] = [] + proverbs: Optional[list[Linkage]] = [] + synonyms: Optional[list[Linkage]] = [] class Sound(BaseModelWrap): @@ -185,12 +189,12 @@ class WordEntry(BaseModelWrap): # ) translations: Optional[list[Translation]] = [] sounds: Optional[list[Sound]] = [] - antonyms: Optional[list[str]] = [] - derived: Optional[list[str]] = [] - hyponyms: Optional[list[str]] = [] - hypernyms: Optional[list[str]] = [] - holonyms: Optional[list[str]] = [] - expressions: Optional[list[str]] = [] - coordinate_terms: Optional[list[str]] = [] - proverbs: Optional[list[str]] = [] - synonyms: Optional[list[str]] = [] + antonyms: Optional[list[Linkage]] = [] + derived: Optional[list[Linkage]] = [] + hyponyms: Optional[list[Linkage]] = [] + hypernyms: Optional[list[Linkage]] = [] + holonyms: Optional[list[Linkage]] = [] + expressions: Optional[list[Linkage]] = [] + coordinate_terms: Optional[list[Linkage]] = [] + proverbs: Optional[list[Linkage]] = [] + synonyms: Optional[list[Linkage]] = [] diff --git a/tests/test_de_linkages.py b/tests/test_de_linkages.py index 70a73a31..69a31607 100644 --- a/tests/test_de_linkages.py +++ b/tests/test_de_linkages.py @@ -33,11 +33,17 @@ def test_de_extract_linkages(self): "senses": [ { "senseid": "1", - "coordinate_terms": ["Beleg", "Exempel"], + "coordinate_terms": [ + {"word": "Beleg"}, + {"word": "Exempel"}, + ], }, { "senseid": "2", - "coordinate_terms": ["Muster", "Vorbild"], + "coordinate_terms": [ + {"word": "Muster"}, + {"word": "Vorbild"}, + ], }, ] }, @@ -50,7 +56,9 @@ def test_de_extract_linkages(self): "expected": { "senses": [ { - "expressions": ["ein gutes Beispiel geben"], + "expressions": [ + {"word": "ein gutes Beispiel geben"} + ], } ] }, @@ -60,7 +68,9 @@ def test_de_extract_linkages(self): "input": "====Synonyme====\n:[[Synonym1]]", "senses": [Sense(senseid="1")], "expected": { - "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}], + "senses": [ + {"senseid": "1", "synonyms": [{"word": "Synonym1"}]} + ], }, }, # https://de.wiktionary.org/wiki/Kokospalme @@ -73,8 +83,8 @@ def test_de_extract_linkages(self): { "senseid": "1", "synonyms": [ - "Kokosnusspalme", - "Cocos nucifera", + {"word": "Kokosnusspalme"}, + {"word": "Cocos nucifera"}, ], } ], From b9765753d651b271cf4843555d6885ebe244dcc3 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 5 Jan 2024 10:44:33 +0100 Subject: [PATCH 2/3] Rename lang_name to lang --- src/wiktextract/extractor/de/models.py | 8 ++-- src/wiktextract/extractor/de/page.py | 12 ++--- src/wiktextract/extractor/de/pronunciation.py | 4 +- src/wiktextract/extractor/de/translation.py | 6 +-- src/wiktextract/extractor/es/models.py | 2 +- src/wiktextract/extractor/es/page.py | 7 +-- src/wiktextract/extractor/ru/models.py | 4 +- src/wiktextract/extractor/ru/page.py | 6 +-- src/wiktextract/extractor/ru/translation.py | 4 +- tests/test_de_example.py | 2 +- tests/test_de_gloss.py | 2 +- tests/test_de_linkages.py | 4 +- tests/test_de_page.py | 14 +++--- tests/test_de_pronunciation.py | 10 ++-- tests/test_de_translation.py | 46 ++++++++++--------- tests/test_es_etymology.py | 7 ++- tests/test_es_gloss.py | 3 +- tests/test_es_page.py | 3 +- tests/test_es_pronunciation.py | 3 +- tests/test_es_translation.py | 3 +- tests/test_ru_gloss.py | 2 +- tests/test_ru_page.py | 2 +- tests/test_ru_pronunciation.py | 2 +- tests/test_ru_translation.py | 18 ++++---- 24 files changed, 90 insertions(+), 84 deletions(-) diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py index bafba0eb..f6ecaa6b 100644 --- a/src/wiktextract/extractor/de/models.py +++ b/src/wiktextract/extractor/de/models.py @@ -20,7 +20,7 @@ class Translation(BaseModelWrap): default=None, description="Wiktionary language code of the translation term", ) - lang_name: Optional[str] = Field( + lang: Optional[str] = Field( default=None, description="Localized language name" ) uncertain: Optional[bool] = Field( @@ -151,9 +151,7 @@ class Sound(BaseModelWrap): lang_code: list[str] = Field( default=[], description="Wiktionary language code" ) - lang_name: list[str] = Field( - default=[], description="Localized language name" - ) + lang: list[str] = Field(default=[], description="Localized language name") # roman: list[str] = Field( # default=[], description="Translitaration to Roman characters" # ) @@ -179,7 +177,7 @@ class WordEntry(BaseModelWrap): lang_code: str = Field( description="Wiktionary language code", examples=["es"] ) - lang_name: str = Field( + lang: str = Field( description="Localized language name of the word", examples=["español"] ) senses: Optional[list[Sense]] = [] diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 52993e1c..eda76540 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -272,15 +272,15 @@ def parse_page( for level2_node in tree.find_child(NodeKind.LEVEL2): for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE): # The language sections are marked with - # == ({{Sprache|<lang_name>}}) == - # where <title> is the title of the page and <lang_name> is the + # == <title> ({{Sprache|<lang>}}) == + # where <title> is the title of the page and <lang> is the # German name of the language of the section. if subtitle_template.template_name == "Sprache": - lang_name = subtitle_template.template_parameters.get(1) - lang_code = name_to_code(lang_name, "de") + lang = subtitle_template.template_parameters.get(1) + lang_code = name_to_code(lang, "de") if lang_code == "": wxr.wtp.warning( - f"Unknown language: {lang_name}", + f"Unknown language: {lang}", sortid="extractor/de/page/parse_page/76", ) if ( @@ -290,7 +290,7 @@ def parse_page( continue base_data = WordEntry( - lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title + lang=lang, lang_code=lang_code, word=wxr.wtp.title ) parse_section(wxr, page_data, base_data, level2_node.children) diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index 545d016f..43563fa2 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -99,14 +99,14 @@ def process_lautschrift_template( lang_code = template_parameters.get("spr") if lang_code: - lang_name = code_to_name(lang_code, "de") + lang = code_to_name(lang_code, "de") add_sound_data_without_appending_to_existing_properties( wxr, sound_data, { "ipa": [ipa], "lang_code": lang_code, - "lang_name": lang_name, + "lang": lang, }, ) else: diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py index 4f9c744b..f2dcfc43 100644 --- a/src/wiktextract/extractor/de/translation.py +++ b/src/wiktextract/extractor/de/translation.py @@ -103,10 +103,10 @@ def process_translation_list( lang_code = node.template_parameters.get(1) translation_data.lang_code = lang_code - translation_data.lang_name = code_to_name(lang_code, "de") - if translation_data.lang_name == "": + translation_data.lang = code_to_name(lang_code, "de") + if translation_data.lang == "": wxr.wtp.debug( - f"Unknown language code: {translation_data.lang_name}", + f"Unknown language code: {translation_data.lang}", sortid="extractor/de/translation/process_translation_list/70", ) if node.template_name[-1] == "?": diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index 54c7e261..838b913c 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -156,7 +156,7 @@ class WordEntry(BaseModelWrap): lang_code: str = Field( description="Wiktionary language code", examples=["es"] ) - lang_name: str = Field( + lang: str = Field( description="Localized language name of the word", examples=["español"] ) senses: Optional[list[Sense]] = [] diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index 14d4a4f9..6602cc38 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -4,6 +4,7 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList + from wiktextract.extractor.es.etymology import process_etymology_block from wiktextract.extractor.es.example import extract_example from wiktextract.extractor.es.gloss import extract_gloss @@ -368,10 +369,10 @@ def parse_page( ): continue - lang_name = clean_node(wxr, categories, subtitle_template) - wxr.wtp.start_section(lang_name) + lang = clean_node(wxr, categories, subtitle_template) + wxr.wtp.start_section(lang) base_data = WordEntry( - lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title + lang=lang, lang_code=lang_code, word=wxr.wtp.title ) base_data.categories.extend(categories["categories"]) parse_entries(wxr, page_data, base_data, level2_node) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 07e7aab7..0f431fdd 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -12,7 +12,7 @@ class Translation(BaseModelWrap): lang_code: str = Field( description="Wiktionary language code of the translation term" ) - lang_name: str = Field( + lang: str = Field( description="Localized language name of the translation term" ) sense: Optional[str] = Field( @@ -112,7 +112,7 @@ class WordEntry(BaseModelWrap): lang_code: str = Field( description="Wiktionary language code", examples=["ru"] ) - lang_name: str = Field( + lang: str = Field( description="Localized language name of the word", examples=["Русский"] ) categories: list[str] = Field( diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index c63567c1..de225dda 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -202,11 +202,11 @@ def parse_page( categories = {"categories": []} - lang_name = clean_node(wxr, categories, subtitle_template) - wxr.wtp.start_section(lang_name) + lang = clean_node(wxr, categories, subtitle_template) + wxr.wtp.start_section(lang) base_data = WordEntry( - lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title + lang=lang, lang_code=lang_code, word=wxr.wtp.title ) base_data.categories.extend(categories["categories"]) diff --git a/src/wiktextract/extractor/ru/translation.py b/src/wiktextract/extractor/ru/translation.py index e1f1a0b0..15c9c2b7 100644 --- a/src/wiktextract/extractor/ru/translation.py +++ b/src/wiktextract/extractor/ru/translation.py @@ -20,7 +20,7 @@ def extract_translations( for key, raw_value in template_node.template_parameters.items(): if isinstance(key, str): lang_code = key - lang_name = code_to_name(lang_code, "ru") + lang = code_to_name(lang_code, "ru") for value_node in ( raw_value @@ -36,7 +36,7 @@ def extract_translations( word_entry.translations.append( Translation( lang_code=lang_code, - lang_name=lang_name, + lang=lang, word=word, sense=sense if sense else None, ) diff --git a/tests/test_de_example.py b/tests/test_de_example.py index 29a903f6..a95667f0 100644 --- a/tests/test_de_example.py +++ b/tests/test_de_example.py @@ -20,7 +20,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_page_data(self) -> list[WordEntry]: - return [WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")] + return [WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")] def test_de_extract_examples(self): self.wxr.wtp.start_page("") diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index a79f8ee9..6eeb264f 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -28,7 +28,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_word_entry(self): - return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel") + return WordEntry(lang_code="de", lang="Deutsch", word="Beispiel") def test_de_extract_glosses(self): self.wxr.wtp.start_page("") diff --git a/tests/test_de_linkages.py b/tests/test_de_linkages.py index 69a31607..7de21683 100644 --- a/tests/test_de_linkages.py +++ b/tests/test_de_linkages.py @@ -20,7 +20,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_word_entry(self) -> WordEntry: - return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch") + return WordEntry(word="Beispiel", lang_code="de", lang="Deutsch") def test_de_extract_linkages(self): test_cases = [ @@ -105,7 +105,7 @@ def test_de_extract_linkages(self): self.assertEqual( word_entry.model_dump( exclude_defaults=True, - exclude={"word", "lang_code", "lang_name"}, + exclude={"word", "lang_code", "lang"}, ), case["expected"], ) diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 33f0e64f..77e11e7a 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -29,7 +29,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_base_data(self): - return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel") + return WordEntry(lang_code="de", lang="Deutsch", word="Beispiel") def test_de_parse_page(self): self.wxr.wtp.add_page("Vorlage:Sprache", 10, "") @@ -45,7 +45,7 @@ def test_de_parse_page(self): lst, [ { - "lang_name": "Deutsch", + "lang": "Deutsch", "lang_code": "de", "word": "Beispiel", "pos": "noun", @@ -71,7 +71,7 @@ def test_de_parse_page_skipping_head_templates(self): lst, [ { - "lang_name": "Deutsch", + "lang": "Deutsch", "lang_code": "de", "word": "Beispiel", "pos": "noun", @@ -104,7 +104,7 @@ def test_de_parse_section(self): { "word": "Beispiel", "lang_code": "de", - "lang_name": "Deutsch", + "lang": "Deutsch", "pos": "adj", "senses": [ { @@ -118,7 +118,7 @@ def test_de_parse_section(self): "word": "Beispiel", "lang_code": "de", "pos": "adv", - "lang_name": "Deutsch", + "lang": "Deutsch", "senses": [ { "glosses": ["gloss1"], @@ -131,7 +131,7 @@ def test_de_parse_section(self): "word": "Beispiel", "lang_code": "de", "pos": "verb", - "lang_name": "Deutsch", + "lang": "Deutsch", "senses": [ { "glosses": ["gloss2"], @@ -144,7 +144,7 @@ def test_de_parse_section(self): "word": "Beispiel", "lang_code": "de", "pos": "noun", - "lang_name": "Deutsch", + "lang": "Deutsch", "senses": [ { "glosses": ["gloss3"], diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index ccf288de..19047dc8 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -4,8 +4,10 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.models import Sound -from wiktextract.extractor.de.pronunciation import (process_hoerbeispiele, - process_ipa) +from wiktextract.extractor.de.pronunciation import ( + process_hoerbeispiele, + process_ipa, +) from wiktextract.wxr_context import WiktextractContext @@ -35,7 +37,7 @@ def test_de_process_ipa(self): "expected": [ { "ipa": ["ipa1"], - "lang_name": ["Deutsch"], + "lang": ["Deutsch"], "lang_code": ["de"], } ], @@ -46,7 +48,7 @@ def test_de_process_ipa(self): {"ipa": ["ipa1", "ipa2"]}, { "ipa": ["ipa3"], - "lang_name": ["Deutsch"], + "lang": ["Deutsch"], "lang_code": ["de"], }, ], diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py index 5bd65ed3..21e20493 100644 --- a/tests/test_de_translation.py +++ b/tests/test_de_translation.py @@ -4,8 +4,10 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.models import Sense, Translation, WordEntry -from wiktextract.extractor.de.translation import (extract_translation, - process_translation_list) +from wiktextract.extractor.de.translation import ( + extract_translation, + process_translation_list, +) from wiktextract.wxr_context import WiktextractContext @@ -21,7 +23,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_word_entry(self): - return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch") + return WordEntry(word="Beispiel", lang_code="de", lang="Deutsch") def test_de_extract_translation(self): test_cases = [ @@ -37,7 +39,7 @@ def test_de_extract_translation(self): { "sense": "Beispiel", "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "example", } ], @@ -59,7 +61,7 @@ def test_de_extract_translation(self): { "sense": "Beispiel", "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "example", } ], @@ -79,7 +81,7 @@ def test_de_extract_translation(self): { "sense": "Beispiel", "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "example", } ], @@ -100,7 +102,7 @@ def test_de_extract_translation(self): self.assertEqual( word_entry.model_dump( exclude_defaults=True, - exclude={"word", "lang_code", "lang_name"}, + exclude={"word", "lang_code", "lang"}, ), case["expected"], ) @@ -114,7 +116,7 @@ def test_de_process_translation_list(self): "expected_sense_translations": [ { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "example", } ], @@ -126,7 +128,7 @@ def test_de_process_translation_list(self): "expected_sense_translations": [ { "lang_code": "hy", - "lang_name": "Armenisch", + "lang": "Armenisch", "word": "օրինակ", "roman": "orinak", } @@ -140,7 +142,7 @@ def test_de_process_translation_list(self): "expected_sense_translations": [ { "lang_code": "ru", - "lang_name": "Russisch", + "lang": "Russisch", "word": "пример", "roman": "primer", } @@ -154,7 +156,7 @@ def test_de_process_translation_list(self): "expected_sense_translations": [ { "lang_code": "ar", - "lang_name": "Arabisch", + "lang": "Arabisch", "word": "عريضة", "uncertain": True, } @@ -201,12 +203,12 @@ def test_de_process_translation_list_with_modifiers(self): "expected_sense_translations": [ { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "instance", }, { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "model", "tags": ["Vorbild"], }, @@ -220,7 +222,7 @@ def test_de_process_translation_list_with_modifiers(self): "expected_sense_translations": [ { "lang_code": "fr", - "lang_name": "Französisch", + "lang": "Französisch", "word": "exemple", "tags": ["m"], } @@ -234,19 +236,19 @@ def test_de_process_translation_list_with_modifiers(self): "expected_sense_translations": [ { "lang_code": "la", - "lang_name": "Latein", + "lang": "Latein", "word": "crus", "tags": ["f"], }, { "lang_code": "la", - "lang_name": "Latein", + "lang": "Latein", "word": "camba", "tags": ["vulgärlateinisch", "f"], }, { "lang_code": "la", - "lang_name": "Latein", + "lang": "Latein", "word": "gamba", "tags": ["vulgärlateinisch", "f"], }, @@ -262,30 +264,30 @@ def test_de_process_translation_list_with_modifiers(self): "expected_sense_translations": [ { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "subscription", "tags": ["[1a]"], }, { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "dues", }, { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "membership fee", "tags": ["[1", "2]"], }, { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "contribution", "tags": ["[3]"], }, { "lang_code": "en", - "lang_name": "Englisch", + "lang": "Englisch", "word": "article", }, ], diff --git a/tests/test_es_etymology.py b/tests/test_es_etymology.py index fccd3484..dbe69b37 100644 --- a/tests/test_es_etymology.py +++ b/tests/test_es_etymology.py @@ -1,6 +1,7 @@ import unittest from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.etymology import process_etymology_block from wiktextract.extractor.es.models import WordEntry @@ -84,15 +85,13 @@ def test_es_extract_etymology(self): with self.subTest(case=case): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(case["input"]) - data = WordEntry( - word="test", lang_code="es", lang_name="Español" - ) + data = WordEntry(word="test", lang_code="es", lang="Español") process_etymology_block(self.wxr, data, root) case["expected"].update( { "word": "test", "lang_code": "es", - "lang_name": "Español", + "lang": "Español", } ) self.assertEqual( diff --git a/tests/test_es_gloss.py b/tests/test_es_gloss.py index aed14433..a22409f3 100644 --- a/tests/test_es_gloss.py +++ b/tests/test_es_gloss.py @@ -2,6 +2,7 @@ from typing import List from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.gloss import extract_gloss from wiktextract.extractor.es.models import WordEntry @@ -19,7 +20,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_page_data(self) -> List[WordEntry]: - return [WordEntry(word="test", lang_code="es", lang_name="Language")] + return [WordEntry(word="test", lang_code="es", lang="Language")] def test_es_extract_glosses(self): # https://es.wiktionary.org/wiki/ayudar diff --git a/tests/test_es_page.py b/tests/test_es_page.py index aeda2e44..9e98a043 100644 --- a/tests/test_es_page.py +++ b/tests/test_es_page.py @@ -1,6 +1,7 @@ import unittest from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.models import WordEntry from wiktextract.extractor.es.page import parse_entries @@ -18,7 +19,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_page_data(self) -> list[WordEntry]: - return [WordEntry(word="test", lang_code="es", lang_name="Language")] + return [WordEntry(word="test", lang_code="es", lang="Language")] def test_es_parse_entries(self): """ diff --git a/tests/test_es_pronunciation.py b/tests/test_es_pronunciation.py index 7a57f54c..6cc44fbe 100644 --- a/tests/test_es_pronunciation.py +++ b/tests/test_es_pronunciation.py @@ -1,6 +1,7 @@ import unittest from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.models import WordEntry from wiktextract.extractor.es.pronunciation import ( @@ -22,7 +23,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_page_data(self) -> list[WordEntry]: - return [WordEntry(word="test", lang_code="es", lang_name="Language")] + return [WordEntry(word="test", lang_code="es", lang="Language")] def test_es_extract_pronunciation(self): # Test cases taken from https://es.wiktionary.org/wiki/Plantilla:pron-graf diff --git a/tests/test_es_translation.py b/tests/test_es_translation.py index caa0fb5c..a902e95d 100644 --- a/tests/test_es_translation.py +++ b/tests/test_es_translation.py @@ -1,6 +1,7 @@ import unittest from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.models import WordEntry from wiktextract.extractor.es.translation import extract_translation @@ -22,7 +23,7 @@ def get_default_page_data(self) -> list[WordEntry]: WordEntry( word="test", lang_code="es", - lang_name="Language", + lang="Language", ) ] diff --git a/tests/test_ru_gloss.py b/tests/test_ru_gloss.py index 7b708371..f7b8f7f6 100644 --- a/tests/test_ru_gloss.py +++ b/tests/test_ru_gloss.py @@ -19,7 +19,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_page_data(self) -> list[WordEntry]: - return [WordEntry(word="пример", lang_code="ru", lang_name="Русский")] + return [WordEntry(word="пример", lang_code="ru", lang="Русский")] def test_ru_extract_gloss(self): # https://ru.wiktionary.org/wiki/овощ diff --git a/tests/test_ru_page.py b/tests/test_ru_page.py index 9d34857c..94df766d 100644 --- a/tests/test_ru_page.py +++ b/tests/test_ru_page.py @@ -20,7 +20,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() # def get_default_page_data(self) -> list[WordEntry]: - # return [WordEntry(word="test", lang_code="es", lang_name="Language")] + # return [WordEntry(word="test", lang_code="es", lang="Language")] def test_ru_parse_page_1(self): # Navigates homonyms/homographs diff --git a/tests/test_ru_pronunciation.py b/tests/test_ru_pronunciation.py index 9a29ff4f..e6422c09 100644 --- a/tests/test_ru_pronunciation.py +++ b/tests/test_ru_pronunciation.py @@ -27,7 +27,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_word_entry(self) -> WordEntry: - return WordEntry(word="тест", lang_code="ru", lang_name="русский") + return WordEntry(word="тест", lang_code="ru", lang="русский") def process_template_and_assert( self, diff --git a/tests/test_ru_translation.py b/tests/test_ru_translation.py index 7fdd5595..acf9fa70 100644 --- a/tests/test_ru_translation.py +++ b/tests/test_ru_translation.py @@ -19,7 +19,7 @@ def tearDown(self) -> None: self.wxr.wtp.close_db_conn() def get_default_word_entry(self) -> WordEntry: - return WordEntry(word="test", lang_code="ru", lang_name="русский") + return WordEntry(word="test", lang_code="ru", lang="русский") def test_ru_extract_gloss(self): # Test cases adapted from: https://ru.wiktionary.org/wiki/дом @@ -41,9 +41,9 @@ def test_ru_extract_gloss(self): { "word": "house", "lang_code": "en", - "lang_name": "английский", + "lang": "английский", }, - {"word": "بيت", "lang_code": "ar", "lang_name": "арабский"}, + {"word": "بيت", "lang_code": "ar", "lang": "арабский"}, ], }, { @@ -53,13 +53,13 @@ def test_ru_extract_gloss(self): { "word": "house", "lang_code": "en", - "lang_name": "английский", + "lang": "английский", "sense": "сооружение", }, { "word": "بيت", "lang_code": "ar", - "lang_name": "арабский", + "lang": "арабский", "sense": "сооружение", }, ], @@ -70,22 +70,22 @@ def test_ru_extract_gloss(self): { "word": "ti", "lang_code": "br", - "lang_name": "бретонский", + "lang": "бретонский", }, { "word": "αὐλή", "lang_code": "grc", - "lang_name": "древнегреческий", + "lang": "древнегреческий", }, { "word": "δόμος", "lang_code": "grc", - "lang_name": "древнегреческий", + "lang": "древнегреческий", }, { "word": "δῶμα", "lang_code": "grc", - "lang_name": "древнегреческий", + "lang": "древнегреческий", }, ], }, From 1f7b3f19b34d19e429d7d960143d8c20dfd21e43 Mon Sep 17 00:00:00 2001 From: Empiriker <till.ueberfries@gmail.com> Date: Fri, 5 Jan 2024 10:48:22 +0100 Subject: [PATCH 3/3] Type senseid as str in Spanish extractor --- src/wiktextract/extractor/es/gloss.py | 3 ++- src/wiktextract/extractor/es/models.py | 2 +- tests/test_es_gloss.py | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py index 47924a61..b8bb567c 100644 --- a/src/wiktextract/extractor/es/gloss.py +++ b/src/wiktextract/extractor/es/gloss.py @@ -2,6 +2,7 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList + from wiktextract.extractor.es.models import Sense, WordEntry from wiktextract.extractor.es.sense_data import process_sense_data_list from wiktextract.page import clean_node @@ -38,7 +39,7 @@ def extract_gloss( match = re.match(r"^(\d+)", gloss_note) if match: - gloss_data.senseid = int(match.group(1)) + gloss_data.senseid = match.group(1) tag_string = gloss_note[len(match.group(1)) :].strip() else: tag_string = gloss_note.strip() diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index 838b913c..77ed1026 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -93,7 +93,7 @@ class Sense(BaseModelWrap): # subsenses: list["Sense"] = Field( # default=[], description="List of subsenses" # ) - senseid: Optional[int] = Field( + senseid: Optional[str] = Field( default=None, description="Sense number used in Wiktionary" ) antonyms: Optional[list[Linkage]] = [] diff --git a/tests/test_es_gloss.py b/tests/test_es_gloss.py index a22409f3..5105821a 100644 --- a/tests/test_es_gloss.py +++ b/tests/test_es_gloss.py @@ -44,13 +44,13 @@ def test_es_extract_glosses(self): "glosses": [ "Contribuir esfuerzo o recursos para la realización de algo." ], - "senseid": 1, + "senseid": "1", }, { "glosses": [ "Por antonomasia, cooperar a que alguno salga de una situación dificultosa" ], - "senseid": 2, + "senseid": "2", }, ], ) @@ -80,7 +80,7 @@ def test_es_extract_gloss_categories(self): "glosses": [ "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa" ], - "senseid": 1, + "senseid": "1", "tags": ["Humanidades"], "categories": ["ES:Sentimientos"], }