Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align schemas of Spanish, Russian and German extractors #452

Merged
merged 3 commits
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions src/wiktextract/extractor/de/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import WordEntry

from wiktextract.extractor.de.models import Linkage, WordEntry
from wiktextract.extractor.share import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -25,7 +26,7 @@ def extract_linkages(
)

# Extract links
linkages: list[str] = []
linkages: list[Linkage] = []
if linkage_type == "expressions":
for child in list_item.children:
if isinstance(child, str) and contains_dash(child):
Expand Down Expand Up @@ -90,12 +91,12 @@ def extract_linkages(


def process_link(
wxr: WiktextractContext, semantic_links: list[str], link: WikiNode
wxr: WiktextractContext, semantic_links: list[Linkage], link: WikiNode
):
clean_link = clean_node(wxr, {}, link)
if clean_link.startswith("Verzeichnis:"):
return
semantic_links.append(clean_link)
semantic_links.append(Linkage(word=clean_link))


def contains_dash(text: str):
Expand Down
48 changes: 25 additions & 23 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ class BaseModelWrap(BaseModel):
model_config = ConfigDict(validate_assignment=True, extra="forbid")


class Linkage(BaseModelWrap):
word: str


class Translation(BaseModelWrap):
sense: Optional[str] = Field(
default=None, description="A gloss of the sense being translated"
Expand All @@ -16,7 +20,7 @@ class Translation(BaseModelWrap):
default=None,
description="Wiktionary language code of the translation term",
)
lang_name: Optional[str] = Field(
lang: Optional[str] = Field(
default=None, description="Localized language name"
)
uncertain: Optional[bool] = Field(
Expand Down Expand Up @@ -120,15 +124,15 @@ class Sense(BaseModelWrap):
default=None, description="Sense number used in Wiktionary"
)
translations: Optional[list[Translation]] = []
antonyms: Optional[list[str]] = []
derived: Optional[list[str]] = []
hyponyms: Optional[list[str]] = []
hypernyms: Optional[list[str]] = []
holonyms: Optional[list[str]] = []
expressions: Optional[list[str]] = []
coordinate_terms: Optional[list[str]] = []
proverbs: Optional[list[str]] = []
synonyms: Optional[list[str]] = []
antonyms: Optional[list[Linkage]] = []
derived: Optional[list[Linkage]] = []
hyponyms: Optional[list[Linkage]] = []
hypernyms: Optional[list[Linkage]] = []
holonyms: Optional[list[Linkage]] = []
expressions: Optional[list[Linkage]] = []
coordinate_terms: Optional[list[Linkage]] = []
proverbs: Optional[list[Linkage]] = []
synonyms: Optional[list[Linkage]] = []


class Sound(BaseModelWrap):
Expand All @@ -147,9 +151,7 @@ class Sound(BaseModelWrap):
lang_code: list[str] = Field(
default=[], description="Wiktionary language code"
)
lang_name: list[str] = Field(
default=[], description="Localized language name"
)
lang: list[str] = Field(default=[], description="Localized language name")
# roman: list[str] = Field(
# default=[], description="Transliteration to Roman characters"
# )
Expand All @@ -175,7 +177,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
Expand All @@ -185,12 +187,12 @@ class WordEntry(BaseModelWrap):
# )
translations: Optional[list[Translation]] = []
sounds: Optional[list[Sound]] = []
antonyms: Optional[list[str]] = []
derived: Optional[list[str]] = []
hyponyms: Optional[list[str]] = []
hypernyms: Optional[list[str]] = []
holonyms: Optional[list[str]] = []
expressions: Optional[list[str]] = []
coordinate_terms: Optional[list[str]] = []
proverbs: Optional[list[str]] = []
synonyms: Optional[list[str]] = []
antonyms: Optional[list[Linkage]] = []
derived: Optional[list[Linkage]] = []
hyponyms: Optional[list[Linkage]] = []
hypernyms: Optional[list[Linkage]] = []
holonyms: Optional[list[Linkage]] = []
expressions: Optional[list[Linkage]] = []
coordinate_terms: Optional[list[Linkage]] = []
proverbs: Optional[list[Linkage]] = []
synonyms: Optional[list[Linkage]] = []
12 changes: 6 additions & 6 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,15 +272,15 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# The language sections are marked with
# == <title> ({{Sprache|<lang_name>}}) ==
# where <title> is the title of the page and <lang_name> is the
# == <title> ({{Sprache|<lang>}}) ==
# where <title> is the title of the page and <lang> is the
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang_name = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang_name, "de")
lang = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang_name}",
f"Unknown language: {lang}",
sortid="extractor/de/page/parse_page/76",
)
if (
Expand All @@ -290,7 +290,7 @@ def parse_page(
continue

base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
parse_section(wxr, page_data, base_data, level2_node.children)

Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,14 @@ def process_lautschrift_template(

lang_code = template_parameters.get("spr")
if lang_code:
lang_name = code_to_name(lang_code, "de")
lang = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
{
"ipa": [ipa],
"lang_code": lang_code,
"lang_name": lang_name,
"lang": lang,
},
)
else:
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ def process_translation_list(

lang_code = node.template_parameters.get(1)
translation_data.lang_code = lang_code
translation_data.lang_name = code_to_name(lang_code, "de")
if translation_data.lang_name == "":
translation_data.lang = code_to_name(lang_code, "de")
if translation_data.lang == "":
wxr.wtp.debug(
f"Unknown language code: {translation_data.lang_name}",
f"Unknown language code: {translation_data.lang}",
sortid="extractor/de/translation/process_translation_list/70",
)
if node.template_name[-1] == "?":
Expand Down
3 changes: 2 additions & 1 deletion src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.extractor.es.sense_data import process_sense_data_list
from wiktextract.page import clean_node
Expand Down Expand Up @@ -38,7 +39,7 @@ def extract_gloss(
match = re.match(r"^(\d+)", gloss_note)

if match:
gloss_data.senseid = int(match.group(1))
gloss_data.senseid = match.group(1)
tag_string = gloss_note[len(match.group(1)) :].strip()
else:
tag_string = gloss_note.strip()
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class Sense(BaseModelWrap):
# subsenses: list["Sense"] = Field(
# default=[], description="List of subsenses"
# )
senseid: Optional[int] = Field(
senseid: Optional[str] = Field(
default=None, description="Sense number used in Wiktionary"
)
antonyms: Optional[list[Linkage]] = []
Expand Down Expand Up @@ -156,7 +156,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
Expand Down
7 changes: 4 additions & 3 deletions src/wiktextract/extractor/es/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.etymology import process_etymology_block
from wiktextract.extractor.es.example import extract_example
from wiktextract.extractor.es.gloss import extract_gloss
Expand Down Expand Up @@ -368,10 +369,10 @@ def parse_page(
):
continue

lang_name = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang_name)
lang = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang)
base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
base_data.categories.extend(categories["categories"])
parse_entries(wxr, page_data, base_data, level2_node)
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Translation(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code of the translation term"
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the translation term"
)
sense: Optional[str] = Field(
Expand Down Expand Up @@ -112,7 +112,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["ru"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["Русский"]
)
categories: list[str] = Field(
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/ru/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,11 @@ def parse_page(

categories = {"categories": []}

lang_name = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang_name)
lang = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang)

base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
base_data.categories.extend(categories["categories"])

Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/ru/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def extract_translations(
for key, raw_value in template_node.template_parameters.items():
if isinstance(key, str):
lang_code = key
lang_name = code_to_name(lang_code, "ru")
lang = code_to_name(lang_code, "ru")

for value_node in (
raw_value
Expand All @@ -36,7 +36,7 @@ def extract_translations(
word_entry.translations.append(
Translation(
lang_code=lang_code,
lang_name=lang_name,
lang=lang,
word=word,
sense=sense if sense else None,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_page_data(self) -> list[WordEntry]:
return [WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")]
return [WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")]

def test_de_extract_examples(self):
self.wxr.wtp.start_page("")
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_word_entry(self):
return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel")
return WordEntry(lang_code="de", lang="Deutsch", word="Beispiel")

def test_de_extract_glosses(self):
self.wxr.wtp.start_page("")
Expand Down
26 changes: 18 additions & 8 deletions tests/test_de_linkages.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_word_entry(self) -> WordEntry:
return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")
return WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")

def test_de_extract_linkages(self):
test_cases = [
Expand All @@ -33,11 +33,17 @@ def test_de_extract_linkages(self):
"senses": [
{
"senseid": "1",
"coordinate_terms": ["Beleg", "Exempel"],
"coordinate_terms": [
{"word": "Beleg"},
{"word": "Exempel"},
],
},
{
"senseid": "2",
"coordinate_terms": ["Muster", "Vorbild"],
"coordinate_terms": [
{"word": "Muster"},
{"word": "Vorbild"},
],
},
]
},
Expand All @@ -50,7 +56,9 @@ def test_de_extract_linkages(self):
"expected": {
"senses": [
{
"expressions": ["ein gutes Beispiel geben"],
"expressions": [
{"word": "ein gutes Beispiel geben"}
],
}
]
},
Expand All @@ -60,7 +68,9 @@ def test_de_extract_linkages(self):
"input": "====Synonyme====\n:[[Synonym1]]",
"senses": [Sense(senseid="1")],
"expected": {
"senses": [{"senseid": "1", "synonyms": ["Synonym1"]}],
"senses": [
{"senseid": "1", "synonyms": [{"word": "Synonym1"}]}
],
},
},
# https://de.wiktionary.org/wiki/Kokospalme
Expand All @@ -73,8 +83,8 @@ def test_de_extract_linkages(self):
{
"senseid": "1",
"synonyms": [
"Kokosnusspalme",
"Cocos nucifera",
{"word": "Kokosnusspalme"},
{"word": "Cocos nucifera"},
],
}
],
Expand All @@ -95,7 +105,7 @@ def test_de_extract_linkages(self):
self.assertEqual(
word_entry.model_dump(
exclude_defaults=True,
exclude={"word", "lang_code", "lang_name"},
exclude={"word", "lang_code", "lang"},
),
case["expected"],
)
Loading