Merge pull request #452 from empiriker/master
Align schemas of Spanish, Russian and German extractors
kristian-clausal authored Jan 5, 2024
2 parents 90f9f71 + 1f7b3f1 commit 1f6f32a
Showing 26 changed files with 139 additions and 117 deletions.
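
To make the schema alignment concrete, here is a hypothetical before/after for one serialized German entry (field values invented for illustration, written as plain Python dicts): linkage lists change from bare strings to objects with a "word" key, and lang_name is renamed to lang across the German, Spanish and Russian extractors.

# Hypothetical output before this commit (values invented):
{
    "word": "Beispiel",
    "lang_code": "de",
    "lang_name": "Deutsch",
    "senses": [{"senseid": "1", "synonyms": ["Exempel"]}],
}

# After: "lang_name" -> "lang", linkage strings -> {"word": ...} objects:
{
    "word": "Beispiel",
    "lang_code": "de",
    "lang": "Deutsch",
    "senses": [{"senseid": "1", "synonyms": [{"word": "Exempel"}]}],
}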
9 changes: 5 additions & 4 deletions src/wiktextract/extractor/de/linkage.py
@@ -2,7 +2,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import WordEntry

from wiktextract.extractor.de.models import Linkage, WordEntry
from wiktextract.extractor.share import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
@@ -25,7 +26,7 @@ def extract_linkages(
)

# Extract links
linkages: list[str] = []
linkages: list[Linkage] = []
if linkage_type == "expressions":
for child in list_item.children:
if isinstance(child, str) and contains_dash(child):
@@ -90,12 +91,12 @@ def extract_linkages(


def process_link(
wxr: WiktextractContext, semantic_links: list[str], link: WikiNode
wxr: WiktextractContext, semantic_links: list[Linkage], link: WikiNode
):
clean_link = clean_node(wxr, {}, link)
if clean_link.startswith("Verzeichnis:"):
return
semantic_links.append(clean_link)
semantic_links.append(Linkage(word=clean_link))


def contains_dash(text: str):
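
A minimal standalone sketch of the linkage handling shown above, assuming only pydantic; collect_links is a hypothetical helper written for illustration, not the repository's API. As in process_link, links to "Verzeichnis:" index pages are skipped and everything else is wrapped in a Linkage object instead of being appended as a bare string.

from pydantic import BaseModel


class Linkage(BaseModel):
    word: str


def collect_links(clean_links: list[str]) -> list[Linkage]:
    # Skip "Verzeichnis:" (index) pages, wrap the remaining link texts.
    linkages: list[Linkage] = []
    for text in clean_links:
        if text.startswith("Verzeichnis:"):
            continue
        linkages.append(Linkage(word=text))
    return linkages


print(collect_links(["Beleg", "Verzeichnis:Deutsch", "Exempel"]))
# [Linkage(word='Beleg'), Linkage(word='Exempel')]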
48 changes: 25 additions & 23 deletions src/wiktextract/extractor/de/models.py
@@ -7,6 +7,10 @@ class BaseModelWrap(BaseModel):
model_config = ConfigDict(validate_assignment=True, extra="forbid")


class Linkage(BaseModelWrap):
word: str


class Translation(BaseModelWrap):
sense: Optional[str] = Field(
default=None, description="A gloss of the sense being translated"
@@ -16,7 +20,7 @@ class Translation(BaseModelWrap):
default=None,
description="Wiktionary language code of the translation term",
)
lang_name: Optional[str] = Field(
lang: Optional[str] = Field(
default=None, description="Localized language name"
)
uncertain: Optional[bool] = Field(
@@ -120,15 +124,15 @@ class Sense(BaseModelWrap):
default=None, description="Sense number used in Wiktionary"
)
translations: Optional[list[Translation]] = []
antonyms: Optional[list[str]] = []
derived: Optional[list[str]] = []
hyponyms: Optional[list[str]] = []
hypernyms: Optional[list[str]] = []
holonyms: Optional[list[str]] = []
expressions: Optional[list[str]] = []
coordinate_terms: Optional[list[str]] = []
proverbs: Optional[list[str]] = []
synonyms: Optional[list[str]] = []
antonyms: Optional[list[Linkage]] = []
derived: Optional[list[Linkage]] = []
hyponyms: Optional[list[Linkage]] = []
hypernyms: Optional[list[Linkage]] = []
holonyms: Optional[list[Linkage]] = []
expressions: Optional[list[Linkage]] = []
coordinate_terms: Optional[list[Linkage]] = []
proverbs: Optional[list[Linkage]] = []
synonyms: Optional[list[Linkage]] = []


class Sound(BaseModelWrap):
@@ -147,9 +151,7 @@ class Sound(BaseModelWrap):
lang_code: list[str] = Field(
default=[], description="Wiktionary language code"
)
lang_name: list[str] = Field(
default=[], description="Localized language name"
)
lang: list[str] = Field(default=[], description="Localized language name")
# roman: list[str] = Field(
# default=[], description="Translitaration to Roman characters"
# )
@@ -175,7 +177,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
@@ -185,12 +187,12 @@ class WordEntry(BaseModelWrap):
# )
translations: Optional[list[Translation]] = []
sounds: Optional[list[Sound]] = []
antonyms: Optional[list[str]] = []
derived: Optional[list[str]] = []
hyponyms: Optional[list[str]] = []
hypernyms: Optional[list[str]] = []
holonyms: Optional[list[str]] = []
expressions: Optional[list[str]] = []
coordinate_terms: Optional[list[str]] = []
proverbs: Optional[list[str]] = []
synonyms: Optional[list[str]] = []
antonyms: Optional[list[Linkage]] = []
derived: Optional[list[Linkage]] = []
hyponyms: Optional[list[Linkage]] = []
hypernyms: Optional[list[Linkage]] = []
holonyms: Optional[list[Linkage]] = []
expressions: Optional[list[Linkage]] = []
coordinate_terms: Optional[list[Linkage]] = []
proverbs: Optional[list[Linkage]] = []
synonyms: Optional[list[Linkage]] = []
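
A short usage sketch of the aligned German models, assuming wiktextract is importable and the classes match the diff above; the dumped dict is illustrative (key order follows field declaration order).

from wiktextract.extractor.de.models import Linkage, Sense, WordEntry

entry = WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")
entry.senses = [Sense(senseid="1", synonyms=[Linkage(word="Exempel")])]

print(entry.model_dump(exclude_defaults=True))
# e.g. {'word': 'Beispiel', 'lang_code': 'de', 'lang': 'Deutsch',
#       'senses': [{'senseid': '1', 'synonyms': [{'word': 'Exempel'}]}]}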
12 changes: 6 additions & 6 deletions src/wiktextract/extractor/de/page.py
@@ -272,15 +272,15 @@ def parse_page(
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# The language sections are marked with
# == <title> ({{Sprache|<lang_name>}}) ==
# where <title> is the title of the page and <lang_name> is the
# == <title> ({{Sprache|<lang>}}) ==
# where <title> is the title of the page and <lang> is the
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang_name = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang_name, "de")
lang = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang_name}",
f"Unknown language: {lang}",
sortid="extractor/de/page/parse_page/76",
)
if (
@@ -290,7 +290,7 @@
continue

base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
parse_section(wxr, page_data, base_data, level2_node.children)

4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/pronunciation.py
@@ -99,14 +99,14 @@ def process_lautschrift_template(

lang_code = template_parameters.get("spr")
if lang_code:
lang_name = code_to_name(lang_code, "de")
lang = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
{
"ipa": [ipa],
"lang_code": lang_code,
"lang_name": lang_name,
"lang": lang,
},
)
else:
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/de/translation.py
@@ -103,10 +103,10 @@ def process_translation_list(

lang_code = node.template_parameters.get(1)
translation_data.lang_code = lang_code
translation_data.lang_name = code_to_name(lang_code, "de")
if translation_data.lang_name == "":
translation_data.lang = code_to_name(lang_code, "de")
if translation_data.lang == "":
wxr.wtp.debug(
f"Unknown language code: {translation_data.lang_name}",
f"Unknown language code: {translation_data.lang}",
sortid="extractor/de/translation/process_translation_list/70",
)
if node.template_name[-1] == "?":
3 changes: 2 additions & 1 deletion src/wiktextract/extractor/es/gloss.py
@@ -2,6 +2,7 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.extractor.es.sense_data import process_sense_data_list
from wiktextract.page import clean_node
@@ -38,7 +39,7 @@ def extract_gloss(
match = re.match(r"^(\d+)", gloss_note)

if match:
gloss_data.senseid = int(match.group(1))
gloss_data.senseid = match.group(1)
tag_string = gloss_note[len(match.group(1)) :].strip()
else:
tag_string = gloss_note.strip()
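
The Spanish senseid is now kept as a string instead of being cast to int, which matches the es/models.py change below and the string sense ids the German extractor already uses. A minimal sketch of the parsing step, with the gloss note value invented for illustration:

import re

gloss_note = "1 Coloquial"  # invented example of a gloss note
match = re.match(r"^(\d+)", gloss_note)
if match:
    senseid = match.group(1)  # kept as the string "1"; no int() cast
    tag_string = gloss_note[len(match.group(1)):].strip()
else:
    tag_string = gloss_note.strip()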
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/es/models.py
@@ -93,7 +93,7 @@ class Sense(BaseModelWrap):
# subsenses: list["Sense"] = Field(
# default=[], description="List of subsenses"
# )
senseid: Optional[int] = Field(
senseid: Optional[str] = Field(
default=None, description="Sense number used in Wiktionary"
)
antonyms: Optional[list[Linkage]] = []
@@ -156,7 +156,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
7 changes: 4 additions & 3 deletions src/wiktextract/extractor/es/page.py
@@ -4,6 +4,7 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.etymology import process_etymology_block
from wiktextract.extractor.es.example import extract_example
from wiktextract.extractor.es.gloss import extract_gloss
@@ -368,10 +369,10 @@ def parse_page(
):
continue

lang_name = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang_name)
lang = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang)
base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
base_data.categories.extend(categories["categories"])
parse_entries(wxr, page_data, base_data, level2_node)
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/ru/models.py
@@ -12,7 +12,7 @@ class Translation(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code of the translation term"
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the translation term"
)
sense: Optional[str] = Field(
@@ -112,7 +112,7 @@ class WordEntry(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code", examples=["ru"]
)
lang_name: str = Field(
lang: str = Field(
description="Localized language name of the word", examples=["Русский"]
)
categories: list[str] = Field(
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/ru/page.py
@@ -202,11 +202,11 @@ def parse_page(

categories = {"categories": []}

lang_name = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang_name)
lang = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang)

base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
lang=lang, lang_code=lang_code, word=wxr.wtp.title
)
base_data.categories.extend(categories["categories"])

4 changes: 2 additions & 2 deletions src/wiktextract/extractor/ru/translation.py
@@ -20,7 +20,7 @@ def extract_translations(
for key, raw_value in template_node.template_parameters.items():
if isinstance(key, str):
lang_code = key
lang_name = code_to_name(lang_code, "ru")
lang = code_to_name(lang_code, "ru")

for value_node in (
raw_value
@@ -36,7 +36,7 @@
word_entry.translations.append(
Translation(
lang_code=lang_code,
lang_name=lang_name,
lang=lang,
word=word,
sense=sense if sense else None,
)
2 changes: 1 addition & 1 deletion tests/test_de_example.py
@@ -20,7 +20,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_page_data(self) -> list[WordEntry]:
return [WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")]
return [WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")]

def test_de_extract_examples(self):
self.wxr.wtp.start_page("")
2 changes: 1 addition & 1 deletion tests/test_de_gloss.py
@@ -28,7 +28,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_word_entry(self):
return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel")
return WordEntry(lang_code="de", lang="Deutsch", word="Beispiel")

def test_de_extract_glosses(self):
self.wxr.wtp.start_page("")
26 changes: 18 additions & 8 deletions tests/test_de_linkages.py
@@ -20,7 +20,7 @@ def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def get_default_word_entry(self) -> WordEntry:
return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")
return WordEntry(word="Beispiel", lang_code="de", lang="Deutsch")

def test_de_extract_linkages(self):
test_cases = [
@@ -33,11 +33,17 @@ def test_de_extract_linkages(self):
"senses": [
{
"senseid": "1",
"coordinate_terms": ["Beleg", "Exempel"],
"coordinate_terms": [
{"word": "Beleg"},
{"word": "Exempel"},
],
},
{
"senseid": "2",
"coordinate_terms": ["Muster", "Vorbild"],
"coordinate_terms": [
{"word": "Muster"},
{"word": "Vorbild"},
],
},
]
},
@@ -50,7 +56,9 @@
"expected": {
"senses": [
{
"expressions": ["ein gutes Beispiel geben"],
"expressions": [
{"word": "ein gutes Beispiel geben"}
],
}
]
},
@@ -60,7 +68,9 @@
"input": "====Synonyme====\n:[[Synonym1]]",
"senses": [Sense(senseid="1")],
"expected": {
"senses": [{"senseid": "1", "synonyms": ["Synonym1"]}],
"senses": [
{"senseid": "1", "synonyms": [{"word": "Synonym1"}]}
],
},
},
# https://de.wiktionary.org/wiki/Kokospalme
@@ -73,8 +83,8 @@
{
"senseid": "1",
"synonyms": [
"Kokosnusspalme",
"Cocos nucifera",
{"word": "Kokosnusspalme"},
{"word": "Cocos nucifera"},
],
}
],
@@ -95,7 +105,7 @@
self.assertEqual(
word_entry.model_dump(
exclude_defaults=True,
exclude={"word", "lang_code", "lang_name"},
exclude={"word", "lang_code", "lang"},
),
case["expected"],
)