Skip to content

Commit

Permalink
In examples, "taxonomic" is the same as "english"
Browse files Browse the repository at this point in the history
Issue #604, Czech translations (continued)

In translations like

```
ví ucho ― Leonotis nepetifolia (literally, “lion's ear”)
```

the translation part starting with "Leonotis" has its
classification returned as "taxonomic" due to the heuristics
used in classify_desc().

I've been trying to kludge something better here, but for
this specifically the right call to make is to change it so
that if a description is either "english" or "taxonomic",
that counts as English. There is no meaningful distinction
here in the examples when trying to figure out translation
stuff.

The heuristics could be better, which is what I tried to
figure out, but it works fine for now...
  • Loading branch information
kristian-clausal committed Jul 11, 2024
1 parent 287646b commit a311fa9
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 44 deletions.
75 changes: 40 additions & 35 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,10 +2399,7 @@ def item_recurse(
and v[0][0] == ":"
):
v = [v[0][1:]] + list(v[1:]) # type:ignore
if (
isinstance(v[0], str)
and not v[0].isalnum()
):
if isinstance(v[0], str) and not v[0].isalnum():
links_that_should_not_be_split.append(
"".join(v[0])
) # type: ignore
Expand Down Expand Up @@ -2918,8 +2915,7 @@ def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
if pos.lower() not in POS_TITLES:
wxr.wtp.debug(
"unhandled see translation subpage: "
"language={} sub={} wxr.wtp.subsection={}"
.format(
"language={} sub={} wxr.wtp.subsection={}".format(
language, sub, wxr.wtp.subsection
),
sortid="page/2478",
Expand Down Expand Up @@ -3632,8 +3628,9 @@ def usex_template_fn(
# and the output to each other.
lines = [parts[0].strip()]
tr = parts[1].strip()
elif (
len(parts) == 2 and classify_desc(parts[1]) == "english"
elif len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
# These other branches just do some simple heuristics w/
# the expanded output of the template (if applicable).
Expand All @@ -3643,16 +3640,16 @@ def usex_template_fn(
len(parts) == 3
and classify_desc(parts[1])
in ("romanization", "english")
and classify_desc(parts[2]) == "english"
and classify_desc(parts[2]) in ("english", "taxonomic")
):
lines = [parts[0].strip()]
roman = parts[1].strip()
tr = parts[2].strip()
else:
parts = re.split(r"\s+-\s+", lines[0])
if (
len(parts) == 2
and classify_desc(parts[1]) == "english"
if len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
lines = [parts[0].strip()]
tr = parts[1].strip()
Expand All @@ -3675,12 +3672,13 @@ def usex_template_fn(
if (
lang_code != "en"
and len(lines) >= 2
and classify_desc(lines[-1]) == "english"
and classify_desc(lines[-1])
in ("english", "taxonomic")
):
i = len(lines) - 1
while (
i > 1
and classify_desc(lines[i - 1]) == "english"
while i > 1 and classify_desc(lines[i - 1]) in (
"english",
"taxonomic",
):
i -= 1
tr = "\n".join(lines[i:])
Expand All @@ -3696,22 +3694,27 @@ def usex_template_fn(
elif lang_code != "en" and len(lines) == 2:
cls1 = classify_desc(lines[0])
cls2 = classify_desc(lines[1])
if cls2 == "english" and cls1 != "english":
if (
cls2 in ("english", "taxonomic")
and cls1 != "english"
):
tr = lines[1]
lines = [lines[0]]
elif cls1 == "english" and cls2 != "english":
tr = lines[0]
lines = [lines[1]]
elif (
re.match(r"^[#*]*:+", lines[1])
and classify_desc(
re.sub(r"^[#*:]+\s*", "", lines[1])
)
== "english"
cls1 in ("english", "taxonomic")
and cls2 != "english"
):
tr = lines[0]
lines = [lines[1]]
elif re.match(r"^[#*]*:+", lines[1]) and classify_desc(
re.sub(r"^[#*:]+\s*", "", lines[1])
) in ("english", "taxonomic"):
tr = re.sub(r"^[#*:]+\s*", "", lines[1])
lines = [lines[0]]
elif cls1 == "english" and cls2 == "english":
elif cls1 == "english" and cls2 in (
"english",
"taxonomic",
):
# Both were classified as English, but
# presumably one is not. Assume first is
# non-English, as that seems more common.
Expand All @@ -3727,7 +3730,7 @@ def usex_template_fn(
cls3 = classify_desc(lines[2])
if (
cls3 == "english"
and cls2 in ["english", "romanization"]
and cls2 in ("english", "romanization")
and cls1 != "english"
):
tr = lines[2].strip()
Expand All @@ -3747,9 +3750,9 @@ def usex_template_fn(
cls1 = classify_desc(lines[-1])
if cls1 == "english":
i = len(lines) - 1
while (
i > 1
and classify_desc(lines[i - 1]) == "english"
while i > 1 and classify_desc(lines[i - 1]) in (
"english",
"taxonomic",
i -= 1
tr = "\n".join(lines[i:])
Expand All @@ -3774,7 +3777,7 @@ def usex_template_fn(
original_lines.append(i)
elif cl == "romanization":
roman += line
elif cl == "english":
elif cl in ("english", "taxonomic"):
tr += line
lines = [lines[i] for i in original_lines]

Expand All @@ -3796,14 +3799,16 @@ def usex_template_fn(
subtext = "\n".join(x for x in lines if x)
if not tr and lang_code != "en":
m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
if m and classify_desc(m.group(2)) == "english":
if m and classify_desc(m.group(2)) in (
"english",
"taxonomic",
):
tr = m.group(2)
subtext = subtext[: m.start()] + m.group(1)
elif lines:
parts = re.split(r"\s*[―—]+\s*", lines[0])
if (
len(parts) == 2
and classify_desc(parts[1]) == "english"
if len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
subtext = parts[0].strip()
tr = parts[1].strip()
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ruby.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def parse_ruby(
def extract_ruby(
wxr: WiktextractContext,
contents: GeneralNode,
) -> tuple[list[tuple[str, ...]], list[Union[WikiNode, str]]]:
) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
# If contents is a list, process each element separately
extracted = []
new_contents = []
Expand Down
23 changes: 15 additions & 8 deletions src/wiktextract/taxondata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
# The original specieswiki dump can be downloaded from
# https://dumps.wikimedia.org.

known_species = set(["Aptenodytes forsteri",
known_species = set(
[
"Aptenodytes forsteri",
"Aptenodytes patagonicus",
"Eudyptes sclateri",
"Eudyptes pachyrhynchus",
Expand Down Expand Up @@ -86840,7 +86842,7 @@
"Chalcodrya hilaris",
"Chalcodrya variegata",
"Philpottia mollis",
"Chalcodryidae \"Cyphaleus\"",
'Chalcodryidae "Cyphaleus"',
"Cyphaleus valdivianus",
"Aralius wollastoni",
"Aralius olivieri",
Expand Down Expand Up @@ -116660,7 +116662,7 @@
"Eolagurus przewalskii",
"Eothenomys cachinus",
"Neostenetroides stocki",
"\"Stenetrium sp.\"",
'"Stenetrium sp."',
"Macrostylis dellacrocei",
"Munneurycope hadalis",
"Haaniella dehaanii",
Expand Down Expand Up @@ -535311,9 +535313,12 @@
"Ctenophysis chilensis",
"Pelecopsis bigibba",
"Pelecopsis brunea",
"Pelecopsis montana"]);

known_firsts = set(["Condylopodium",
"Pelecopsis montana",
]
)
known_firsts = set(
[
"Condylopodium",
"Leptocereus",
"Perenethis",
"Dryocoetini",
Expand Down Expand Up @@ -677454,7 +677459,7 @@
"Tropoleptusa",
"Vas",
"Paradisanthus",
"\"Stenetrium",
'"Stenetrium',
"Memoremea",
"Melanortocarya",
"Greenmaniella",
Expand Down Expand Up @@ -682675,4 +682680,6 @@
"Multitestis",
"Leptolicoa",
"Sergey",
"Heteroscydmus"]);
"Heteroscydmus",
]
)

0 comments on commit a311fa9

Please sign in to comment.