Skip to content

Commit

Permalink
In examples, "taxonomic" is the same as "english"
Browse files Browse the repository at this point in the history
Issue #604, Czech translations (continued)

In translations like

```
ví ucho ― Leonotis nepetifolia (literally, “lion's ear”)
```

the translation part starting with "Leonotis" has its
classification returned as "taxonomic" due to the heuristics
used in classify_desc().

I've been trying to kludge something better here, but for
this specifically the right call to make is to change it so
that if a description is either "english" or "taxonomic",
that counts as English. There is no meaningful distinction
here in the examples when trying to figure out translation
stuff.

The heuristics could be better, which is what I tried to
figure out, but it works fine for now...
  • Loading branch information
kristian-clausal committed Jul 11, 2024
1 parent 287646b commit a311fa9
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 44 deletions.
75 changes: 40 additions & 35 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,10 +2399,7 @@ def item_recurse(
and v[0][0] == ":"
):
v = [v[0][1:]] + list(v[1:]) # type:ignore
if (
isinstance(v[0], str)
and not v[0].isalnum()
):
if isinstance(v[0], str) and not v[0].isalnum():
links_that_should_not_be_split.append(
"".join(v[0])
) # type: ignore
Expand Down Expand Up @@ -2918,8 +2915,7 @@ def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
if pos.lower() not in POS_TITLES:
wxr.wtp.debug(
"unhandled see translation subpage: "
"language={} sub={} wxr.wtp.subsection={}"
.format(
"language={} sub={} wxr.wtp.subsection={}".format(
language, sub, wxr.wtp.subsection
),
sortid="page/2478",
Expand Down Expand Up @@ -3632,8 +3628,9 @@ def usex_template_fn(
# and the output to each other.
lines = [parts[0].strip()]
tr = parts[1].strip()
elif (
len(parts) == 2 and classify_desc(parts[1]) == "english"
elif len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
# These other branches just do some simple heuristics w/
# the expanded output of the template (if applicable).
Expand All @@ -3643,16 +3640,16 @@ def usex_template_fn(
len(parts) == 3
and classify_desc(parts[1])
in ("romanization", "english")
and classify_desc(parts[2]) == "english"
and classify_desc(parts[2]) in ("english", "taxonomic")
):
lines = [parts[0].strip()]
roman = parts[1].strip()
tr = parts[2].strip()
else:
parts = re.split(r"\s+-\s+", lines[0])
if (
len(parts) == 2
and classify_desc(parts[1]) == "english"
if len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
lines = [parts[0].strip()]
tr = parts[1].strip()
Expand All @@ -3675,12 +3672,13 @@ def usex_template_fn(
if (
lang_code != "en"
and len(lines) >= 2
and classify_desc(lines[-1]) == "english"
and classify_desc(lines[-1])
in ("english", "taxonomic")
):
i = len(lines) - 1
while (
i > 1
and classify_desc(lines[i - 1]) == "english"
while i > 1 and classify_desc(lines[i - 1]) in (
"english",
"taxonomic",
):
i -= 1
tr = "\n".join(lines[i:])
Expand All @@ -3696,22 +3694,27 @@ def usex_template_fn(
elif lang_code != "en" and len(lines) == 2:
cls1 = classify_desc(lines[0])
cls2 = classify_desc(lines[1])
if cls2 == "english" and cls1 != "english":
if (
cls2 in ("english", "taxonomic")
and cls1 != "english"
):
tr = lines[1]
lines = [lines[0]]
elif cls1 == "english" and cls2 != "english":
tr = lines[0]
lines = [lines[1]]
elif (
re.match(r"^[#*]*:+", lines[1])
and classify_desc(
re.sub(r"^[#*:]+\s*", "", lines[1])
)
== "english"
cls1 in ("english", "taxonomic")
and cls2 != "english"
):
tr = lines[0]
lines = [lines[1]]
elif re.match(r"^[#*]*:+", lines[1]) and classify_desc(
re.sub(r"^[#*:]+\s*", "", lines[1])
) in ("english", "taxonomic"):
tr = re.sub(r"^[#*:]+\s*", "", lines[1])
lines = [lines[0]]
elif cls1 == "english" and cls2 == "english":
elif cls1 == "english" and cls2 in (
"english",
"taxonomic",
):
# Both were classified as English, but
# presumably one is not. Assume first is
# non-English, as that seems more common.
Expand All @@ -3727,7 +3730,7 @@ def usex_template_fn(
cls3 = classify_desc(lines[2])
if (
cls3 == "english"
and cls2 in ["english", "romanization"]
and cls2 in ("english", "romanization")
and cls1 != "english"
):
tr = lines[2].strip()
Expand All @@ -3747,9 +3750,9 @@ def usex_template_fn(
cls1 = classify_desc(lines[-1])
if cls1 == "english":
i = len(lines) - 1
while (
i > 1
and classify_desc(lines[i - 1]) == "english"
while i > 1 and classify_desc(lines[i - 1]) in (
"english",
"taxonomic",
i -= 1
tr = "\n".join(lines[i:])
Expand All @@ -3774,7 +3777,7 @@ def usex_template_fn(
original_lines.append(i)
elif cl == "romanization":
roman += line
elif cl == "english":
elif cl in ("english", "taxonomic"):
tr += line
lines = [lines[i] for i in original_lines]

Expand All @@ -3796,14 +3799,16 @@ def usex_template_fn(
subtext = "\n".join(x for x in lines if x)
if not tr and lang_code != "en":
m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
if m and classify_desc(m.group(2)) == "english":
if m and classify_desc(m.group(2)) in (
"english",
"taxonomic",
):
tr = m.group(2)
subtext = subtext[: m.start()] + m.group(1)
elif lines:
parts = re.split(r"\s*[―—]+\s*", lines[0])
if (
len(parts) == 2
and classify_desc(parts[1]) == "english"
if len(parts) == 2 and classify_desc(parts[1]) in (
"english",
"taxonomic",
):
subtext = parts[0].strip()
tr = parts[1].strip()
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ruby.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def parse_ruby(
def extract_ruby(
wxr: WiktextractContext,
contents: GeneralNode,
) -> tuple[list[tuple[str, ...]], list[Union[WikiNode, str]]]:
) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
# If contents is a list, process each element separately
extracted = []
new_contents = []
Expand Down
23 changes: 15 additions & 8 deletions src/wiktextract/taxondata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
# The original specieswiki dump can be downloaded from
# https://dumps.wikimedia.org.

known_species = set(["Aptenodytes forsteri",
known_species = set(
[
"Aptenodytes forsteri",
"Aptenodytes patagonicus",
"Eudyptes sclateri",
"Eudyptes pachyrhynchus",
Expand Down Expand Up @@ -86840,7 +86842,7 @@
"Chalcodrya hilaris",
"Chalcodrya variegata",
"Philpottia mollis",
"Chalcodryidae \"Cyphaleus\"",
'Chalcodryidae "Cyphaleus"',
"Cyphaleus valdivianus",
"Aralius wollastoni",
"Aralius olivieri",
Expand Down Expand Up @@ -116660,7 +116662,7 @@
"Eolagurus przewalskii",
"Eothenomys cachinus",
"Neostenetroides stocki",
"\"Stenetrium sp.\"",
'"Stenetrium sp."',
"Macrostylis dellacrocei",
"Munneurycope hadalis",
"Haaniella dehaanii",
Expand Down Expand Up @@ -535311,9 +535313,12 @@
"Ctenophysis chilensis",
"Pelecopsis bigibba",
"Pelecopsis brunea",
"Pelecopsis montana"]);

known_firsts = set(["Condylopodium",
"Pelecopsis montana",
]
)
known_firsts = set(
[
"Condylopodium",
"Leptocereus",
"Perenethis",
"Dryocoetini",
Expand Down Expand Up @@ -677454,7 +677459,7 @@
"Tropoleptusa",
"Vas",
"Paradisanthus",
"\"Stenetrium",
'"Stenetrium',
"Memoremea",
"Melanortocarya",
"Greenmaniella",
Expand Down Expand Up @@ -682675,4 +682680,6 @@
"Multitestis",
"Leptolicoa",
"Sergey",
"Heteroscydmus"]);
"Heteroscydmus",
]
)

0 comments on commit a311fa9

Please sign in to comment.