Skip to content

Commit

Permalink
Add hybrid traiter/GPT reconciler for event date
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Dec 1, 2023
1 parent 35d62b8 commit 67616d2
Show file tree
Hide file tree
Showing 12 changed files with 1,347 additions and 21 deletions.
5 changes: 4 additions & 1 deletion tests/dwc/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@

class TestDate(unittest.TestCase):
    """Darwin Core output checks for parsed dates."""

    def test_date_dwc_01(self):
        """A date produces both the ISO event date and the verbatim text."""
        # An older expectation of only dwc:eventDate for this same input
        # contradicted the one below, so only the current expectation is kept.
        self.assertEqual(
            to_dwc(LABEL, "14 JAN. 1987"),
            {"dwc:eventDate": "1987-01-14", "dwc:verbatimEventDate": "14 JAN. 1987"},
        )
13 changes: 13 additions & 0 deletions tests/rules/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,16 @@ def test_date_21(self):
parse("± 4 x 3 mm."),
[],
)

def test_date_22(self):
    """Find a day / roman-numeral month / year date after a leading number."""
    found = parse("20091 19 X 1998")
    expected = [Date(date="1998-10-19", trait="date", start=6, end=15)]
    self.assertEqual(found, expected)
14 changes: 14 additions & 0 deletions tests/rules/test_elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,17 @@ def test_elevation_10(self):
),
],
)

def test_elevation_11(self):
    """A tick mark after the number acts as a unit: 85’ yields 25.908 m."""
    found = parse("Elev 85’.")
    expected = [
        Elevation(
            trait="elevation",
            elevation=25.908,
            units="m",
            start=0,
            end=8,
        ),
    ]
    self.assertEqual(found, expected)
27 changes: 24 additions & 3 deletions traiter/pylib/darwin_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import csv
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

DYN = "dwc:dynamicProperties"
Expand All @@ -10,9 +12,28 @@
SEP = " | "
FIELD_SEP = " ~ "

DUBLIN = """
type modified language license rightsHolder accessRights bibliographicCitation
references location """.split()

def read_dwc_terms():
    """Load the term table from ``rules/terms/dwc_terms.csv`` beside this module.

    Each row is keyed by its ``term_localName`` with a namespace prefix:
    ``DC`` when the row's ``iri`` contains "dublincore", ``DWC`` otherwise.

    Returns:
        A ``(core, dublin)`` pair of dicts mapping prefixed term names to
        their CSV rows. ``core`` receives every row; ``dublin`` receives only
        the rows whose IRI mentions "dublincore".
    """
    core, dublin = {}, {}

    path = Path(__file__).parent / "rules" / "terms" / "dwc_terms.csv"
    # newline="" is what the csv module requires so quoted fields containing
    # line breaks are read intact; the explicit encoding avoids decoding that
    # varies with the platform locale.
    with path.open(encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            name = row["term_localName"]

            if "dublincore" in row["iri"]:
                name = DC + name
                dublin[name] = row
            else:
                name = DWC + name

            # Every term, whichever prefix it received, is recorded in core.
            core[name] = row

    return core, dublin


CORE, DUBLIN = read_dwc_terms()


@dataclass
Expand Down
35 changes: 35 additions & 0 deletions traiter/pylib/hybrid/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Callable


class Template:
    """Collects reconciliation callables.

    Assigning to ``reconcile`` does not replace the stored actions; the
    assigned ones are appended to those already collected.
    """

    def __init__(self):
        self._actions = []

    @property
    def reconcile(self) -> list[Callable]:
        """Return the actions collected so far, in the order they were added."""
        return self._actions

    @reconcile.setter
    def reconcile(self, actions: list[Callable]):
        # Assignment appends rather than overwrites.
        self._actions.extend(actions)


# Module-level Template instance; Base.__init__ adds its actions to this object.
TEMPLATE = Template()


class Base:
    """Shared helpers for reconcilers; creating one hands its actions to TEMPLATE."""

    def __init__(self, *args):
        # Pass the given callables to the module-level TEMPLATE object.
        TEMPLATE.reconcile = args

    @staticmethod
    def search(other, keys: list[str], default=None):
        """Return the first truthy value stored in `other` under any of `keys`.

        Keys are tried in order; missing keys and falsy values are skipped.
        `default` is returned when no key qualifies.
        """
        for name in keys:
            if found := other.get(name):
                return found
        return default

    @staticmethod
    def case(*args) -> list[str]:
        """Return the sorted, de-duplicated names in `args` plus lowercase forms.

        Each argument is split on whitespace, so one string may carry several
        names.
        """
        names = [name for arg in args for name in arg.split()]
        lowered = (name.lower() for name in names)
        return sorted({*names, *lowered})
44 changes: 44 additions & 0 deletions traiter/pylib/hybrid/date_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Any

from .base import Base


class Date(Base):
    """Reconcile the event date found by traiter with the one in another record.

    The other record is searched under several key spellings; the error text
    refers to it as OpenAI output.
    """

    date = "dwc:eventDate"
    verbatim = "dwc:verbatimEventDate"
    # Spellings (as written and lowercased) accepted for the verbatim key.
    match_verb = Base.case(verbatim)
    # Spellings accepted for keys that may carry the event date in `other`.
    match = Base.case(
        date, "dwc:collectionDate dwc:earliestDateCollected dwc:latestDateCollected"
    )

    def __init__(self):
        super().__init__(self.event_date)

    def event_date(
        self, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Merge the event date from `traiter` and `other` into one result.

        Args:
            traiter: Fields produced by traiter, read under ``dwc:eventDate``
                and ``dwc:verbatimEventDate``.
            other: Fields from the other source, searched under every key in
                ``match`` / ``match_verb``.

        Returns:
            An empty dict when neither side has a date; otherwise a dict with
            ``dwc:eventDate`` and, when available, ``dwc:verbatimEventDate``.

        Raises:
            ValueError: traiter has a date and `other` has none, or the two
                dates disagree and cannot be reconciled.
        """
        t_val = traiter.get(self.date)
        t_verb = traiter.get(self.verbatim)
        o_val = self.search(other, self.match)

        if not o_val:
            if t_val:
                raise ValueError(f"MISSING in OpenAI output {self.date} = {t_val}")
            return {}

        # The other value wins when traiter has nothing, agrees with it, or
        # contains it as a substring.
        if not t_val or t_val == o_val or o_val in t_val:
            reconciled = {self.date: o_val}
            if verb := self.search(other, self.match_verb, t_verb):
                reconciled[self.verbatim] = verb
            return reconciled

        # The other source echoed the verbatim text as its date: keep
        # traiter's parsed date together with that verbatim text.
        if o_val == t_verb:
            return {self.date: t_val, self.verbatim: t_verb}

        # Only a genuine disagreement can reach this point, so no further
        # fall-through case is needed.
        raise ValueError(f"MISMATCH {self.date}: {o_val} != {t_val}")
23 changes: 23 additions & 0 deletions traiter/pylib/hybrid/elevation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from . import base


class Elevation(base.Base):
    """Reconciler stub for elevation fields; the action is not implemented yet."""

    # Field names collected for elevation data.
    # NOTE(review): "dwc:maximumElevationInFeet" was listed twice; the repeat
    # has been dropped. It may have been meant as a minimum-in-feet key —
    # confirm.
    keys = [
        "dwc:decimalElevation",
        "dwc:elevation",
        "dwc:elevationAccuracy",
        "dwc:elevationInMeters",
        "dwc:maxElevationInMeters",
        "dwc:maximumElevationInFeet",
        "dwc:maximumElevationInMeters",
        "dwc:minElevationInMeters",
        "dwc:minimumElevationInMeters",
        "dwc:verbatimElevation",
    ]

    def __init__(self):
        super().__init__(self.elevation)

    def elevation(self):
        # Placeholder body.
        # NOTE(review): this is registered as a reconcile action but accepts
        # no data arguments — confirm how actions are invoked before
        # implementing it.
        ...
9 changes: 4 additions & 5 deletions traiter/pylib/pattern_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""
import copy
import re
from typing import Optional, Union
from warnings import warn


Expand All @@ -19,11 +18,11 @@ def keep(self) -> list[str]:
return list(self._keep.keys())

@keep.setter
def keep(self, labels: str | list[str]):
    """Add one label, or a list of labels, to the kept labels.

    Assignment adds to the labels already present rather than replacing them.
    """
    # A lone label is wrapped so both forms are handled the same way.
    labels = labels if isinstance(labels, list) else [labels]
    self._keep |= {lb: 1 for lb in labels}

def delete(self, labels: Union[str, list[str]]):
def delete(self, labels: str | list[str]):
labels = labels if isinstance(labels, list) else [labels]
self._keep = {lb: 1 for lb in self._keep if lb not in labels}

Expand All @@ -38,9 +37,9 @@ def __init__(
label: str,
patterns: list[str],
decoder: dict[str, dict],
on_match: Union[str, None] = None,
on_match: str | None = None,
id: str = "",
keep: Optional[Union[str, list[str]]] = None, # Traits we want to keep
keep: str | list[str] | None = None, # Traits we want to keep
):
self.label = label
self.raw_patterns = patterns
Expand Down
23 changes: 13 additions & 10 deletions traiter/pylib/rules/date_.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ class Date(Base):
missing_day: bool = None

def to_dwc(self, dwc) -> DarwinCore:
    """Add this date to a Darwin Core record and return the result of ``dwc.add``.

    ``self.date`` is written as ``eventDate`` and ``self._text`` as
    ``verbatimEventDate``.
    """
    # An earlier return of only eventDate made the verbatim field unreachable;
    # both fields are now emitted in a single call.
    return dwc.add(
        eventDate=self.date,
        verbatimEventDate=self._text,
    )

@property
def key(self):
Expand All @@ -49,13 +52,13 @@ def pipe(cls, nlp: Language):
@classmethod
def date_patterns(cls):
decoder = {
"-": {"TEXT": {"REGEX": rf"^[{cls.sep}]\Z"}},
"/": {"TEXT": {"REGEX": r"^/\Z"}},
"99": {"TEXT": {"REGEX": r"^\d\d?\Z"}},
"99-99": {"TEXT": {"REGEX": rf"^\d\d?[{cls.sep}]+\d\d\Z"}},
"99-9999": {"TEXT": {"REGEX": rf"^\d\d?[{cls.sep}]+[12]\d\d\d\Z"}},
"9999": {"TEXT": {"REGEX": r"^[12]\d{3}\Z"}},
":": {"TEXT": {"REGEX": r"^[:=]+\Z"}},
"-": {"TEXT": {"REGEX": rf"^[{cls.sep}]$"}},
"/": {"TEXT": {"REGEX": r"^/$"}},
"99": {"TEXT": {"REGEX": r"^\d\d?$"}},
"99-99": {"TEXT": {"REGEX": rf"^\d\d?[{cls.sep}]+\d\d$"}},
"99-9999": {"TEXT": {"REGEX": rf"^\d\d?[{cls.sep}]+[12]\d\d\d$"}},
"9999": {"TEXT": {"REGEX": r"^[12]\d\d\d$"}},
":": {"TEXT": {"REGEX": r"^[:=]+$"}},
"label": {"ENT_TYPE": "date_label"},
"month": {"ENT_TYPE": "month"},
"roman": {"ENT_TYPE": "roman"},
Expand All @@ -71,8 +74,8 @@ def date_patterns(cls):
"label? :? 99 -* month -* 9999",
"label? :? 9999 -* month -* 99",
"label :? 99 -* roman -* 99",
"label :? 99 -* roman -* 9999",
"label :? 9999 -* roman -* 99",
"label? :? 99 -* roman -* 9999",
"label? :? 9999 -* roman -* 99",
"label? :? 99 - 99 - 99",
"label? :? 99 - 99 - 9999",
"label? :? month+ -* 99 -* 9999",
Expand Down
5 changes: 3 additions & 2 deletions traiter/pylib/rules/elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ class Elevation(Base):
)
unit_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_length_terms.csv"
about_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "about_terms.csv"
all_csvs: ClassVar[list[Path]] = [elevation_csv, unit_csv, about_csv]
tic_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_tic_terms.csv"
all_csvs: ClassVar[list[Path]] = [elevation_csv, unit_csv, about_csv, tic_csv]

replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
factors_cm: ClassVar[dict[str, float]] = term_util.term_data(
unit_csv, "factor_cm", float
(unit_csv, tic_csv), "factor_cm", float
)
factors_m: ClassVar[dict[str, float]] = {
k: v / 100.0 for k, v in factors_cm.items()
Expand Down
Loading

0 comments on commit 67616d2

Please sign in to comment.