Skip to content

Commit

Permalink
Add traiter/ChatGPT reconcilers
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Dec 11, 2023
1 parent eb90e1c commit 9f33ad0
Show file tree
Hide file tree
Showing 16 changed files with 318 additions and 217 deletions.
1 change: 1 addition & 0 deletions traiter/pylib/darwin_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def read_dwc_terms():
with open(path) as f:
for row in csv.DictReader(f):
name = row["term_localName"]
name = name[0].lower() + name[1:]

if row["iri"].find("dublincore") > -1:
name = DC + name
Expand Down
56 changes: 36 additions & 20 deletions traiter/pylib/reconcilers/base.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,54 @@
from typing import Callable
from pathlib import Path
from typing import Any, Callable

from traiter.pylib import term_util

from ..rules import terms


class Template:
    """Ordered collection of ``reconcile`` callables.

    Each object handed to the constructor or to :meth:`append` contributes
    its ``reconcile`` attribute; the objects themselves are not stored.
    """

    def __init__(self, *actions):
        """Collect the ``reconcile`` attribute of every object in ``actions``."""
        self._actions = []
        self._actions.extend(action.reconcile for action in actions)

    @property
    def actions(self) -> list[Callable]:
        """The collected callables, in the order they were added."""
        return self._actions

    def append(self, action):
        """Add ``action.reconcile`` to the end of the collection."""
        self._actions.append(action.reconcile)


class Base:
def __init__(self, *args):
TEMPLATE.reconcile = args

@staticmethod
def search(other, keys: list[str], default=None):
for key in keys:
if other.get(key):
return other[key]
# Placeholder words/phrases (case-folded, space separated) that are treated
# as "no value" when they show up where real data is expected.
nil = "null none not provided not specified".casefold()

# Term CSV files that live next to the ``terms`` rules package.
unit_csv = Path(terms.__file__).parent / "unit_length_terms.csv"
tic_csv = Path(terms.__file__).parent / "unit_tic_terms.csv"

# Values of the "factor_cm" column read as floats from both CSVs.
# NOTE(review): presumably keyed by unit term -- confirm against
# term_util.term_data.
factors_cm = term_util.term_data((unit_csv, tic_csv), "factor_cm", float)

# The same factors divided by 100 (centimeters -> meters).
factors_m = {k: v / 100.0 for k, v in factors_cm.items()}

@classmethod
def reconcile(
    cls, traiter: dict[str, Any], other: dict[str, Any]
) -> dict[str, str]:
    """Combine the two field dictionaries into reconciled output fields.

    This base implementation is a placeholder that always raises;
    subclasses are expected to provide the real logic.

    NOTE(review): presumably ``traiter`` holds fields parsed by Traiter and
    ``other`` holds fields from a second source -- confirm with callers.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

@classmethod
def search(cls, other: dict[str, Any], aliases: list[str], default: Any = ""):
    """Return the first truthy value stored in ``other`` under an alias.

    Aliases are tried in order and falsy values are skipped. If the first
    truthy value is a string made up solely of the placeholder words in
    ``cls.nil`` (e.g. "None", "not provided"; compared case-insensitively),
    ``default`` is returned and no further aliases are tried.

    Args:
        other: Mapping to look the aliases up in.
        aliases: Candidate keys, in priority order.
        default: Returned when nothing usable is found.

    Returns:
        The matching value, or ``default``.
    """
    # Pad with spaces so only whole words/phrases of cls.nil can match. A
    # plain substring test would also throw away genuine values such as
    # "one", "no" or "e" because they occur inside "none".
    nil_phrases = f" {cls.nil} "
    for alias in aliases:
        if value := other.get(alias):
            if isinstance(value, str):
                # Collapse runs of whitespace so "Not   Provided" still matches.
                folded = " ".join(value.casefold().split())
                if f" {folded} " in nil_phrases:
                    return default
            return value
    return default

@classmethod
def wildcard(cls, other, pattern: str, default=""):
    """Return the value of the first key in ``other`` that contains ``pattern``.

    Keys and pattern are compared case-insensitively, in the iteration
    order of ``other``. ``default`` is returned when no key matches.
    """
    needle = pattern.casefold()
    for key in other:
        lowered = key.casefold()
        # NOTE(review): this tests the *key* (not its value) as a substring
        # of cls.nil and stops the whole scan on a hit -- confirm that this
        # is intended rather than a check of other[key].
        if lowered in cls.nil:
            return default
        if needle in lowered:
            return other[key]
    return default
Expand Down
16 changes: 16 additions & 0 deletions traiter/pylib/reconcilers/coordinate_precision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Any

from .base import Base


class CoordinatePrecision(Base):
    """Reconciler for the ``dwc:coordinatePrecision`` field."""

    # Output key, and the alias keys derived from it by Base.case.
    label = "dwc:coordinatePrecision"
    aliases = Base.case(label)

    @classmethod
    def reconcile(
        cls, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Return ``{label: value}`` when ``other`` has a value, else ``{}``.

        Only ``other`` is consulted; ``traiter`` is not read.
        """
        found = cls.search(other, cls.aliases)
        return {cls.label: found} if found else {}
23 changes: 23 additions & 0 deletions traiter/pylib/reconcilers/coordinate_uncertainty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Any

from .. import darwin_core as dwc
from .base import Base


class CoordinateUncertainty(Base):
    """Reconciler for the ``dwc:coordinateUncertaintyInMeters`` field."""

    # Output key, and the alias keys derived from it by Base.case.
    label = "dwc:coordinateUncertaintyInMeters"
    aliases = Base.case(label, "")

    @classmethod
    def reconcile(
        cls, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Pick the uncertainty value, preferring ``other`` over ``traiter``.

        A list found in ``other`` is flattened into a single string joined
        with ``dwc.SEP``. If ``other`` has nothing usable, the value stored
        under the label in ``traiter`` is used. Returns ``{}`` when neither
        source has a value.
        """
        o_val = cls.search(other, cls.aliases)

        if isinstance(o_val, list):
            # str.join only accepts strings. List items are not guaranteed
            # to be strings (an uncertainty may well arrive as a number), so
            # convert each one instead of failing with a TypeError.
            return {cls.label: dwc.SEP.join(str(item) for item in o_val)}
        if o_val:
            return {cls.label: o_val}
        if t_val := traiter.get(cls.label):
            return {cls.label: t_val}
        return {}
16 changes: 16 additions & 0 deletions traiter/pylib/reconcilers/decimal_latitude.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Any

from .base import Base


class DecimalLatitude(Base):
    """Reconciler for the ``dwc:decimalLatitude`` field."""

    # Output key, and the alias keys built by Base.case from the label plus
    # the extra latitude names.
    label = "dwc:decimalLatitude"
    aliases = Base.case(label, "dwc:latitude dwc:verbatimLatitude")

    @classmethod
    def reconcile(
        cls, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Return ``{label: value}`` when ``other`` has a latitude, else ``{}``.

        Only ``other`` is consulted; ``traiter`` is not read.
        """
        latitude = cls.search(other, cls.aliases)
        if not latitude:
            return {}
        return {cls.label: latitude}
16 changes: 16 additions & 0 deletions traiter/pylib/reconcilers/decimal_longitude.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Any

from .base import Base


class DecimalLongitude(Base):
    """Reconciler for the ``dwc:decimalLongitude`` field."""

    # Output key, and the alias keys built by Base.case from the label plus
    # the extra longitude names.
    label = "dwc:decimalLongitude"
    aliases = Base.case(label, "dwc:longitude dwc:verbatimLongitude")

    @classmethod
    def reconcile(
        cls, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Return ``{label: value}`` when ``other`` has a longitude, else ``{}``.

        Only ``other`` is consulted; ``traiter`` is not read.
        """
        longitude = cls.search(other, cls.aliases)
        if not longitude:
            return {}
        return {cls.label: longitude}
79 changes: 0 additions & 79 deletions traiter/pylib/reconcilers/elevation.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,24 @@
from .base import Base


class Date(Base):
date_lb = "dwc:eventDate"
verb_lb = "dwc:verbatimEventDate"
match_verb = Base.case(verb_lb)
match = Base.case(
date_lb,
"dwc:collectionDate dwc:earliestDateCollected dwc:latestDateCollected dwc:date",
class EventDate(Base):
label = "dwc:eventDate"
verbatim_label = "dwc:verbatimEventDate"
aliases = Base.case(
"""
dwc:collectionDate dwc:earliestDateCollected dwc:latestDateCollected
dwc:date"""
)
verbatim_aliases = Base.case(verbatim_label)

def __init__(self):
super().__init__(self.reconcile)

@classmethod
def reconcile(
self, traiter: dict[str, Any], other: dict[str, Any]
) -> dict[str, str]:
t_val = traiter.get(self.date_lb)
o_val = self.search(other, self.match)
cls, traiter: dict[str, Any], other: dict[str, Any]
) -> dict[str, Any]:
t_val = traiter.get(cls.label)
o_val = cls.search(other, cls.aliases)
t_verbatim = traiter.get(cls.verbatim_label)
o_verbatim = cls.search(other, cls.verbatim_aliases)

# If OpenAI returns a dict see if we can use it
if o_val and isinstance(o_val, dict):
Expand All @@ -41,46 +42,48 @@ def reconcile(
# Handle when OpenAI returns a list of dates
if o_val and isinstance(o_val, list) and t_val:
if any(v in t_val for v in o_val):
return {self.date_lb: dwc.SEP.join(o_val)}
return {cls.label: dwc.SEP.join(o_val)}

# Traiter found an event date but GPT did not
if not o_val and t_val:
# Does it match any other date?
if self.wildcard(other, "date"):
if cls.wildcard(other, "date"):
return {}
raise ValueError(f"MISSING in OpenAI output {self.date_lb} = {t_val}")
raise ValueError(f"MISSING in OpenAI output {cls.label} = {t_val}")

# Neither found an event date
if not o_val:
return {}

# GPT found a date, and it matches a date in traiter or traiter did not find one
if not t_val or o_val == t_val or o_val in t_val:
obj = {self.date_lb: o_val}
if v := self.search(other, self.match_verb, traiter.get(self.verb_lb)):
obj[self.verb_lb] = v
obj = {cls.label: o_val}
if o_verbatim:
obj[cls.verbatim_label] = o_verbatim
elif t_verbatim:
obj[cls.verbatim_label] = t_verbatim
return obj

# GPT's date matches Traiter's verbatim date. Use traiter's version
if o_val == traiter.get(self.verb_lb):
if o_val == t_val:
return {
self.date_lb: traiter[self.date_lb],
self.verb_lb: traiter[self.verb_lb],
cls.label: t_val,
cls.verbatim_label: t_verbatim,
}

# Try converting the OpenAI date
if o_val != t_val:
try:
new = parser.parse(o_val).date().isoformat()[:10]
except (parser.ParserError, IllegalMonthError):
raise ValueError(f"MISMATCH {self.date_lb}: {o_val} != {t_val}")
raise ValueError(f"MISMATCH {cls.label}: {o_val} != {t_val}")

if new in t_val:
return {
self.date_lb: traiter[self.date_lb],
self.verb_lb: traiter[self.verb_lb],
cls.label: t_val,
cls.verbatim_label: t_verbatim,
}

raise ValueError(f"MISMATCH {self.date_lb}: {o_val} != {t_val}")
raise ValueError(f"MISMATCH {cls.label}: {o_val} != {t_val}")

raise ValueError(f"UNKNOWN error in {self.date_lb}")
raise ValueError(f"UNKNOWN error in {cls.label}")
18 changes: 18 additions & 0 deletions traiter/pylib/reconcilers/geodetic_datum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import Any

from .base import Base


class GeodeticDatum(Base):
    """Reconciler for the ``dwc:geodeticDatum`` field."""

    # Output key, and the alias keys built by Base.case from the label plus
    # the short name "datum".
    label = "dwc:geodeticDatum"
    aliases = Base.case(label, "datum")

    @classmethod
    def reconcile(
        cls, traiter: dict[str, Any], other: dict[str, Any]
    ) -> dict[str, str]:
        """Pick the datum, preferring ``other`` and falling back to ``traiter``.

        Returns ``{}`` when neither source holds a truthy value.
        """
        # ``or`` keeps the original priority: traiter is only consulted when
        # nothing usable was found in other.
        datum = cls.search(other, cls.aliases) or traiter.get(cls.label)
        return {cls.label: datum} if datum else {}
24 changes: 8 additions & 16 deletions traiter/pylib/reconcilers/habitat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,11 @@


class Habitat(Base):
    """Reconciler for the ``dwc:habitat`` field."""

    # Output key, and the alias keys derived from it by Base.case.
    label = "dwc:habitat"
    aliases = Base.case(label)

    @classmethod
    def reconcile(cls, _: dict[str, Any], other: dict[str, Any]) -> dict[str, Any]:
        """Return ``{label: value}`` when ``other`` has a habitat, else ``{}``.

        The first argument is ignored: whatever ``other`` provides is used
        as is.
        """
        found = cls.search(other, cls.aliases)
        if not found:
            return {}
        return {cls.label: found}
Loading

0 comments on commit 9f33ad0

Please sign in to comment.