Skip to content

Commit

Permalink
Bug fix for previous mappings during alignment (#477)
Browse files Browse the repository at this point in the history
* Add fix for obo

* Update utils.py

* Update struct.py

* Update utils.py

* Update utils.py

* Update utils.py

* Update utils.py

* clean up

* Update utils.py

* Clean up

* Update utils.py
  • Loading branch information
cthoyt authored Jul 30, 2022
1 parent 7b2522f commit 4ac318a
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 34 deletions.
4 changes: 2 additions & 2 deletions src/bioregistry/align/obofoundry.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ class OBOFoundryAligner(Aligner):

key = "obofoundry"
getter = get_obofoundry
curation_header = ("name", "description")
curation_header = ("deprecated", "name", "description")
include_new = True
normalize_invmap = True

def get_skip(self) -> Mapping[str, str]:
"""Get the prefixes in the OBO Foundry that should be skipped."""
return {
"bila": "website is not longer active",
"obo_rel": "replaced",
}

Expand Down
62 changes: 37 additions & 25 deletions src/bioregistry/align/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class Aligner(ABC):

subkey: ClassVar[str] = "prefix"

normalize_invmap: ClassVar[bool] = False

def __init__(self):
"""Instantiate the aligner."""
if self.key not in read_metaregistry():
Expand All @@ -60,7 +62,10 @@ def __init__(self):
self.skip_external = self.get_skip()

# Get all of the pre-curated mappings from the Bioregistry
self.external_id_to_bioregistry_id = self.manager.get_registry_invmap(self.key)
self.external_id_to_bioregistry_id = self.manager.get_registry_invmap(
self.key,
normalize=self.normalize_invmap,
)

# Run lexical alignment
self._align()
Expand All @@ -81,46 +86,53 @@ def _align(self):
continue

bioregistry_id = self.external_id_to_bioregistry_id.get(external_id)
# There's already a mapping for this external ID to a bioregistry
# entry. Just add all of the latest metadata and move on
if bioregistry_id is not None:
self._align_action(bioregistry_id, external_id, external_entry)
continue

# try to look up the Bioregistry prefix with a lexical match
if bioregistry_id is None:
if not self.alt_key_match:
bioregistry_id = self.manager.normalize_prefix(external_id)
else:
alt_match = external_entry.get(self.alt_key_match)
if alt_match:
bioregistry_id = self.manager.normalize_prefix(alt_match)
if not self.alt_key_match:
bioregistry_id = self.manager.normalize_prefix(external_id)
else:
alt_match = external_entry.get(self.alt_key_match)
if alt_match:
bioregistry_id = self.manager.normalize_prefix(alt_match)

# A lexical match was possible
if bioregistry_id is not None:
# check this external ID for curated mismatches, and move
# on if one has already been curated
if is_mismatch(bioregistry_id, self.key, external_id):
continue
if self.skip_deprecated and self.manager.is_deprecated(bioregistry_id):
continue
self._align_action(bioregistry_id, external_id, external_entry)
continue

# add the identifier from an external resource if it's been marked as high quality
if bioregistry_id is None and self.include_new:
elif self.include_new:
bioregistry_id = norm(external_id)
if is_mismatch(bioregistry_id, self.key, external_id):
continue
self.internal_registry[bioregistry_id] = Resource(prefix=bioregistry_id)

if self._do_align_action(bioregistry_id):
self._align_action(bioregistry_id, external_id, external_entry)
continue

def _do_align_action(self, prefix: Optional[str]) -> bool:
# a match was found if the prefix is not None
return prefix is not None and (
not self.skip_deprecated or not self.manager.is_deprecated(prefix)
)

def _align_action(self, bioregistry_id, external_id, external_entry):
# skip mismatches
if is_mismatch(bioregistry_id, self.key, external_id):
return

# Add mapping
def _align_action(
self, bioregistry_id: str, external_id: str, external_entry: Dict[str, Any]
) -> None:
if self.internal_registry[bioregistry_id].mappings is None:
self.internal_registry[bioregistry_id].mappings = {}
self.internal_registry[bioregistry_id].mappings[self.key] = external_id
self.internal_registry[bioregistry_id].mappings[self.key] = external_id # type:ignore

_entry = self.prepare_external(external_id, external_entry)
_entry[self.subkey] = external_id
self.internal_registry[bioregistry_id][self.key] = _entry
self.external_id_to_bioregistry_id[external_id] = bioregistry_id

def prepare_external(self, external_id, external_entry) -> Dict[str, Any]:
def prepare_external(self, external_id: str, external_entry: Dict[str, Any]) -> Dict[str, Any]:
"""Prepare a dictionary to be added to the bioregistry for each external registry entry.
The default implementation returns `external_entry` unchanged.
Expand Down
17 changes: 14 additions & 3 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -4351,6 +4351,7 @@
"uri_format": "http://purl.obolibrary.org/obo/BILA_$1"
},
"deprecated": true,
"example": "0000000",
"fairsharing": {
"abbreviation": "BILA",
"description": "This is a record of the Bilateria anatomy terminology and ontology. We have been unable to find any further information about this resource.",
Expand All @@ -4364,6 +4365,11 @@
"obofoundry": "bila"
},
"obofoundry": {
"contact": "[email protected]",
"contact.label": "Thorsten Heinrich",
"deprecated": true,
"download.owl": "http://purl.obolibrary.org/obo/bila.owl",
"homepage": "http://4dx.embl.de/4DXpress",
"name": "Bilateria anatomy",
"prefix": "bila"
}
Expand Down Expand Up @@ -12968,6 +12974,7 @@
"prefix": "GRO-CPGA"
},
"deprecated": true,
"description": "A structured controlled vocabulary for the anatomy of Graminae. Please note that this ontology has now been superseded by the Plant Ontology.",
"mappings": {
"biocontext": "gro.cpga",
"bioportal": "GRO-CPGA",
Expand All @@ -12978,7 +12985,6 @@
"contact.label": "Plant Ontology Administrators",
"deprecated": true,
"homepage": "http://www.gramene.org/plant_ontology/",
"inactive": true,
"name": "Cereal Plant Gross Anatomy",
"prefix": "gro"
}
Expand Down Expand Up @@ -25291,14 +25297,19 @@
"namespace_in_lui": true,
"obofoundry": {
"contact": "[email protected]",
"contact.github": "hoganwr",
"contact.label": "Bill Hogan",
"contact.orcid": "0000-0002-9881-1017",
"deprecated": false,
"description": "An ontology of geographical entities",
"download.owl": "http://purl.obolibrary.org/obo/geo.owl",
"homepage": "https://github.com/ufbmi/geographical-entity-ontology/wiki",
"license": "CC-BY 4.0",
"license": "CC BY 4.0",
"license.url": "https://creativecommons.org/licenses/by/4.0/",
"name": "Geographical Entity Ontology",
"prefix": "geo"
"preferredPrefix": "GEO",
"prefix": "geo",
"repository": "https://github.com/ufbmi/geographical-entity-ontology"
},
"ols": {
"description": "An ontology of geographical entities",
Expand Down
3 changes: 0 additions & 3 deletions src/bioregistry/data/external/obofoundry/curation.tsv

This file was deleted.

14 changes: 13 additions & 1 deletion src/bioregistry/resource_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,19 @@ def get_registry_map(self, metaprefix: str) -> Dict[str, str]:

@lru_cache(maxsize=None) # noqa:B019
def get_registry_invmap(self, metaprefix: str, normalize: bool = False) -> Dict[str, str]:
"""Get a mapping from prefixes in another registry to Bioregistry prefixes."""
"""Get a mapping from prefixes in another registry to Bioregistry prefixes.
:param metaprefix: Which external registry should be used?
:param normalize: Should the external prefixes be normalized?
:returns: A mapping of external prefixes to Bioregistry prefixes
>>> from bioregistry import manager
>>> obofoundry_to_bioregistry = manager.get_registry_invmap("obofoundry", normalize=True)
>>> obofoundry_to_bioregistry["go"]
'go'
>>> obofoundry_to_bioregistry["geo"]
'geogeo'
"""
if normalize:
return {
_norm(external_prefix): prefix
Expand Down

0 comments on commit 4ac318a

Please sign in to comment.