Skip to content

Commit

Permalink
Align with Re3data (#489)
Browse files Browse the repository at this point in the history
Closes #478
  • Loading branch information
cthoyt authored Jul 30, 2022
1 parent cce02f5 commit 2d94ba5
Show file tree
Hide file tree
Showing 10 changed files with 35,382 additions and 82 deletions.
2 changes: 2 additions & 0 deletions src/bioregistry/align/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from .ols import OLSAligner
from .ontobee import OntobeeAligner
from .prefixcommons import PrefixCommonsAligner
from .re3data import Re3dataAligner
from .uniprot import UniProtAligner
from .utils import Aligner
from .wikidata import WikidataAligner
Expand Down Expand Up @@ -56,6 +57,7 @@
"UniProtAligner",
"WikidataAligner",
"EDAMAligner",
"Re3dataAligner",
]

aligner_resolver = ClassResolver.from_subclasses(
Expand Down
33 changes: 33 additions & 0 deletions src/bioregistry/align/re3data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-

"""Align the Registry of Research Data Repositories (re3data) with the Bioregistry."""

from typing import Sequence

from bioregistry.align.utils import Aligner
from bioregistry.external.re3data import get_re3data

__all__ = [
"Re3dataAligner",
]


class Re3dataAligner(Aligner):
    """Aligner for the Registry of Research Data Repositories (re3data).

    The class attributes below configure the :class:`Aligner` base class
    (defined in ``bioregistry.align.utils``; its exact use of each attribute
    is not visible from this module).
    """

    key = "re3data"
    alt_key_match = "name"
    getter = get_re3data
    curation_header = ("name", "homepage", "description")

    def get_curation_row(self, external_id, external_entry) -> Sequence[str]:
        """Prepare curation rows for unaligned re3data registry entries.

        :param external_id: The re3data identifier of the entry (not used
            when building the row)
        :param external_entry: The processed re3data record for the entry
        :returns: The name, homepage, and description of the entry, in the
            same order as :data:`curation_header`. A missing field yields an
            empty string.
        """
        # The re3data processor drops empty fields from each record, so a
        # key such as "description" may legitimately be absent. Fall back to
        # an empty cell instead of raising a KeyError during curation export.
        return [
            external_entry.get("name", ""),
            external_entry.get("homepage", ""),
            external_entry.get("description", ""),
        ]


if __name__ == "__main__":
    # When executed as a script, run the re3data alignment by invoking
    # ``align`` on the aligner class (``align`` comes from the Aligner base).
    Re3dataAligner.align()
1,033 changes: 951 additions & 82 deletions src/bioregistry/data/bioregistry.json

Large diffs are not rendered by default.

4,170 changes: 4,170 additions & 0 deletions src/bioregistry/data/external/re3data/curation.tsv

Large diffs are not rendered by default.

29,989 changes: 29,989 additions & 0 deletions src/bioregistry/data/external/re3data/processed.json

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions src/bioregistry/data/metaregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,49 @@
"prefix": "prefixcommons",
"provider_uri_format": "https://prefixcommons.org/?q=$1"
},
{
"availability": {
"alternate_providers": "missing",
"contact": "present",
"description": "required",
"example": "missing",
"fair": false,
"fair_note": "The API for Re3data does not provide a bulk download",
"homepage": "required",
"license": "present",
"name": "required",
"pattern": "missing",
"provider": "missing",
"search": true,
"synonyms": "missing",
"version": "missing"
},
"bioregistry_prefix": "re3data",
"contact": {
"email": "[email protected]",
"github": "mwittin",
"name": "Michael Witt",
"orcid": "0000-0003-4221-7956"
},
"description": "Re3data is a global registry of research data repositories that covers research data repositories from different academic disciplines.",
"example": "r3d100010772",
"governance": {
"accepts_external_contributions": true,
"curates": true,
"curation": "opaque-review",
"imports": false,
"public_version_control": false,
"review_team": "private",
"scope": "general",
"status": "active"
},
"homepage": "https://www.re3data.org",
"license": "CC 0",
"name": "Registry of Research Data Repositories",
"prefix": "re3data",
"provider_uri_format": "https://www.re3data.org/repository/$1",
"short_name": "re3data"
},
{
"availability": {
"alternate_providers": "missing",
Expand Down
3 changes: 3 additions & 0 deletions src/bioregistry/external/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .ols import get_ols
from .ontobee import get_ontobee
from .prefixcommons import get_prefixcommons
from .re3data import get_re3data
from .uniprot import get_uniprot
from .wikidata import get_wikidata

Expand All @@ -44,6 +45,7 @@
"get_uniprot",
"get_wikidata",
"get_edam",
"get_re3data",
]

GETTERS: List[Tuple[str, str, Callable]] = [
Expand All @@ -68,4 +70,5 @@
("aberowl", "AberOWL", get_aberowl),
("cropoct", "CropOCT", get_cropoct),
("edam", "EDAM", get_edam),
("re3data", "re3data", get_re3data),
]
185 changes: 185 additions & 0 deletions src/bioregistry/external/re3data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-

"""Re3data is a registry of research data repositories.
Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
"""

import json
import logging
from typing import Any, Mapping, Optional, Tuple
from xml.etree import ElementTree

import requests
from tqdm.contrib.concurrent import thread_map
from tqdm.contrib.logging import logging_redirect_tqdm

from bioregistry.constants import EXTERNAL
from bioregistry.utils import removeprefix

__all__ = [
"get_re3data",
]

# Module-level logger, used to report xrefs that can't be parsed
logger = logging.getLogger(__name__)
# Directory holding the re3data files; note it is created at import time
DIRECTORY = EXTERNAL / "re3data"
DIRECTORY.mkdir(exist_ok=True, parents=True)
# Cache file that get_re3data() reads from and writes to
PROCESSED_PATH = DIRECTORY / "processed.json"

# Root of the re3data website; API paths are appended to this
BASE_URL = "https://www.re3data.org"
# XML namespace of the re3data 2.2 schema in ``{uri}`` form, prepended to
# tag names when searching inside a single repository record
SCHEMA = "{http://www.re3data.org/schema/2-2}"


def get_re3data(force_download: bool = False):
    """Get the re3data registry.

    This takes about 9 minutes since it has to look up each of the ~3K
    records with their own API call.

    :param force_download: If true, re-downloads the data
    :returns: The re3data pre-processed data, as a dictionary from re3data
        identifiers to processed records
    :raises requests.HTTPError: If the repository listing can't be retrieved
    """
    if PROCESSED_PATH.exists() and not force_download:
        # The cache is written as UTF-8 (see below), so read it the same way
        # rather than relying on the platform's default encoding
        with PROCESSED_PATH.open(encoding="utf-8") as file:
            return json.load(file)

    res = requests.get(f"{BASE_URL}/api/v1/repositories")
    # Fail early with a clear HTTP error instead of trying to parse an
    # error page as XML
    res.raise_for_status()
    tree = ElementTree.fromstring(res.text)

    identifier_to_doi = {}
    for repository in tree.findall("repository"):
        identifier_element = repository.find("id")
        if identifier_element is None or identifier_element.text is None:
            continue

        doi_element = repository.find("doi")
        # An Element without children is falsy, so it must be compared
        # against None explicitly; a bare truthiness check would discard
        # every DOI since <doi> is a leaf element.
        doi_text = doi_element.text.strip() if doi_element is not None and doi_element.text else ""
        doi = removeprefix(doi_text, "https://doi.org/") if doi_text else None
        identifier_to_doi[identifier_element.text.strip()] = doi

    # Iterating over the dictionary passes each identifier to _get_record,
    # which returns (identifier, record) pairs
    records = dict(
        thread_map(
            _get_record, identifier_to_doi, unit_scale=True, unit="record", desc="Getting re3data"
        )
    )

    # backfill DOIs
    for identifier, record in records.items():
        doi = identifier_to_doi.get(identifier)
        if doi:
            record["doi"] = doi

    # ensure_ascii=False emits raw non-ASCII characters, so the file needs an
    # explicit UTF-8 encoding to be written reliably on every platform
    with PROCESSED_PATH.open("w", encoding="utf-8") as file:
        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)

    return records


def _get_record(identifier: str) -> Tuple[str, Mapping[str, Any]]:
    """Download and process the re3data record for a single repository.

    :param identifier: The re3data identifier of the repository
    :returns: A pair of the identifier and its processed record
    """
    url = f"{BASE_URL}/api/v1/repository/{identifier}"
    response = requests.get(url)
    root = ElementTree.fromstring(response.text)
    # The first child of the response's root element is what gets processed
    record_element = root[0]
    processed = _process_record(identifier, record_element)
    return identifier, processed


def _process_record(identifier: str, tree_inner):
    """Convert the XML element for a re3data repository into a dictionary.

    :param identifier: The re3data identifier of the repository
    :param tree_inner: The XML element for the repository, whose children
        are tags in the re3data schema namespace
    :returns: A dictionary with the prefix, name, description, homepage,
        synonyms, xrefs, and license of the repository. Fields that are
        missing or empty in the XML are left out.
    """
    # Skip elements without text instead of failing on ``None.strip()``
    xrefs = (
        _clean_xref(element.text.strip())
        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
        if element.text
    )
    # findtext() returns None when the element is absent, which avoids an
    # AttributeError for records lacking e.g. a description
    data = {
        "prefix": identifier,
        "name": tree_inner.findtext(f"{SCHEMA}repositoryName"),
        "description": tree_inner.findtext(f"{SCHEMA}description"),
        "homepage": tree_inner.findtext(f"{SCHEMA}repositoryURL"),
        "synonyms": [
            element.text.strip()
            for element in tree_inner.findall(f"{SCHEMA}additionalName")
            if element.text and element.text.strip()
        ],
        # _clean_xref returns None for unparsable xrefs; drop those
        "xrefs": dict(tup for tup in xrefs if tup),
        # Look the text up directly rather than testing the element's
        # truthiness: a childless Element is falsy, so a truthiness check
        # would never record a license.
        "license": tree_inner.findtext(
            f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName"
        ),
    }

    # Strip strings first so that whitespace-only values are dropped along
    # with the other empty values
    cleaned = {}
    for key, value in data.items():
        if isinstance(value, str):
            value = value.strip()
        if value:
            cleaned[key] = value
    return cleaned


def _clean_xref(xref: str) -> Optional[Tuple[str, str]]:
if (
xref.startswith("FAIRsharing_DOI:10.25504/")
or xref.startswith("FAIRsharing_doi:10.25504/")
or xref.startswith("FAIRsharing_dOI:10.25504/")
or xref.startswith("FAIRSharing_doi:10.25504/")
or xref.startswith("FAIRsharing_doi;10.25504/")
or xref.startswith("FAIRsharing_doi: 10.25504/")
or xref.startswith("fairsharing_DOI:10.25504/")
or xref.startswith("fairsharing_doi:10.25504/")
or xref.startswith("FAIRsharin_doi:10.25504/")
or xref.startswith("FAIRsharing_doi.:10.25504/")
or xref.startswith("FAIRsharing_DOI: 10.25504/")
or xref.startswith("FAIRsharing_doi::10.25504/")
or xref.startswith("FAIRsharing_doi:10.24404/")
):
return "fairsharing", xref[len("FAIRsharing_DOI:10.25504/") :]

for start, key in [
("biodbcore-", "biodbcore"),
("MIR:", "miriam"),
("ROR:", "ror"),
("OMICS_", "omics"),
("Omics_", "omics"),
("omics_", "omics"),
("ISSN ", "issn"),
("ISSN: ", "issn"),
("nif-", "nif"),
("ISNI:", "isni"),
("doi.org/", "doi"),
("doi:", "doi"),
("DOI:", "doi"),
("DOI: ", "doi"),
("RID:nlx_", "nlx"),
("PSSB-", "pssb"),
("OpenDOAR:", "opendoar"),
("openDOAR:", "opendoar"),
("ROAR:", "roar"), # e.g., see http://roar.eprints.org/14208/
("hdl:", "hdl"),
("https://fairsharing.org/", "fairsharing.legacy"),
("http://fairsharing.org/", "fairsharing.legacy"),
("Wikidata:", "wikidata"),
("https://doi.org/10.5281/zenodo.", "zenodo"),
("https://doi.org/", "doi"),
]:
if xref.startswith(start):
return key, xref[len(start) :]

if xref.startswith("RRID:"):
inner_xref = xref[len("RRID:") :]
if "_" in inner_xref:
prefix, identifier = inner_xref.split("_", 1)
return prefix.lower(), identifier
elif "-" in inner_xref:
try:
prefix, identifier = inner_xref.split("-", 1)
except ValueError:
logger.debug("can't parse RRID: %s", xref)
else:
return prefix.lower(), identifier
else:
logger.debug("unknown RRID: %s", xref)
return None

if "doi:" in xref:
for part in xref.split(" "):
if part.startswith("doi"):
return "doi", part[len("doi:") :]

logger.debug("re3data record had unparsable xref: %s", xref)
return None


if __name__ == "__main__":
    # When executed as a script, refresh the cached re3data export by forcing
    # a new download. This runs inside tqdm's logging_redirect_tqdm context
    # manager (presumably so log output doesn't garble the progress bar).
    with logging_redirect_tqdm():
        get_re3data(force_download=True)
4 changes: 4 additions & 0 deletions src/bioregistry/schema/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,10 @@
"edam": {
"title": "Edam",
"type": "object"
},
"re3data": {
"title": "Re3Data",
"type": "object"
}
},
"required": [
Expand Down
2 changes: 2 additions & 0 deletions src/bioregistry/schema/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ class Resource(BaseModel):
biocontext: Optional[Mapping[str, Any]]
#: External data from EDAM ontology
edam: Optional[Mapping[str, Any]]
#: External data from re3data
re3data: Optional[Mapping[str, Any]]

def get_external(self, metaprefix) -> Mapping[str, Any]:
"""Get an external registry."""
Expand Down

0 comments on commit 2d94ba5

Please sign in to comment.