Align with Re3data (#489)

Closes #478
biopragmatics · Jul 30, 2022 · 2d94ba5 · 2d94ba5
1 parent cce02f5
commit 2d94ba5
Show file tree

Hide file tree

Showing 10 changed files with 35,382 additions and 82 deletions.
diff --git a/src/bioregistry/align/__init__.py b/src/bioregistry/align/__init__.py
@@ -26,6 +26,7 @@
 from .ols import OLSAligner
 from .ontobee import OntobeeAligner
 from .prefixcommons import PrefixCommonsAligner
+from .re3data import Re3dataAligner
 from .uniprot import UniProtAligner
 from .utils import Aligner
 from .wikidata import WikidataAligner
@@ -56,6 +57,7 @@
     "UniProtAligner",
     "WikidataAligner",
     "EDAMAligner",
+    "Re3dataAligner",
 ]
 
 aligner_resolver = ClassResolver.from_subclasses(

diff --git a/src/bioregistry/align/re3data.py b/src/bioregistry/align/re3data.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+
+"""Align the Registry of Research Data Repositories (re3data) with the Bioregistry."""
+
+from typing import Sequence
+
+from bioregistry.align.utils import Aligner
+from bioregistry.external.re3data import get_re3data
+
+__all__ = [
+    "Re3dataAligner",
+]
+
+
+class Re3dataAligner(Aligner):
+    """Aligner for the Registry of Research Data Repositoris (r3data)."""
+
+    key = "re3data"
+    alt_key_match = "name"
+    getter = get_re3data
+    curation_header = ("name", "homepage", "description")
+
+    def get_curation_row(self, external_id, external_entry) -> Sequence[str]:
+        """Prepare curation rows for unaligned re3data registry entries."""
+        return [
+            external_entry["name"],
+            external_entry["homepage"],
+            external_entry["description"],
+        ]
+
+
+# Run the re3data alignment when this module is executed as a script
+if __name__ == "__main__":
+    Re3dataAligner.align()
diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json
diff --git a/src/bioregistry/data/external/re3data/curation.tsv b/src/bioregistry/data/external/re3data/curation.tsv
diff --git a/src/bioregistry/data/external/re3data/processed.json b/src/bioregistry/data/external/re3data/processed.json
diff --git a/src/bioregistry/data/metaregistry.json b/src/bioregistry/data/metaregistry.json
@@ -894,6 +894,49 @@
       "prefix": "prefixcommons",
       "provider_uri_format": "https://prefixcommons.org/?q=$1"
     },
+    {
+      "availability": {
+        "alternate_providers": "missing",
+        "contact": "present",
+        "description": "required",
+        "example": "missing",
+        "fair": false,
+        "fair_note": "The API for Re3data does not provide a bulk download",
+        "homepage": "required",
+        "license": "present",
+        "name": "required",
+        "pattern": "missing",
+        "provider": "missing",
+        "search": true,
+        "synonyms": "missing",
+        "version": "missing"
+      },
+      "bioregistry_prefix": "re3data",
+      "contact": {
+        "email": "[email protected]",
+        "github": "mwittin",
+        "name": "Michael Witt",
+        "orcid": "0000-0003-4221-7956"
+      },
+      "description": "Re3data is a global registry of research data repositories that covers research data repositories from different academic disciplines.",
+      "example": "r3d100010772",
+      "governance": {
+        "accepts_external_contributions": true,
+        "curates": true,
+        "curation": "opaque-review",
+        "imports": false,
+        "public_version_control": false,
+        "review_team": "private",
+        "scope": "general",
+        "status": "active"
+      },
+      "homepage": "https://www.re3data.org",
+      "license": "CC 0",
+      "name": "Registry of Research Data Repositories",
+      "prefix": "re3data",
+      "provider_uri_format": "https://www.re3data.org/repository/$1",
+      "short_name": "re3data"
+    },
     {
       "availability": {
         "alternate_providers": "missing",

diff --git a/src/bioregistry/external/__init__.py b/src/bioregistry/external/__init__.py
@@ -21,6 +21,7 @@
 from .ols import get_ols
 from .ontobee import get_ontobee
 from .prefixcommons import get_prefixcommons
+from .re3data import get_re3data
 from .uniprot import get_uniprot
 from .wikidata import get_wikidata
 
@@ -44,6 +45,7 @@
     "get_uniprot",
     "get_wikidata",
     "get_edam",
+    "get_re3data",
 ]
 
 GETTERS: List[Tuple[str, str, Callable]] = [
@@ -68,4 +70,5 @@
     ("aberowl", "AberOWL", get_aberowl),
     ("cropoct", "CropOCT", get_cropoct),
     ("edam", "EDAM", get_edam),
+    ("re3data", "re3data", get_re3data),
 ]
diff --git a/src/bioregistry/external/re3data.py b/src/bioregistry/external/re3data.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+
+"""Re3data is a registry of research data repositories.
+
+Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772
+"""
+
+import json
+import logging
+from typing import Any, Mapping, Optional, Tuple
+from xml.etree import ElementTree
+
+import requests
+from tqdm.contrib.concurrent import thread_map
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+from bioregistry.constants import EXTERNAL
+from bioregistry.utils import removeprefix
+
+__all__ = [
+    "get_re3data",
+]
+
+# Module-level logger, used for debug messages about xrefs that can't be parsed
+logger = logging.getLogger(__name__)
+# Folder for the re3data artifacts; it is created (with parents) at import time
+DIRECTORY = EXTERNAL / "re3data"
+DIRECTORY.mkdir(exist_ok=True, parents=True)
+# Cache file that get_re3data() reads from and writes to
+PROCESSED_PATH = DIRECTORY / "processed.json"
+
+# Root of the re3data website; the API endpoints are built under /api/v1
+BASE_URL = "https://www.re3data.org"
+# XML namespace, in ElementTree's "{uri}" form, prepended to tag names in record lookups
+SCHEMA = "{http://www.re3data.org/schema/2-2}"
+
+
+def get_re3data(force_download: bool = False):
+    """Get the re3data registry.
+
+    This takes about 9 minutes since it has to look up each of the ~3K
+    records with their own API call.
+
+    :param force_download: If true, re-downloads the data
+    :returns: The re3data pre-processed data
+    """
+    if PROCESSED_PATH.exists() and not force_download:
+        with PROCESSED_PATH.open() as file:
+            return json.load(file)
+
+    res = requests.get(f"{BASE_URL}/api/v1/repositories")
+    tree = ElementTree.fromstring(res.text)
+
+    identifier_to_doi = {}
+    for repository in tree.findall("repository"):
+        identifier_element = repository.find("id")
+        if identifier_element is None or identifier_element.text is None:
+            continue
+
+        doi_element = repository.find("doi")
+        doi = (
+            removeprefix(doi_element.text, "https://doi.org/")
+            if doi_element and doi_element.text
+            else None
+        )
+        identifier_to_doi[identifier_element.text.strip()] = doi
+
+    records = dict(
+        thread_map(
+            _get_record, identifier_to_doi, unit_scale=True, unit="record", desc="Getting re3data"
+        )
+    )
+
+    # backfill DOIs
+    for identifier, record in records.items():
+        doi = identifier_to_doi.get(identifier)
+        if doi:
+            record["doi"] = doi
+
+    with PROCESSED_PATH.open("w") as file:
+        json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False)
+
+    return records
+
+
+def _get_record(identifier: str) -> Tuple[str, Mapping[str, Any]]:
+    res = requests.get(f"{BASE_URL}/api/v1/repository/{identifier}")
+    tree = ElementTree.fromstring(res.text)[0]
+    return identifier, _process_record(identifier, tree)
+
+
+def _process_record(identifier: str, tree_inner):
+    xrefs = (
+        _clean_xref(element.text.strip())
+        for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier")
+    )
+    data = {
+        "prefix": identifier,
+        "name": tree_inner.find(f"{SCHEMA}repositoryName").text,
+        "description": tree_inner.find(f"{SCHEMA}description").text,
+        "homepage": tree_inner.find(f"{SCHEMA}repositoryURL").text,
+        "synonyms": [
+            element.text.strip() for element in tree_inner.findall(f"{SCHEMA}additionalName")
+        ],
+        "xrefs": dict(tup for tup in xrefs if tup),
+    }
+
+    license_element = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName")
+    if license_element:
+        data["license"] = license_element.text
+
+    return {k: v.strip() if isinstance(v, str) else v for k, v in data.items() if v}
+
+
+def _clean_xref(xref: str) -> Optional[Tuple[str, str]]:
+    if (
+        xref.startswith("FAIRsharing_DOI:10.25504/")
+        or xref.startswith("FAIRsharing_doi:10.25504/")
+        or xref.startswith("FAIRsharing_dOI:10.25504/")
+        or xref.startswith("FAIRSharing_doi:10.25504/")
+        or xref.startswith("FAIRsharing_doi;10.25504/")
+        or xref.startswith("FAIRsharing_doi: 10.25504/")
+        or xref.startswith("fairsharing_DOI:10.25504/")
+        or xref.startswith("fairsharing_doi:10.25504/")
+        or xref.startswith("FAIRsharin_doi:10.25504/")
+        or xref.startswith("FAIRsharing_doi.:10.25504/")
+        or xref.startswith("FAIRsharing_DOI: 10.25504/")
+        or xref.startswith("FAIRsharing_doi::10.25504/")
+        or xref.startswith("FAIRsharing_doi:10.24404/")
+    ):
+        return "fairsharing", xref[len("FAIRsharing_DOI:10.25504/") :]
+
+    for start, key in [
+        ("biodbcore-", "biodbcore"),
+        ("MIR:", "miriam"),
+        ("ROR:", "ror"),
+        ("OMICS_", "omics"),
+        ("Omics_", "omics"),
+        ("omics_", "omics"),
+        ("ISSN ", "issn"),
+        ("ISSN: ", "issn"),
+        ("nif-", "nif"),
+        ("ISNI:", "isni"),
+        ("doi.org/", "doi"),
+        ("doi:", "doi"),
+        ("DOI:", "doi"),
+        ("DOI: ", "doi"),
+        ("RID:nlx_", "nlx"),
+        ("PSSB-", "pssb"),
+        ("OpenDOAR:", "opendoar"),
+        ("openDOAR:", "opendoar"),
+        ("ROAR:", "roar"),  # e.g., see http://roar.eprints.org/14208/
+        ("hdl:", "hdl"),
+        ("https://fairsharing.org/", "fairsharing.legacy"),
+        ("http://fairsharing.org/", "fairsharing.legacy"),
+        ("Wikidata:", "wikidata"),
+        ("https://doi.org/10.5281/zenodo.", "zenodo"),
+        ("https://doi.org/", "doi"),
+    ]:
+        if xref.startswith(start):
+            return key, xref[len(start) :]
+
+    if xref.startswith("RRID:"):
+        inner_xref = xref[len("RRID:") :]
+        if "_" in inner_xref:
+            prefix, identifier = inner_xref.split("_", 1)
+            return prefix.lower(), identifier
+        elif "-" in inner_xref:
+            try:
+                prefix, identifier = inner_xref.split("-", 1)
+            except ValueError:
+                logger.debug("can't parse RRID: %s", xref)
+            else:
+                return prefix.lower(), identifier
+        else:
+            logger.debug("unknown RRID: %s", xref)
+            return None
+
+    if "doi:" in xref:
+        for part in xref.split(" "):
+            if part.startswith("doi"):
+                return "doi", part[len("doi:") :]
+
+    logger.debug("re3data record had unparsable xref: %s", xref)
+    return None
+
+
+# When run as a script, rebuild the processed data even if a cached copy exists
+if __name__ == "__main__":
+    # Route log messages through tqdm so they don't break up the progress bar
+    with logging_redirect_tqdm():
+        get_re3data(force_download=True)
diff --git a/src/bioregistry/schema/schema.json b/src/bioregistry/schema/schema.json
@@ -481,6 +481,10 @@
         "edam": {
           "title": "Edam",
           "type": "object"
+        },
+        "re3data": {
+          "title": "Re3Data",
+          "type": "object"
         }
       },
       "required": [

diff --git a/src/bioregistry/schema/struct.py b/src/bioregistry/schema/struct.py
@@ -430,6 +430,8 @@ class Resource(BaseModel):
     biocontext: Optional[Mapping[str, Any]]
     #: External data from EDAM ontology
     edam: Optional[Mapping[str, Any]]
+    #: External data from re3data
+    re3data: Optional[Mapping[str, Any]]
 
     def get_external(self, metaprefix) -> Mapping[str, Any]:
         """Get an external registry."""