-
-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes #478
- Loading branch information
Showing
10 changed files
with
35,382 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
"""Align Registry of Research Data Repositoris (r3data) with the Bioregistry.""" | ||
|
||
from typing import Sequence | ||
|
||
from bioregistry.align.utils import Aligner | ||
from bioregistry.external.re3data import get_re3data | ||
|
||
__all__ = [ | ||
"Re3dataAligner", | ||
] | ||
|
||
|
||
class Re3dataAligner(Aligner): | ||
"""Aligner for the Registry of Research Data Repositoris (r3data).""" | ||
|
||
key = "re3data" | ||
alt_key_match = "name" | ||
getter = get_re3data | ||
curation_header = ("name", "homepage", "description") | ||
|
||
def get_curation_row(self, external_id, external_entry) -> Sequence[str]: | ||
"""Prepare curation rows for unaligned re3data registry entries.""" | ||
return [ | ||
external_entry["name"], | ||
external_entry["homepage"], | ||
external_entry["description"], | ||
] | ||
|
||
|
||
if __name__ == "__main__": | ||
Re3dataAligner.align() |
Large diffs are not rendered by default.
Oops, something went wrong.
4,170 changes: 4,170 additions & 0 deletions
4,170
src/bioregistry/data/external/re3data/curation.tsv
Large diffs are not rendered by default.
Oops, something went wrong.
29,989 changes: 29,989 additions & 0 deletions
29,989
src/bioregistry/data/external/re3data/processed.json
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -894,6 +894,49 @@ | |
"prefix": "prefixcommons", | ||
"provider_uri_format": "https://prefixcommons.org/?q=$1" | ||
}, | ||
{ | ||
"availability": { | ||
"alternate_providers": "missing", | ||
"contact": "present", | ||
"description": "required", | ||
"example": "missing", | ||
"fair": false, | ||
"fair_note": "The API for Re3data does not provide a bulk download", | ||
"homepage": "required", | ||
"license": "present", | ||
"name": "required", | ||
"pattern": "missing", | ||
"provider": "missing", | ||
"search": true, | ||
"synonyms": "missing", | ||
"version": "missing" | ||
}, | ||
"bioregistry_prefix": "re3data", | ||
"contact": { | ||
"email": "[email protected]", | ||
"github": "mwittin", | ||
"name": "Michael Witt", | ||
"orcid": "0000-0003-4221-7956" | ||
}, | ||
"description": "Re3data is a global registry of research data repositories that covers research data repositories from different academic disciplines.", | ||
"example": "r3d100010772", | ||
"governance": { | ||
"accepts_external_contributions": true, | ||
"curates": true, | ||
"curation": "opaque-review", | ||
"imports": false, | ||
"public_version_control": false, | ||
"review_team": "private", | ||
"scope": "general", | ||
"status": "active" | ||
}, | ||
"homepage": "https://www.re3data.org", | ||
"license": "CC 0", | ||
"name": "Registry of Research Data Repositories", | ||
"prefix": "re3data", | ||
"provider_uri_format": "https://www.re3data.org/repository/$1", | ||
"short_name": "re3data" | ||
}, | ||
{ | ||
"availability": { | ||
"alternate_providers": "missing", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
"""Re3data is a registry of research data repositories. | ||
Example API endpoint: https://www.re3data.org/api/v1/repository/r3d100010772 | ||
""" | ||
|
||
import json | ||
import logging | ||
from typing import Any, Mapping, Optional, Tuple | ||
from xml.etree import ElementTree | ||
|
||
import requests | ||
from tqdm.contrib.concurrent import thread_map | ||
from tqdm.contrib.logging import logging_redirect_tqdm | ||
|
||
from bioregistry.constants import EXTERNAL | ||
from bioregistry.utils import removeprefix | ||
|
||
__all__ = [ | ||
"get_re3data", | ||
] | ||
|
||
logger = logging.getLogger(__name__) | ||
DIRECTORY = EXTERNAL / "re3data" | ||
DIRECTORY.mkdir(exist_ok=True, parents=True) | ||
PROCESSED_PATH = DIRECTORY / "processed.json" | ||
|
||
BASE_URL = "https://www.re3data.org" | ||
SCHEMA = "{http://www.re3data.org/schema/2-2}" | ||
|
||
|
||
def get_re3data(force_download: bool = False): | ||
"""Get the re3data registry. | ||
This takes about 9 minutes since it has to look up each of the ~3K | ||
records with their own API call. | ||
:param force_download: If true, re-downloads the data | ||
:returns: The re3data pre-processed data | ||
""" | ||
if PROCESSED_PATH.exists() and not force_download: | ||
with PROCESSED_PATH.open() as file: | ||
return json.load(file) | ||
|
||
res = requests.get(f"{BASE_URL}/api/v1/repositories") | ||
tree = ElementTree.fromstring(res.text) | ||
|
||
identifier_to_doi = {} | ||
for repository in tree.findall("repository"): | ||
identifier_element = repository.find("id") | ||
if identifier_element is None or identifier_element.text is None: | ||
continue | ||
|
||
doi_element = repository.find("doi") | ||
doi = ( | ||
removeprefix(doi_element.text, "https://doi.org/") | ||
if doi_element and doi_element.text | ||
else None | ||
) | ||
identifier_to_doi[identifier_element.text.strip()] = doi | ||
|
||
records = dict( | ||
thread_map( | ||
_get_record, identifier_to_doi, unit_scale=True, unit="record", desc="Getting re3data" | ||
) | ||
) | ||
|
||
# backfill DOIs | ||
for identifier, record in records.items(): | ||
doi = identifier_to_doi.get(identifier) | ||
if doi: | ||
record["doi"] = doi | ||
|
||
with PROCESSED_PATH.open("w") as file: | ||
json.dump(records, file, indent=2, sort_keys=True, ensure_ascii=False) | ||
|
||
return records | ||
|
||
|
||
def _get_record(identifier: str) -> Tuple[str, Mapping[str, Any]]: | ||
res = requests.get(f"{BASE_URL}/api/v1/repository/{identifier}") | ||
tree = ElementTree.fromstring(res.text)[0] | ||
return identifier, _process_record(identifier, tree) | ||
|
||
|
||
def _process_record(identifier: str, tree_inner): | ||
xrefs = ( | ||
_clean_xref(element.text.strip()) | ||
for element in tree_inner.findall(f"{SCHEMA}repositoryIdentifier") | ||
) | ||
data = { | ||
"prefix": identifier, | ||
"name": tree_inner.find(f"{SCHEMA}repositoryName").text, | ||
"description": tree_inner.find(f"{SCHEMA}description").text, | ||
"homepage": tree_inner.find(f"{SCHEMA}repositoryURL").text, | ||
"synonyms": [ | ||
element.text.strip() for element in tree_inner.findall(f"{SCHEMA}additionalName") | ||
], | ||
"xrefs": dict(tup for tup in xrefs if tup), | ||
} | ||
|
||
license_element = tree_inner.find(f"{SCHEMA}databaseLicense/{SCHEMA}databaseLicenseName") | ||
if license_element: | ||
data["license"] = license_element.text | ||
|
||
return {k: v.strip() if isinstance(v, str) else v for k, v in data.items() if v} | ||
|
||
|
||
def _clean_xref(xref: str) -> Optional[Tuple[str, str]]: | ||
if ( | ||
xref.startswith("FAIRsharing_DOI:10.25504/") | ||
or xref.startswith("FAIRsharing_doi:10.25504/") | ||
or xref.startswith("FAIRsharing_dOI:10.25504/") | ||
or xref.startswith("FAIRSharing_doi:10.25504/") | ||
or xref.startswith("FAIRsharing_doi;10.25504/") | ||
or xref.startswith("FAIRsharing_doi: 10.25504/") | ||
or xref.startswith("fairsharing_DOI:10.25504/") | ||
or xref.startswith("fairsharing_doi:10.25504/") | ||
or xref.startswith("FAIRsharin_doi:10.25504/") | ||
or xref.startswith("FAIRsharing_doi.:10.25504/") | ||
or xref.startswith("FAIRsharing_DOI: 10.25504/") | ||
or xref.startswith("FAIRsharing_doi::10.25504/") | ||
or xref.startswith("FAIRsharing_doi:10.24404/") | ||
): | ||
return "fairsharing", xref[len("FAIRsharing_DOI:10.25504/") :] | ||
|
||
for start, key in [ | ||
("biodbcore-", "biodbcore"), | ||
("MIR:", "miriam"), | ||
("ROR:", "ror"), | ||
("OMICS_", "omics"), | ||
("Omics_", "omics"), | ||
("omics_", "omics"), | ||
("ISSN ", "issn"), | ||
("ISSN: ", "issn"), | ||
("nif-", "nif"), | ||
("ISNI:", "isni"), | ||
("doi.org/", "doi"), | ||
("doi:", "doi"), | ||
("DOI:", "doi"), | ||
("DOI: ", "doi"), | ||
("RID:nlx_", "nlx"), | ||
("PSSB-", "pssb"), | ||
("OpenDOAR:", "opendoar"), | ||
("openDOAR:", "opendoar"), | ||
("ROAR:", "roar"), # e.g., see http://roar.eprints.org/14208/ | ||
("hdl:", "hdl"), | ||
("https://fairsharing.org/", "fairsharing.legacy"), | ||
("http://fairsharing.org/", "fairsharing.legacy"), | ||
("Wikidata:", "wikidata"), | ||
("https://doi.org/10.5281/zenodo.", "zenodo"), | ||
("https://doi.org/", "doi"), | ||
]: | ||
if xref.startswith(start): | ||
return key, xref[len(start) :] | ||
|
||
if xref.startswith("RRID:"): | ||
inner_xref = xref[len("RRID:") :] | ||
if "_" in inner_xref: | ||
prefix, identifier = inner_xref.split("_", 1) | ||
return prefix.lower(), identifier | ||
elif "-" in inner_xref: | ||
try: | ||
prefix, identifier = inner_xref.split("-", 1) | ||
except ValueError: | ||
logger.debug("can't parse RRID: %s", xref) | ||
else: | ||
return prefix.lower(), identifier | ||
else: | ||
logger.debug("unknown RRID: %s", xref) | ||
return None | ||
|
||
if "doi:" in xref: | ||
for part in xref.split(" "): | ||
if part.startswith("doi"): | ||
return "doi", part[len("doi:") :] | ||
|
||
logger.debug("re3data record had unparsable xref: %s", xref) | ||
return None | ||
|
||
|
||
if __name__ == "__main__": | ||
with logging_redirect_tqdm(): | ||
get_re3data(force_download=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters