Skip to content

Commit

Permalink
More RDF fixes (#196)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmillanacosta authored Nov 18, 2024
2 parents 3a332ae + 38cbed0 commit a7b14dd
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 128 deletions.
16 changes: 15 additions & 1 deletion src/pyBiodatafuse/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,13 +489,14 @@
"umls": "https://uts-ws.nlm.nih.gov/rest/semantic-network/2015AB/CUI/",
"ensembl": "https://identifiers.org/ensembl:",
"dcat": "http://www.w3.org/ns/dcat#",
"biodatafuse": "https://biodatafuse.org/",
"bdf": "https://biodatafuse.org/",
"foaf": "http://xmlns.com/foaf/0.1/",
"skos": "http://www.w3.org/2004/02/skos/core#",
"owl": "http://www.w3.org/2002/07/owl#",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"xsd": "http://www.w3.org/2001/XMLSchema#",
"bdfo": "https://biodatafuse.org/onto/bdf#",
}

# Patterns URIs for nodes (one for each node in the schema)
Expand Down Expand Up @@ -535,6 +536,8 @@
"approved_compound": "http://purl.obolibrary.org/obo/NCIT_C172573",
"aid": "http://purl.obolibrary.org/obo/CLO_0037244",
"developmental_stage_node": "http://purl.obolibrary.org/obo/NCIT_C43531",
"el_node": "https://biodatafuse.org/onto/bdf#DisGeNET_Evidence_Level",
"ei_node": "https://biodatafuse.org/onto/bdf#DisGeNET_Evidence_Index",
}

# PREDICATES
Expand Down Expand Up @@ -614,3 +617,14 @@
"ensembl": "http://identifiers.org/ensembl#",
"stringdb": "https://string-db.org/network/",
}

# Default namespace-IRI -> prefix-label mapping used for shape generation.
# Note the direction: keys are namespace IRIs and values are the prefix labels
# (the reverse of a prefix -> IRI binding table).
# get_shapes in graph/rdf/utils.py passes this dict to shexer's Shaper as
# ``namespaces_dict``.
# NOTE(review): get_shapes also writes into this dict in place (it sets
# ``NAMESPACE_SHAPES[base_uri] = "graph"`` and calls ``.update(...)`` with any
# additional namespaces), so entries added there persist across calls --
# confirm that sharing this module-level state is intended.
NAMESPACE_SHAPES = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://example.org/": "ex",
    "http://weso.es/shapes/": ":",
    "http://www.w3.org/2001/XMLSchema#": "xsd",
    "http://www.w3.org/2002/07/owl#": "owl",
    "http://purl.obolibrary.org/obo/": "obo",
    "http://purl.obolibrary.org/obo/so#": "so",
    # Same IRI that constants.py binds to the "bdfo" prefix for the BDF ontology.
    "https://biodatafuse.org/onto/bdf#": "bdfo",
}
174 changes: 80 additions & 94 deletions src/pyBiodatafuse/graph/rdf/nodes/gene_disease.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
# gene_disease.py

"""Populate a BDF RDF graph with gene-disease relationship data."""


from bioregistry import get_iri
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF, RDFS, XSD

from pyBiodatafuse.constants import DISEASE_IDENTIFIER_TYPES, NODE_TYPES, PREDICATES
from pyBiodatafuse.graph.rdf.metadata import ( # pylint: disable=no-name-in-module
add_data_source_node,
)
from pyBiodatafuse.graph.rdf.metadata import add_data_source_node


def add_gene_disease_associations(
Expand All @@ -28,70 +23,44 @@ def add_gene_disease_associations(
:param id_number: Unique identifier for the associations.
:param source_idx: Source index for the associations.
:param gene_node: URIRef of the gene node associated with the disease data.
:param disease_data: List of dictionaries containing disease association information.
:param disease_data: Dictionary containing disease association information.
:param new_uris: Dictionary with updated project base URIs for the nodes.
:param i: the index of the row
:param i: The index of the row.
"""
data = disease_data
gene_id = g.value(subject=gene_node, predicate=RDFS.label)
disease_node = add_disease_node(g, data)
if disease_node:
gene_disease_assoc_node = URIRef(
f"{new_uris['gene_disease_association']}/{id_number}{i}{source_idx}"
disease_node = add_disease_node(g, disease_data)
if not disease_node:
return

assoc_node = URIRef(f"{new_uris['gene_disease_association']}/{id_number}{i}{source_idx}")
g.add((assoc_node, RDF.type, URIRef(NODE_TYPES["gene_disease_association"])))
g.add((assoc_node, URIRef(PREDICATES["sio_refers_to"]), gene_node))
g.add((assoc_node, URIRef(PREDICATES["sio_refers_to"]), disease_node))
umlscui = disease_data.get("disease_umlscui", "")
score = disease_data.get("score", "")
if disease_data.get("score"):
score_node = add_score_node(
g,
id_number,
source_idx,
umlscui,
score,
new_uris,
i,
gene_id,
)
g.add((gene_disease_assoc_node, RDF.type, URIRef(NODE_TYPES["gene_disease_association"])))
g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_refers_to"]), gene_node))
g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_refers_to"]), disease_node))
if data.get("score"):
disease_umlscui = disease_data.get("disease_umlscui", None)
if disease_umlscui:
score_node = add_score_node(
g=g,
id_number=id_number,
source_idx=source_idx,
score=data["score"],
new_uris=new_uris,
i=i,
disease_id=disease_umlscui,
gene_id=gene_id,
)
g.add(
(
gene_disease_assoc_node,
URIRef(PREDICATES["sio_has_measurement_value"]),
score_node,
)
)
data_source_node = add_data_source_node(g, "DISGENET")
g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_has_source"]), data_source_node))


def add_evidence_idx_node(
g: Graph, id_number: str, source_idx: str, disease_id: str, ei: float, new_uris: dict, i
) -> URIRef:
"""Create and add an evidence index (EI) node to the RDF graph.
g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), score_node))

:param g: RDF graph to which the EI node will be added.
:param id_number: Unique identifier for the EI node.
:param source_idx: Source index for the association.
:param disease_id: Disease identifier associated with the EI.
:param ei: Evidence index value for the gene-disease association.
:param new_uris: Dictionary with updated project base URIs for the nodes.
:param i: Index or iteration number used for node URI construction.
:return: URIRef for the created EI node.
"""
evidence_idx_node = URIRef(
f"{new_uris['score_base_node']}/{id_number}{i}{source_idx}_{disease_id}"
)
g.add((evidence_idx_node, RDF.type, URIRef(NODE_TYPES["evidence_idx_node"])))
g.add(
(
evidence_idx_node,
URIRef(PREDICATES["sio_has_value"]),
Literal(ei, datatype=XSD.double),
)
)
return evidence_idx_node
if disease_data.get("ei"):
ei_node = add_evidence_node(g, id_number, source_idx, "ei", disease_data, new_uris, i)
g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), ei_node))

if disease_data.get("el"):
el_node = add_evidence_node(g, id_number, source_idx, "el", disease_data, new_uris, i)
g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), el_node))

data_source_node = add_data_source_node(g, "DISGENET")
g.add((assoc_node, URIRef(PREDICATES["sio_has_source"]), data_source_node))


def add_disease_node(g: Graph, disease_data: dict) -> URIRef:
Expand All @@ -101,34 +70,25 @@ def add_disease_node(g: Graph, disease_data: dict) -> URIRef:
:param disease_data: Dictionary containing disease information.
:return: URIRef for the created disease node.
"""
# UMLS IRIs not in Bioregistry
disease_curie = disease_data.get("disease_umlscui")
if disease_curie is None:
disease_curie = disease_data.get("UMLS")
disease_curie = disease_data.get("disease_umlscui") or disease_data.get("UMLS")
if not disease_curie:
return None

disease_iri = f"https://www.ncbi.nlm.nih.gov/medgen/{disease_curie}"
# disease_data.get()
disease_node = URIRef(disease_iri)
g.add((disease_node, RDF.type, URIRef(NODE_TYPES["disease_node"])))
g.add(
(disease_node, RDFS.label, Literal(disease_data.get("disease_name"), datatype=XSD.string))
)
for identifier_type in DISEASE_IDENTIFIER_TYPES:
curie_field = disease_data.get(identifier_type, None)
if curie_field and "ncit" not in curie_field:
curies = [curie_field]
if "," in (curie_field):
curies = [i for i in curie_field.split(", ")]

for identifier in DISEASE_IDENTIFIER_TYPES:
if disease_data.get(identifier):
curies = disease_data.get(identifier, "").split(", ")
for curie in curies:
disease_source_iri = get_iri(curie)
if disease_source_iri is None:
if ":" in curie:
curie = curie.split(":")[1]
disease_source_iri = get_iri("obo:" + curie)
g.add(
(disease_node, OWL.SameAs, URIRef(disease_source_iri))
) # Some of the data does not look like a skos:exactMatch
else:
g.add((disease_node, OWL.sameAs, URIRef(disease_source_iri)))
source_iri = get_iri(curie) or get_iri(f"obo:{curie.split(':')[-1]}")
if source_iri:
g.add((disease_node, OWL.sameAs, URIRef(source_iri)))

return disease_node


Expand All @@ -151,18 +111,44 @@ def add_score_node(
:param score: Score value for the gene-disease association.
:param new_uris: Dictionary with updated project base URIs for the nodes.
:param i: Index or iteration number used for node URI construction.
:param gene_id: String value of the gene ID
:param gene_id: String value of the gene ID.
:return: URIRef for the created score node.
"""
score_node = URIRef(
f"{new_uris['score_base_node']}/{id_number}{i}{source_idx}_{disease_id}{gene_id}"
)
g.add((score_node, RDF.type, URIRef(NODE_TYPES["score_node"])))
g.add(
(
score_node,
URIRef(PREDICATES["sio_has_value"]),
Literal(score, datatype=XSD.double),
)
)
g.add((score_node, URIRef(PREDICATES["sio_has_value"]), Literal(score, datatype=XSD.double)))
return score_node


def add_evidence_node(
    g: Graph,
    id_number: str,
    source_idx: str,
    evidence_type: str,
    disease_data: dict,
    new_uris: dict,
    i: int,
) -> URIRef:
    """Create and add an evidence node (EI or EL) to the RDF graph.

    :param g: RDF graph to which the evidence node will be added.
    :param id_number: Unique identifier for the evidence node.
    :param source_idx: Source index for the association.
    :param evidence_type: Type of evidence ("ei" or "el"); used both as the key
        read from ``disease_data`` and to pick ``NODE_TYPES[f"{evidence_type}_node"]``.
    :param disease_data: Dictionary containing disease information.
    :param new_uris: Dictionary with updated project base URIs for the nodes.
    :param i: Index or iteration number used for node URI construction.
    :return: URIRef for the created evidence node, or None when ``disease_data``
        holds no value for ``evidence_type``.
    :raises KeyError: If ``disease_data`` has no disease identifier under either
        ``disease_umlscui`` or ``UMLS``.
    """
    value = disease_data.get(evidence_type)
    if value is None:
        return None

    # Accept the same identifier keys as add_disease_node, so a record that
    # only carries its CUI under "UMLS" does not fail here after the disease
    # node itself was created successfully.
    disease_id = disease_data.get("disease_umlscui") or disease_data.get("UMLS")
    if not disease_id:
        raise KeyError("disease_umlscui")

    node = URIRef(
        f"{new_uris['score_base_node']}/{evidence_type}/"
        f"{id_number}{i}{source_idx}_{disease_id}"
    )
    g.add((node, RDF.type, URIRef(NODE_TYPES[f"{evidence_type}_node"])))
    # NOTE(review): the value is always typed as xsd:double. That fits a numeric
    # evidence index; if "el" values arrive as text labels this would produce an
    # ill-typed literal -- confirm against the upstream data.
    g.add((node, URIRef(PREDICATES["sio_has_value"]), Literal(value, datatype=XSD.double)))
    return node
43 changes: 25 additions & 18 deletions src/pyBiodatafuse/graph/rdf/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,22 +117,29 @@ def generate_rdf(self, df: pd.DataFrame, metadata: dict, open_only: bool = False
stringdb_data = row.get(STRING_PPI_COL, None)
# molmedb_data = row.get("MolMeDB_transporter_inhibitor", None)
# transporter_inhibited_data = row.get(MOLMEDB_COMPOUND_PROTEIN_COL, None)
# Initialize an empty list to store disease data
disease_data = []
for source in [DISGENET_DISEASE_COL, OPENTARGETS_DISEASE_COL]:
if open_only and source == DISGENET_DISEASE_COL:
continue
source_el = row.get(source)
if isinstance(source_el, list):
disease_data += source_el

if (
pd.isna(source_idx)
or pd.isna(source_namespace)
or pd.isna(target_idx)
or pd.isna(target_namespace)

# Iterate over the specified data columns
for source_col in [DISGENET_DISEASE_COL, OPENTARGETS_DISEASE_COL]:
if open_only and source_col == DISGENET_DISEASE_COL:
continue # TODO fix open data only feature
# Extract the source data from the current row
source_data = row.get(source_col, [])
# Ensure source_data is not None and is of type array or list
if len(source_data) > 0:
disease_data.extend(source_data)
# Check for missing required indices or namespaces and skip the row if any are NaN
if any(
pd.isna(val) for val in [source_idx, source_namespace, target_idx, target_namespace]
):
continue

# Check for missing required indices or namespaces and skip the row if any are NaN
if any(
pd.isna(val) for val in [source_idx, source_namespace, target_idx, target_namespace]
):
continue
source_curie = normalize_curie(f"{source_namespace}:{source_idx}")
target_curie = normalize_curie(f"{target_namespace}:{target_idx}")

Expand Down Expand Up @@ -307,12 +314,12 @@ def add_metadata(self, metadata):
:param metadata: Metadata to be added.
"""
add_metadata(
self,
self.version_iri,
self.author,
self.orcid,
metadata,
self.version_iri,
g=self,
graph_uri=self.version_iri, # TODO fix
version_iri=self.version_iri,
author=self.author,
orcid=self.orcid,
metadata=metadata,
)

def shex(
Expand Down
20 changes: 5 additions & 15 deletions src/pyBiodatafuse/graph/rdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from shexer.consts import SHACL_TURTLE, TURTLE
from shexer.shaper import Shaper

from pyBiodatafuse.constants import DATA_SOURCES, NAMESPACE_BINDINGS, NODE_TYPES
from pyBiodatafuse.constants import DATA_SOURCES, NAMESPACE_BINDINGS, NAMESPACE_SHAPES, NODE_TYPES

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -106,27 +106,17 @@ def get_shapes(
except AttributeError as exc:
raise ValueError("graph_type must be a string.") from exc

# Default namespaces
namespaces_dict = {
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
"http://example.org/": "ex",
"http://weso.es/shapes/": ":",
"http://www.w3.org/2001/XMLSchema#": "xsd",
"http://www.w3.org/2002/07/owl#": "owl",
f"{base_uri}": "graph",
"http://purl.obolibrary.org/obo/": "obo",
"http://purl.obolibrary.org/obo/so#": "so",
}

# Default namespaces: NAMESPACE_SHAPES
NAMESPACE_SHAPES[base_uri] = "graph"
# Merge with additional namespaces if provided
if additional_namespaces:
namespaces_dict.update(additional_namespaces)
NAMESPACE_SHAPES.update(additional_namespaces)
# Initialize Shaper with the given graph and namespaces
shaper = Shaper(
all_classes_mode=True,
rdflib_graph=g,
input_format=TURTLE,
namespaces_dict=namespaces_dict,
namespaces_dict=NAMESPACE_SHAPES,
)
graph_result = None
# Generate the appropriate graph (Shex or SHACL)
Expand Down
Loading

0 comments on commit a7b14dd

Please sign in to comment.