More RDF fixes (#196)

BioDataFuse · Nov 18, 2024 · a7b14dd · a7b14dd
2 parents 3a332ae + 38cbed0
commit a7b14dd
Show file tree

Hide file tree

Showing 5 changed files with 164 additions and 128 deletions.
diff --git a/src/pyBiodatafuse/constants.py b/src/pyBiodatafuse/constants.py
@@ -489,13 +489,14 @@
     "umls": "https://uts-ws.nlm.nih.gov/rest/semantic-network/2015AB/CUI/",
     "ensembl": "https://identifiers.org/ensembl:",
     "dcat": "http://www.w3.org/ns/dcat#",
-    "biodatafuse": "https://biodatafuse.org/",
+    "bdf": "https://biodatafuse.org/",
     "foaf": "http://xmlns.com/foaf/0.1/",
     "skos": "http://www.w3.org/2004/02/skos/core#",
     "owl": "http://www.w3.org/2002/07/owl#",
     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
     "xsd": "http://www.w3.org/2001/XMLSchema#",
+    "bdfo": "https://biodatafuse.org/onto/bdf#",
 }
 
 # Patterns URIs for nodes (one for each node in the schema)
@@ -535,6 +536,8 @@
     "approved_compound": "http://purl.obolibrary.org/obo/NCIT_C172573",
     "aid": "http://purl.obolibrary.org/obo/CLO_0037244",
     "developmental_stage_node": "http://purl.obolibrary.org/obo/NCIT_C43531",
+    "el_node": "https://biodatafuse.org/onto/bdf#DisGeNET_Evidence_Level",
+    "ei_node": "https://biodatafuse.org/onto/bdf#DisGeNET_Evidence_Index",
 }
 
 # PREDICATES
@@ -614,3 +617,14 @@
     "ensembl": "http://identifiers.org/ensembl#",
     "stringdb": "https://string-db.org/network/",
 }
+
+NAMESPACE_SHAPES = {
+    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
+    "http://example.org/": "ex",
+    "http://weso.es/shapes/": ":",
+    "http://www.w3.org/2001/XMLSchema#": "xsd",
+    "http://www.w3.org/2002/07/owl#": "owl",
+    "http://purl.obolibrary.org/obo/": "obo",
+    "http://purl.obolibrary.org/obo/so#": "so",
+    "https://biodatafuse.org/onto/bdf#": "bdfo",
+}
diff --git a/src/pyBiodatafuse/graph/rdf/nodes/gene_disease.py b/src/pyBiodatafuse/graph/rdf/nodes/gene_disease.py
@@ -1,16 +1,11 @@
-# gene_disease.py
-
 """Populate a BDF RDF graph with gene-disease relationship data."""
 
-
 from bioregistry import get_iri
 from rdflib import Graph, Literal, URIRef
 from rdflib.namespace import OWL, RDF, RDFS, XSD
 
 from pyBiodatafuse.constants import DISEASE_IDENTIFIER_TYPES, NODE_TYPES, PREDICATES
-from pyBiodatafuse.graph.rdf.metadata import (  # pylint: disable=no-name-in-module
-    add_data_source_node,
-)
+from pyBiodatafuse.graph.rdf.metadata import add_data_source_node
 
 
 def add_gene_disease_associations(
@@ -28,70 +23,44 @@ def add_gene_disease_associations(
     :param id_number: Unique identifier for the associations.
     :param source_idx: Source index for the associations.
     :param gene_node: URIRef of the gene node associated with the disease data.
-    :param disease_data: List of dictionaries containing disease association information.
+    :param disease_data: Dictionary containing disease association information.
     :param new_uris: Dictionary with updated project base URIs for the nodes.
-    :param i: the index of the row
+    :param i: The index of the row.
     """
-    data = disease_data
     gene_id = g.value(subject=gene_node, predicate=RDFS.label)
-    disease_node = add_disease_node(g, data)
-    if disease_node:
-        gene_disease_assoc_node = URIRef(
-            f"{new_uris['gene_disease_association']}/{id_number}{i}{source_idx}"
+    disease_node = add_disease_node(g, disease_data)
+    if not disease_node:
+        return
+
+    assoc_node = URIRef(f"{new_uris['gene_disease_association']}/{id_number}{i}{source_idx}")
+    g.add((assoc_node, RDF.type, URIRef(NODE_TYPES["gene_disease_association"])))
+    g.add((assoc_node, URIRef(PREDICATES["sio_refers_to"]), gene_node))
+    g.add((assoc_node, URIRef(PREDICATES["sio_refers_to"]), disease_node))
+    umlscui = disease_data.get("disease_umlscui", "")
+    score = disease_data.get("score", "")
+    if disease_data.get("score"):
+        score_node = add_score_node(
+            g,
+            id_number,
+            source_idx,
+            umlscui,
+            score,
+            new_uris,
+            i,
+            gene_id,
         )
-        g.add((gene_disease_assoc_node, RDF.type, URIRef(NODE_TYPES["gene_disease_association"])))
-        g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_refers_to"]), gene_node))
-        g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_refers_to"]), disease_node))
-        if data.get("score"):
-            disease_umlscui = disease_data.get("disease_umlscui", None)
-            if disease_umlscui:
-                score_node = add_score_node(
-                    g=g,
-                    id_number=id_number,
-                    source_idx=source_idx,
-                    score=data["score"],
-                    new_uris=new_uris,
-                    i=i,
-                    disease_id=disease_umlscui,
-                    gene_id=gene_id,
-                )
-                g.add(
-                    (
-                        gene_disease_assoc_node,
-                        URIRef(PREDICATES["sio_has_measurement_value"]),
-                        score_node,
-                    )
-                )
-        data_source_node = add_data_source_node(g, "DISGENET")
-        g.add((gene_disease_assoc_node, URIRef(PREDICATES["sio_has_source"]), data_source_node))
-
-
-def add_evidence_idx_node(
-    g: Graph, id_number: str, source_idx: str, disease_id: str, ei: float, new_uris: dict, i
-) -> URIRef:
-    """Create and add an evidence index (EI) node to the RDF graph.
+        g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), score_node))
 
-    :param g: RDF graph to which the EI node will be added.
-    :param id_number: Unique identifier for the EI node.
-    :param source_idx: Source index for the association.
-    :param disease_id: Disease identifier associated with the EI.
-    :param ei: Evidence index value for the gene-disease association.
-    :param new_uris: Dictionary with updated project base URIs for the nodes.
-    :param i: Index or iteration number used for node URI construction.
-    :return: URIRef for the created EI node.
-    """
-    evidence_idx_node = URIRef(
-        f"{new_uris['score_base_node']}/{id_number}{i}{source_idx}_{disease_id}"
-    )
-    g.add((evidence_idx_node, RDF.type, URIRef(NODE_TYPES["evidence_idx_node"])))
-    g.add(
-        (
-            evidence_idx_node,
-            URIRef(PREDICATES["sio_has_value"]),
-            Literal(ei, datatype=XSD.double),
-        )
-    )
-    return evidence_idx_node
+    if disease_data.get("ei"):
+        ei_node = add_evidence_node(g, id_number, source_idx, "ei", disease_data, new_uris, i)
+        g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), ei_node))
+
+    if disease_data.get("el"):
+        el_node = add_evidence_node(g, id_number, source_idx, "el", disease_data, new_uris, i)
+        g.add((assoc_node, URIRef(PREDICATES["sio_has_measurement_value"]), el_node))
+
+    data_source_node = add_data_source_node(g, "DISGENET")
+    g.add((assoc_node, URIRef(PREDICATES["sio_has_source"]), data_source_node))
 
 
 def add_disease_node(g: Graph, disease_data: dict) -> URIRef:
@@ -101,34 +70,25 @@ def add_disease_node(g: Graph, disease_data: dict) -> URIRef:
     :param disease_data: Dictionary containing disease information.
     :return: URIRef for the created disease node.
     """
-    # UMLS IRIs not in Bioregistry
-    disease_curie = disease_data.get("disease_umlscui")
-    if disease_curie is None:
-        disease_curie = disease_data.get("UMLS")
+    disease_curie = disease_data.get("disease_umlscui") or disease_data.get("UMLS")
+    if not disease_curie:
+        return None
+
     disease_iri = f"https://www.ncbi.nlm.nih.gov/medgen/{disease_curie}"
-    #  disease_data.get()
     disease_node = URIRef(disease_iri)
     g.add((disease_node, RDF.type, URIRef(NODE_TYPES["disease_node"])))
     g.add(
         (disease_node, RDFS.label, Literal(disease_data.get("disease_name"), datatype=XSD.string))
     )
-    for identifier_type in DISEASE_IDENTIFIER_TYPES:
-        curie_field = disease_data.get(identifier_type, None)
-        if curie_field and "ncit" not in curie_field:
-            curies = [curie_field]
-            if "," in (curie_field):
-                curies = [i for i in curie_field.split(", ")]
+
+    for identifier in DISEASE_IDENTIFIER_TYPES:
+        if disease_data.get(identifier):
+            curies = disease_data.get(identifier, "").split(", ")
             for curie in curies:
-                disease_source_iri = get_iri(curie)
-                if disease_source_iri is None:
-                    if ":" in curie:
-                        curie = curie.split(":")[1]
-                    disease_source_iri = get_iri("obo:" + curie)
-                    g.add(
-                        (disease_node, OWL.SameAs, URIRef(disease_source_iri))
-                    )  # Some of the data does not look like a skos:exactMatch
-                else:
-                    g.add((disease_node, OWL.sameAs, URIRef(disease_source_iri)))
+                source_iri = get_iri(curie) or get_iri(f"obo:{curie.split(':')[-1]}")
+                if source_iri:
+                    g.add((disease_node, OWL.sameAs, URIRef(source_iri)))
+
     return disease_node
 
 
@@ -151,18 +111,44 @@ def add_score_node(
     :param score: Score value for the gene-disease association.
     :param new_uris: Dictionary with updated project base URIs for the nodes.
     :param i: Index or iteration number used for node URI construction.
-    :param gene_id: String value of the gene ID
+    :param gene_id: String value of the gene ID.
     :return: URIRef for the created score node.
     """
     score_node = URIRef(
         f"{new_uris['score_base_node']}/{id_number}{i}{source_idx}_{disease_id}{gene_id}"
     )
     g.add((score_node, RDF.type, URIRef(NODE_TYPES["score_node"])))
-    g.add(
-        (
-            score_node,
-            URIRef(PREDICATES["sio_has_value"]),
-            Literal(score, datatype=XSD.double),
-        )
-    )
+    g.add((score_node, URIRef(PREDICATES["sio_has_value"]), Literal(score, datatype=XSD.double)))
     return score_node
+
+
+def add_evidence_node(
+    g: Graph,
+    id_number: str,
+    source_idx: str,
+    evidence_type: str,
+    disease_data: dict,
+    new_uris: dict,
+    i: int,
+) -> URIRef:
+    """Create and add an evidence node (EI or EL) to the RDF graph.
+
+    :param g: RDF graph to which the evidence node will be added.
+    :param id_number: Unique identifier for the evidence node.
+    :param source_idx: Source index for the association.
+    :param evidence_type: Type of evidence ("ei" or "el").
+    :param disease_data: Dictionary containing disease information.
+    :param new_uris: Dictionary with updated project base URIs for the nodes.
+    :param i: Index or iteration number used for node URI construction.
+    :return: URIRef for the created evidence node, or None if the value is absent.
+    """
+    value = disease_data.get(evidence_type)
+    if value is None:
+        return None
+
+    node = URIRef(
+        f"{new_uris['score_base_node']}/{evidence_type}/{id_number}{i}{source_idx}_{disease_data['disease_umlscui']}"
+    )
+    g.add((node, RDF.type, URIRef(NODE_TYPES[f"{evidence_type}_node"])))
+    g.add((node, URIRef(PREDICATES["sio_has_value"]), Literal(value, datatype=XSD.double)))
+    return node
diff --git a/src/pyBiodatafuse/graph/rdf/rdf.py b/src/pyBiodatafuse/graph/rdf/rdf.py
@@ -117,22 +117,29 @@ def generate_rdf(self, df: pd.DataFrame, metadata: dict, open_only: bool = False
             stringdb_data = row.get(STRING_PPI_COL, None)
             # molmedb_data = row.get("MolMeDB_transporter_inhibitor", None)
             # transporter_inhibited_data = row.get(MOLMEDB_COMPOUND_PROTEIN_COL, None)
+            # Initialize an empty list to store disease data
             disease_data = []
-            for source in [DISGENET_DISEASE_COL, OPENTARGETS_DISEASE_COL]:
-                if open_only and source == DISGENET_DISEASE_COL:
-                    continue
-                source_el = row.get(source)
-                if isinstance(source_el, list):
-                    disease_data += source_el
-
-            if (
-                pd.isna(source_idx)
-                or pd.isna(source_namespace)
-                or pd.isna(target_idx)
-                or pd.isna(target_namespace)
+
+            # Iterate over the specified data columns
+            for source_col in [DISGENET_DISEASE_COL, OPENTARGETS_DISEASE_COL]:
+                if open_only and source_col == DISGENET_DISEASE_COL:
+                    continue  # TODO fix open data only feature
+                # Extract the source data from the current row
+                source_data = row.get(source_col, [])
+                # Skip empty data (len() assumes a list/array, not None or NaN)
+                if len(source_data) > 0:
+                    disease_data.extend(source_data)
+            # Check for missing required indices or namespaces and skip the row if any are NaN
+            if any(
+                pd.isna(val) for val in [source_idx, source_namespace, target_idx, target_namespace]
             ):
                 continue
 
+            # NOTE(review): redundant repeat of the NaN check directly above; safe to remove
+            if any(
+                pd.isna(val) for val in [source_idx, source_namespace, target_idx, target_namespace]
+            ):
+                continue
             source_curie = normalize_curie(f"{source_namespace}:{source_idx}")
             target_curie = normalize_curie(f"{target_namespace}:{target_idx}")
 
@@ -307,12 +314,12 @@ def add_metadata(self, metadata):
         :param metadata: Metadata to be added.
         """
         add_metadata(
-            self,
-            self.version_iri,
-            self.author,
-            self.orcid,
-            metadata,
-            self.version_iri,
+            g=self,
+            graph_uri=self.version_iri,  # TODO fix
+            version_iri=self.version_iri,
+            author=self.author,
+            orcid=self.orcid,
+            metadata=metadata,
         )
 
     def shex(

diff --git a/src/pyBiodatafuse/graph/rdf/utils.py b/src/pyBiodatafuse/graph/rdf/utils.py
@@ -11,7 +11,7 @@
 from shexer.consts import SHACL_TURTLE, TURTLE
 from shexer.shaper import Shaper
 
-from pyBiodatafuse.constants import DATA_SOURCES, NAMESPACE_BINDINGS, NODE_TYPES
+from pyBiodatafuse.constants import DATA_SOURCES, NAMESPACE_BINDINGS, NAMESPACE_SHAPES, NODE_TYPES
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -106,27 +106,17 @@ def get_shapes(
     except AttributeError as exc:
         raise ValueError("graph_type must be a string.") from exc
 
-    # Default namespaces
-    namespaces_dict = {
-        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
-        "http://example.org/": "ex",
-        "http://weso.es/shapes/": ":",
-        "http://www.w3.org/2001/XMLSchema#": "xsd",
-        "http://www.w3.org/2002/07/owl#": "owl",
-        f"{base_uri}": "graph",
-        "http://purl.obolibrary.org/obo/": "obo",
-        "http://purl.obolibrary.org/obo/so#": "so",
-    }
-
+    # Default namespaces: NAMESPACE_SHAPES (shared module-level dict, mutated in place)
+    NAMESPACE_SHAPES[base_uri] = "graph"
     # Merge with additional namespaces if provided
     if additional_namespaces:
-        namespaces_dict.update(additional_namespaces)
+        NAMESPACE_SHAPES.update(additional_namespaces)
     # Initialize Shaper with the given graph and namespaces
     shaper = Shaper(
         all_classes_mode=True,
         rdflib_graph=g,
         input_format=TURTLE,
-        namespaces_dict=namespaces_dict,
+        namespaces_dict=NAMESPACE_SHAPES,
     )
     graph_result = None
     # Generate the appropriate graph (Shex or SHACL)