diff --git a/reasoner_transpiler/attributes.py b/reasoner_transpiler/attributes.py index 8fab67a..cf0b93b 100644 --- a/reasoner_transpiler/attributes.py +++ b/reasoner_transpiler/attributes.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from .biolink import bmt @@ -20,41 +21,63 @@ def get_attribute_types_from_config(): ATTRIBUTE_SKIP_LIST = [] -RESERVED_NODE_PROPS = [ - "id", - "name", - "labels", - "element_id" -] -RESERVED_EDGE_PROPS = [ - "id", - "predicate", - "object", - "subject", - "sources" -] - -# this should really be one representation or the other, or be configurable, -# but we have graphs with each now so temporarily (I hope, hope, hope) looking for both -EDGE_SOURCE_PROPS = [ - "aggregator_knowledge_source", - "primary_knowledge_source", - "biolink:aggregator_knowledge_source", - "biolink:primary_knowledge_source" -] - - -def transform_attributes(result_item, node=False): - - # make a list of attributes to ignore while processing - ignore_list = RESERVED_NODE_PROPS if node else EDGE_SOURCE_PROPS + RESERVED_EDGE_PROPS - ignore_list += ATTRIBUTE_SKIP_LIST +PRIMARY_KNOWLEDGE_SOURCE = "primary_knowledge_source" +AGGREGATOR_KNOWLEDGE_SOURCE = "aggregator_knowledge_source" + +PROVENANCE_TAG = os.environ.get('PROVENANCE_TAG', 'reasoner-transpiler') + + +# This function takes EDGE_SOURCE_PROPS properties from results, converts them into proper +# TRAPI dictionaries, and assigns the proper upstream ids to each resource. It does not currently attempt to avoid +# duplicate aggregator results, which shouldn't exist in the graphs. +def construct_sources_tree(primary_knowledge_source, aggregator_knowledge_sources): + + if not primary_knowledge_source: + return [{"resource_id": PROVENANCE_TAG, + "resource_role": "primary_knowledge_source"}] + + # set the primary knowledge source + formatted_sources = [{"resource_id": primary_knowledge_source, + "resource_role": "primary_knowledge_source"}] + + # walk through the aggregator lists and construct the chains of provenance + terminal_aggregators = set() + for aggregator_list in aggregator_knowledge_sources: + # each aggregator list should be in order, so we can deduce the upstream chains + last_aggregator = None + for aggregator_knowledge_source in aggregator_list: + formatted_sources.append({ + "resource_id": aggregator_knowledge_source, + "resource_role": "aggregator_knowledge_source", + "upstream_resource_ids": [last_aggregator] if last_aggregator else [primary_knowledge_source] + }) + last_aggregator = aggregator_knowledge_source + # store the last aggregator in the list, because this will be an upstream source for the plater one + terminal_aggregators.add(last_aggregator) + # add PROVENANCE_TAG as the most downstream aggregator, + # it will have as upstream either the primary ks or all of the furthest downstream aggregators if they exist + # this will be used by applications like Plater which need to append themselves as an aggregator + formatted_sources.append({ + "resource_id": PROVENANCE_TAG, + "resource_role": "aggregator_knowledge_source", + "upstream_resource_ids": list(terminal_aggregators) if terminal_aggregators else [primary_knowledge_source] + }) + return formatted_sources + + +def transform_attributes(result_entity, node=False): + + # construct a valid TRAPI entity to return in trapi_entity + trapi_entity = {} + + for attribute in ATTRIBUTE_SKIP_LIST: + result_entity.pop(attribute, None) # an "attributes" attribute in neo4j should be a list of json strings, # attempt to start the attributes section of transformed attributes with its contents, # here we are assuming the attributes in "attributes" are already valid trapi json_attributes = [] - json_attributes_attribute = result_item.pop('attributes', None) + json_attributes_attribute = result_entity.pop('attributes', None) if json_attributes_attribute: if isinstance(json_attributes_attribute, list): try: @@ -64,26 +87,33 @@ def transform_attributes(result_item, node=False): print(f'!!! JSONDecodeError while parsing attributes property, ignoring: {json_attributes_attribute}') else: print(f'!!! the attributes edge property should be a list, ignoring: {json_attributes_attribute}') - transformed_attributes = { - 'attributes': json_attributes - } + trapi_attributes = json_attributes + else: + trapi_attributes = [] + # if it's an edge handle provenance (sources) and qualifiers if not node: - # for edges, find and format attributes that are qualifiers - qualifiers = [key for key in result_item if key not in ignore_list - and bmt.is_qualifier(key)] - transformed_attributes['qualifiers'] = [ - {"qualifier_type_id": f"biolink:{key}", - "qualifier_value": value} - for key, value in result_item.items() if key in qualifiers - ] - else: - qualifiers = [] + # extract properties for provenance, construct the sources section + primary_knowledge_source = result_entity.pop(PRIMARY_KNOWLEDGE_SOURCE, None) + # get any properties that start with AGGREGATOR_KNOWLEDGE_SOURCE, this handles the possibility of edges + # with multiple aggregator knowledge source lists like aggregator_knowledge_source_2 + aggregator_knowledge_source_keys = [ks_attribute for ks_attribute in result_entity.keys() + if ks_attribute.startswith(AGGREGATOR_KNOWLEDGE_SOURCE)] + aggregator_knowledge_sources = [result_entity[key] for key in aggregator_knowledge_source_keys] + for ks_property in aggregator_knowledge_source_keys: + result_entity.pop(ks_property) + trapi_entity["sources"] = construct_sources_tree(primary_knowledge_source, aggregator_knowledge_sources) + + # find and format attributes that are qualifiers + qualifiers = [key for key in result_entity if bmt.is_qualifier(key)] + if qualifiers: + trapi_entity["qualifiers"] = [{"qualifier_type_id": f"biolink:{qualifier}", + "qualifier_value": result_entity.pop(qualifier)} + for qualifier in qualifiers] # for attributes that aren't in ATTRIBUTE_TYPES, see if they are valid biolink attributes # add them to ATTRIBUTE_TYPES, so we don't need to look again - for attribute in \ - [key for key in result_item.keys() if key not in ignore_list + qualifiers + list(ATTRIBUTE_TYPES.keys())]: + for attribute in [key for key in result_entity.keys() if key not in list(ATTRIBUTE_TYPES.keys())]: attribute_mapping = DEFAULT_ATTRIBUTE_TYPE bmt_element = bmt.get_element(attribute) if bmt_element: @@ -94,17 +124,18 @@ def transform_attributes(result_item, node=False): ATTRIBUTE_TYPES[attribute] = attribute_mapping # format the rest of the attributes, look up their attribute type and value type - transformed_attributes['attributes'].extend([ + trapi_attributes.extend([ {'original_attribute_name': key, 'value': value, # the following function will return # 'attribute_type_id': 'biolink-ified attribute type id' # 'value_type_id': 'biolink-ified value type id' **ATTRIBUTE_TYPES.get(key)} - for key, value in result_item.items() - if key not in ignore_list + qualifiers + for key, value in result_entity.items() ]) - return transformed_attributes + if trapi_attributes: + trapi_entity["attributes"] = trapi_attributes + return trapi_entity def set_custom_attribute_types(attribute_types: dict): diff --git a/reasoner_transpiler/cypher.py b/reasoner_transpiler/cypher.py index a3c3c21..fc5cf47 100644 --- a/reasoner_transpiler/cypher.py +++ b/reasoner_transpiler/cypher.py @@ -1,16 +1,12 @@ """Tools for compiling QGraph into Cypher query.""" -import os import json from collections import defaultdict -from .attributes import transform_attributes, EDGE_SOURCE_PROPS +from .attributes import transform_attributes, PROVENANCE_TAG from .matching import match_query -PROVENANCE_TAG = os.environ.get('PROVENANCE_TAG', 'reasoner-transpiler') - - def nest_op(operator, *args): """Generate a nested set of operations from a flat expression.""" if len(args) > 2: @@ -51,7 +47,8 @@ def assemble_results(qnodes, qedges, **kwargs): ]) if not edges_assemble: edges_assemble = '[]' - assemble_clause = f"WITH apoc.coll.toSet({nodes_assemble}) AS nodes, apoc.coll.toSet({edges_assemble}) AS edges, collect(DISTINCT [" + assemble_clause = f"WITH apoc.coll.toSet({nodes_assemble}) AS nodes, " \ + f"apoc.coll.toSet({edges_assemble}) AS edges, collect(DISTINCT [" if nodes: assemble_clause += ', '.join(nodes) @@ -117,9 +114,18 @@ def transform_result(cypher_record, nodes, edges, paths = unpack_bolt_record(cypher_record) - # Convert the list of unique result nodes from cypher results to dictionaries - # then convert them to TRAPI format, constructing the knowledge_graph["nodes"] section of the TRAPI response - kg_nodes = transform_nodes_list(nodes) + # Construct the knowledge_graph["nodes"] section of the TRAPI response + kg_nodes = {} + for cypher_node in nodes: + # Convert the list of unique result nodes from cypher results to dictionaries + node = convert_bolt_node_to_dict(cypher_node) + # Convert nodes to TRAPI format + # id, name, and labels are removed before transform_attributes + node_id = node.pop('id') + kg_nodes[node_id] = { + 'name': node.pop('name'), + 'categories': sorted(node.pop('labels'))} + kg_nodes[node_id].update(**transform_attributes(node, node=True)) # Convert the list of unique edges from cypher results to dictionaries # then convert them to TRAPI format, constructing the knowledge_graph["edges"] section of the TRAPI response. @@ -213,7 +219,8 @@ def transform_result(cypher_record, # Check to see if the edge has subclass edges that are connected to it subclass_edge_ids = [] superclass_node_ids = {} - for (subclass_subject_or_object, subclass_qedge_id, superclass_qnode_id) in qedges_with_attached_subclass_edges.get(qedge_id, []): + for (subclass_subject_or_object, subclass_qedge_id, superclass_qnode_id) in \ + qedges_with_attached_subclass_edges.get(qedge_id, []): # If so, check to see if there are results for it qedge, subclass_edge_element_ids = qedge_id_to_results[subclass_qedge_id] if subclass_edge_element_ids: @@ -297,17 +304,6 @@ def transform_result(cypher_record, return transformed_results -def transform_nodes_list(nodes): - kg_nodes = {} - for cypher_node in nodes: - node = convert_bolt_node_to_dict(cypher_node) - kg_nodes[node['id']] = { - 'name': node['name'], - 'categories': sorted(node.pop('labels')), - **transform_attributes(node, node=True)} - return kg_nodes - - def transform_edges_list(edges): # See convert_bolt_edge_to_dict() for details on the contents of edges, # it is a list of lists (which can also be lists), representing unique edges from the graph @@ -352,65 +348,10 @@ def transform_edges_list(edges): return kg_edges, element_id_to_edge_id -# This function takes EDGE_SOURCE_PROPS properties from results, converts them into proper -# TRAPI dictionaries, and assigns the proper upstream ids to each resource. It does not currently attempt to avoid -# duplicate aggregator results, which shouldn't exist in the graphs. -def construct_sources_tree(sources): - - # first find the primary knowledge source, there should always be one - primary_knowledge_source = None - formatted_sources = None - for resource_role, resource_id in sources: - if resource_role == "primary_knowledge_source": - primary_knowledge_source = resource_id - # add it to the formatted TRAPI output - formatted_sources = [{ - "resource_id": primary_knowledge_source, - "resource_role": "primary_knowledge_source" - }] - if not primary_knowledge_source: - # we could hard fail here, every edge should have a primary ks, but I haven't fixed all the tests yet - # raise KeyError(f'primary_knowledge_source missing from sources section of cypher results! ' - # f'sources: {sources}') - return [] - - # then find any aggregator lists - aggregator_list_sources = [] - for resource_role, resource_id in sources: - # this looks weird but the idea is that you could have a few parallel lists like: - # aggregator_knowledge_source, aggregator_knowledge_source_2, aggregator_knowledge_source_3 - if resource_role.startswith("aggregator_knowledge_source"): - aggregator_list_sources.append(resource_id) - # walk through the aggregator lists and construct the chains of provenance - terminal_aggregators = set() - for aggregator_list in aggregator_list_sources: - # each aggregator list should be in order, so we can deduce the upstream chains - last_aggregator = None - for aggregator_knowledge_source in aggregator_list: - formatted_sources.append({ - "resource_id": aggregator_knowledge_source, - "resource_role": "aggregator_knowledge_source", - "upstream_resource_ids": [last_aggregator] if last_aggregator else [primary_knowledge_source] - }) - last_aggregator = aggregator_knowledge_source - # store the last aggregator in the list, because this will be an upstream source for the plater one - terminal_aggregators.add(last_aggregator) - # add PROVENANCE_TAG as the most downstream aggregator, - # it will have as upstream either the primary ks or all of the furthest downstream aggregators if they exist - # this will be used by applications like Plater which need to append themselves as an aggregator - formatted_sources.append({ - "resource_id": PROVENANCE_TAG, - "resource_role": "aggregator_knowledge_source", - "upstream_resource_ids": list(terminal_aggregators) if terminal_aggregators else [primary_knowledge_source] - }) - return list(formatted_sources) - - def convert_bolt_node_to_dict(bolt_node): if not bolt_node: return None node = {key: value for key, value in bolt_node.items()} - # node['element_id'] = bolt_node.element_id node['labels'] = bolt_node.labels return node @@ -442,16 +383,10 @@ def convert_bolt_edge_to_trapi(bolt_edge): # edge_props - any other properties from the edge edge_props = {**bolt_edge[4]} - # get the id if there is one on the edge + # retrieve and remove the id if there is one on the edge edge_id = edge_props.pop('id', None) - # get properties matching EDGE_SOURCE_PROPS keys, remove biolink: if needed, - # then pass (key, value) tuples to construct_sources_tree for formatting, constructing the sources section - converted_edge['sources'] = construct_sources_tree([ - (edge_source_prop.removeprefix('biolink:'), edge_props.pop(edge_source_prop)) - for edge_source_prop in EDGE_SOURCE_PROPS if edge_source_prop in edge_props]) - - # convert all remaining attributes to TRAPI format, constructing the attributes section + # convert all remaining attributes to TRAPI format, constructing the attributes and sources sections converted_edge.update(transform_attributes(edge_props, node=False)) # return the edge id if there was one, and a TRAPI edge diff --git a/tests/initialize_db.py b/tests/initialize_db.py index 802c180..a775e9a 100755 --- a/tests/initialize_db.py +++ b/tests/initialize_db.py @@ -4,8 +4,9 @@ import logging import time +import neo4j.exceptions from neo4j import GraphDatabase -from neo4j.exceptions import ServiceUnavailable, DatabaseUnavailable +from neo4j.exceptions import ServiceUnavailable, DatabaseUnavailable, ClientError LOGGER = logging.getLogger(__name__) @@ -19,11 +20,9 @@ def get_driver(url): while True: try: driver = GraphDatabase.driver(url, auth=("neo4j", "plater_testing_pw")) - # make sure we can start and finish a session - with driver.session() as session: - session.run("SHOW PROCEDURES") + driver.verify_connectivity() return driver - except (OSError, ServiceUnavailable, DatabaseUnavailable) as err: + except (OSError, ServiceUnavailable, DatabaseUnavailable, ClientError) as err: if seconds >= 256: raise err LOGGER.error( @@ -53,6 +52,7 @@ def main(hash: str = None): "name: row.name, id: row.id" "}, apoc.convert.fromJsonMap(row.props))) YIELD node " "RETURN count(*)") + print(f'Nodes added: {result.single()["count(*)"]}') result.consume() # this looks like it doesn't do anything, but it's needed to throw errors if they occur result = session.run(f"LOAD CSV WITH HEADERS FROM \"{edge_file}\" " "AS edge " @@ -62,8 +62,10 @@ def main(hash: str = None): "apoc.map.merge({predicate: edge.predicate, id: edge.id}, " "apoc.convert.fromJsonMap(edge.props)), object) YIELD rel " "RETURN count(*)") - result.consume() # this looks like it doesn't do anything, but it's needed to throw errors if they occur + print(f'Edges added: {result.single()["count(*)"]}') + result.consume() # this looks like it doesn't do anything, but it's needed to throw errors if they occur + driver.close() LOGGER.info("Done. Neo4j is ready for testing.") diff --git a/tests/neo4j_csv/edges.csv b/tests/neo4j_csv/edges.csv index f0b001f..f2d5217 100755 --- a/tests/neo4j_csv/edges.csv +++ b/tests/neo4j_csv/edges.csv @@ -1,26 +1,27 @@ id,subject,predicate,object,props -t2d_isa_disease,MONDO:0005148,biolink:subclass_of,MONDO:0000001,"{}" -disease_superclass_edge,MONDO:0000001,biolink:subclass_of,MONDO:0000000,"{}" -t2dsubtype_is_a_t2d,MONDO:0014488,biolink:subclass_of,MONDO:0005148,"{}" -monogenic_diabetes_is_a_diabetes,MONDO:0015967,biolink:subclass_of,MONDO:0000001,"{}" -albuminaria_isa_phenotype,HP:0012592,biolink:subclass_of,HP:0000118,"{}" -anagliptin_treats_t2d,CHEBI:136043,biolink:treats,MONDO:0005148,"{\"fda_approved\": false}" -bezafibrate_treats_t2d,CHEBI:47612,biolink:treats,MONDO:0005148,"{\"fda_approved\": false}" -bezafibrate_treats_obesity,CHEBI:47612,biolink:treats,MONDO:0011122,"{\"fda_approved\": false}" -metformin_treats_t2d,CHEBI:6801,biolink:treats,MONDO:0005148,"{\"fda_approved\": true, \"primary_knowledge_source\": \"infores:test\", \"aggregator_knowledge_source\": [\"ctd\"]}" -metformin_treats_obesity,CHEBI:6801,biolink:treats,MONDO:0011122,"{}" -t2d_has_phenotype_increased_weight,MONDO:0005148,biolink:has_phenotype,HP:0004324,"{}" -t2d_has_phenotype_albuminaria,MONDO:0005148,biolink:has_phenotype,HP:0012592,"{}" -obesity_has_phenotype_increased_weight,MONDO:0011122,biolink:has_phenotype,HP:0004324,"{}" -CASP3_molecularly_interacts_with_CASP8,NCBIGene:836,biolink:molecularly_interacts_with,NCBIGene:841,"{\"publications\": [\"xxx\"]}" -metformin_increases_expression_of_CASP8,CHEBI:6801,biolink:increases_expression_of,NCBIGene:841,"{}" -metformin_increases_expression_of_CASP3,CHEBI:6801,biolink:increases_expression_of,NCBIGene:836,"{}" -t2d_associated_with_CASP8,MONDO:0005148,biolink:genetically_associated_with,NCBIGene:841,"{\"p_value\": 0.000007}" -t2d_associated_with_CASP3,MONDO:0005148,biolink:genetically_associated_with,NCBIGene:836,"{\"non_biolink_attribute\": \"xxx123\"}" -obesity_ga_CASP3,MONDO:0011122,biolink:genetically_associated_with,NCBIGene:836,"{}" -metformin_treats_carcinoma,CHEBI:6801,biolink:treats,MONDO:0004993,"{}" -carcinoma_associated_with_CASP8,MONDO:0004993,biolink:genetically_associated_with,NCBIGene:841,"{}" -carcinoma_associated_with_BRCA1,NCBIGene:672,biolink:gene_associated_with_condition,MONDO:0004993,"{\"publications\": [\"xxx\"], \"attributes\": [\"{\\\"attribute_type_id\\\": \\\"json_attribute_1\\\", \\\"value\\\": \\\"json_value_1\\\"}\", \"{\\\"attribute_type_id\\\": \\\"json_attribute_2\\\", \\\"value\\\": \\\"json_value_2\\\"}\", \"{\\\"attribute_type_id\\\": \\\"json_attribute_3\\\", \\\"value\\\": \\\"json_value_3\\\", \\\"attributes\\\": [{\\\"attribute_type_id\\\": \\\"nested_json_attribute_1\\\", \\\"value\\\": \\\"nested_json_value_1\\\"}] }\"] }" -t2d_invalid_predicate_albuminaria,MONDO:0005148,biolink:invalid_predicate,HP:0012592,"{}" -qualified_edge_single_qualifier,PUBCHEM.COMPOUND:5460341,biolink:affects,NCBIGene:283871,"{\"qualified_predicate\": \"biolink:causes\"}" -qualified_edge_multiple_qualifier,PUBCHEM.COMPOUND:5460341,biolink:affects,NCBIGene:283871,"{\"qualified_predicate\": \"biolink:causes\",\"object_aspect_qualifier\": \"activity\",\"object_direction_qualifier\": \"decreased\",\"primary_knowledge_source\": \"infores:ctd\"}}" \ No newline at end of file +t2d_isa_disease,MONDO:0005148,biolink:subclass_of,MONDO:0000001,"{\"primary_knowledge_source\": \"infores:test\"}" +disease_superclass_edge,MONDO:0000001,biolink:subclass_of,MONDO:0000000,"{\"primary_knowledge_source\": \"infores:test\"}" +t2dsubtype_is_a_t2d,MONDO:0014488,biolink:subclass_of,MONDO:0005148,"{\"primary_knowledge_source\": \"infores:test\"}" +monogenic_diabetes_is_a_diabetes,MONDO:0015967,biolink:subclass_of,MONDO:0000001,"{\"primary_knowledge_source\": \"infores:test\"}" +albuminaria_isa_phenotype,HP:0012592,biolink:subclass_of,HP:0000118,"{\"primary_knowledge_source\": \"infores:test\"}" +anagliptin_treats_t2d,CHEBI:136043,biolink:treats,MONDO:0005148,"{\"fda_approved\": false, \"primary_knowledge_source\": \"infores:test2\"}" +bezafibrate_treats_t2d,CHEBI:47612,biolink:treats,MONDO:0005148,"{\"fda_approved\": false, \"primary_knowledge_source\": \"infores:test\"}" +bezafibrate_treats_obesity,CHEBI:47612,biolink:treats,MONDO:0011122,"{\"fda_approved\": false, \"primary_knowledge_source\": \"infores:test\"}" +metformin_treats_t2d,CHEBI:6801,biolink:treats,MONDO:0005148,"{\"fda_approved\": true, \"primary_knowledge_source\": \"infores:test\", \"aggregator_knowledge_source\": [\"infores:ctd\"]}" +metformin_treats_obesity,CHEBI:6801,biolink:treats,MONDO:0011122,"{\"primary_knowledge_source\": \"infores:test\"}" +t2d_has_phenotype_increased_weight,MONDO:0005148,biolink:has_phenotype,HP:0004324,"{\"primary_knowledge_source\": \"infores:test\"}" +t2d_has_phenotype_albuminaria,MONDO:0005148,biolink:has_phenotype,HP:0012592,"{\"primary_knowledge_source\": \"infores:test\"}" +obesity_has_phenotype_increased_weight,MONDO:0011122,biolink:has_phenotype,HP:0004324,"{\"primary_knowledge_source\": \"infores:test\"}" +CASP3_molecularly_interacts_with_CASP8,NCBIGene:836,biolink:molecularly_interacts_with,NCBIGene:841,"{\"publications\": [\"xxx\"], \"primary_knowledge_source\": \"infores:test\"}" +metformin_increases_expression_of_CASP8,CHEBI:6801,biolink:increases_expression_of,NCBIGene:841,"{\"primary_knowledge_source\": \"infores:test\"}" +metformin_increases_expression_of_CASP3,CHEBI:6801,biolink:increases_expression_of,NCBIGene:836,"{\"primary_knowledge_source\": \"infores:test\"}" +t2d_associated_with_CASP8,MONDO:0005148,biolink:genetically_associated_with,NCBIGene:841,"{\"p_value\": 0.000007, \"primary_knowledge_source\": \"infores:test\"}" +t2d_associated_with_CASP3,MONDO:0005148,biolink:genetically_associated_with,NCBIGene:836,"{\"non_biolink_attribute\": \"xxx123\", \"primary_knowledge_source\": \"infores:test\"}" +obesity_ga_CASP3,MONDO:0011122,biolink:genetically_associated_with,NCBIGene:836,"{\"primary_knowledge_source\": \"infores:test\"}" +metformin_treats_carcinoma,CHEBI:6801,biolink:treats,MONDO:0004993,"{\"primary_knowledge_source\": \"infores:test\"}" +carcinoma_associated_with_CASP8,MONDO:0004993,biolink:genetically_associated_with,NCBIGene:841,"{\"primary_knowledge_source\": \"infores:test\"}" +carcinoma_associated_with_BRCA1,NCBIGene:672,biolink:gene_associated_with_condition,MONDO:0004993,"{\"primary_knowledge_source\": \"infores:test\", \"publications\": [\"xxx\"], \"attributes\": [\"{\\\"attribute_type_id\\\": \\\"json_attribute_1\\\", \\\"value\\\": \\\"json_value_1\\\"}\", \"{\\\"attribute_type_id\\\": \\\"json_attribute_2\\\", \\\"value\\\": \\\"json_value_2\\\"}\", \"{\\\"attribute_type_id\\\": \\\"json_attribute_3\\\", \\\"value\\\": \\\"json_value_3\\\", \\\"attributes\\\": [{\\\"attribute_type_id\\\": \\\"nested_json_attribute_1\\\", \\\"value\\\": \\\"nested_json_value_1\\\"}] }\"]}" +t2d_invalid_predicate_albuminaria,MONDO:0005148,biolink:invalid_predicate,HP:0012592,"{\"primary_knowledge_source\": \"infores:test\"}" +qualified_edge_single_qualifier,PUBCHEM.COMPOUND:5460341,biolink:affects,NCBIGene:283871,"{\"qualified_predicate\": \"biolink:causes\", \"primary_knowledge_source\": \"infores:test\"}" +qualified_edge_multiple_qualifier,PUBCHEM.COMPOUND:5460341,biolink:affects,NCBIGene:283871,"{\"qualified_predicate\": \"biolink:causes\", \"object_aspect_qualifier\": \"activity\", \"object_direction_qualifier\": \"decreased\",\"primary_knowledge_source\": \"infores:ctd\"}}" +invalid_provenance,MESH:D014612,biolink:causes,MONDO:0005260,"{\"bogus_knowledge_source\": \"internet\"}" diff --git a/tests/neo4j_csv/nodes.csv b/tests/neo4j_csv/nodes.csv index 1152c84..7fa40c2 100755 --- a/tests/neo4j_csv/nodes.csv +++ b/tests/neo4j_csv/nodes.csv @@ -16,4 +16,6 @@ NCBIGene:836,biolink:Gene,CASP3,"{\"length\": 277, \"chromosome\": \"4\"}" NCBIGene:841,biolink:Gene,CASP8,"{\"length\": 479, \"chromosome\": \"2\"}" NCBIGene:672,biolink:Gene,BRCA1,"{\"length\": 1863, \"chromosome\": \"17\"}" PUBCHEM.COMPOUND:5460341,biolink:SmallMolecule,Calcium,"" -NCBIGene:283871,biolink:Gene,PGP,"" \ No newline at end of file +NCBIGene:283871,biolink:Gene,PGP,"" +MESH:D014612,biolink:ComplexMolecularMixture,Vaccines,"" +MONDO:0005260,biolink:Disease,autism,"" \ No newline at end of file diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index 19f8e3d..348e195 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -14,7 +14,7 @@ def test_categories(neo4j_driver): "edges": dict(), } output = neo4j_driver.run(get_query(qgraph), convert_to_trapi=True, qgraph=qgraph) - assert len(output['results']) == 11 + assert len(output['results']) == 12 def test_empty(neo4j_driver): diff --git a/tests/test_props.py b/tests/test_props.py index 64c2bde..ce060e7 100644 --- a/tests/test_props.py +++ b/tests/test_props.py @@ -27,6 +27,7 @@ def test_numeric(neo4j_driver): node_1 = list(output["knowledge_graph"]["nodes"].values())[0] assert node_1["name"] == "CASP3" assert "length" not in node_1 + assert "element_id" not in node_1 def test_string(neo4j_driver): @@ -116,13 +117,12 @@ def test_publications(neo4j_driver): edges = output["knowledge_graph"]["edges"] assert len(edges) == 1 attributes = list(edges.values())[0]["attributes"] - assert len(attributes) == 1 - assert attributes[0] == { + assert any([attribute == { "original_attribute_name": "publications", "attribute_type_id": "biolink:publications", "value": ["xxx"], "value_type_id": "linkml:Uriorcurie" - } + } for attribute in attributes]) def test_empty_constraints(neo4j_driver): @@ -174,13 +174,12 @@ def test_valid_biolink_attribute_without_mapping(neo4j_driver): edge = list(edges.values())[0] assert "p_value" not in edge attributes = edge["attributes"] - assert len(attributes) == 1 - assert attributes[0] == { + assert any([attribute == { "original_attribute_name": "p_value", "attribute_type_id": "biolink:p_value", "value": 0.000007, "value_type_id": DEFAULT_ATTRIBUTE_TYPE['value_type_id'] - } + } for attribute in attributes]) reset_custom_attribute_types() @@ -204,14 +203,14 @@ def test_invalid_biolink_attribute_without_mapping(neo4j_driver): } output = neo4j_driver.run(get_query(qgraph), convert_to_trapi=True, qgraph=qgraph) edges = output["knowledge_graph"]["edges"] - attribute = list(edges.values())[0]["attributes"][0] + attributes = list(edges.values())[0]["attributes"] expected_attribute = { "original_attribute_name": "non_biolink_attribute", "attribute_type_id": DEFAULT_ATTRIBUTE_TYPE['attribute_type_id'], "value": "xxx123", "value_type_id": DEFAULT_ATTRIBUTE_TYPE['value_type_id'] } - assert attribute == expected_attribute + assert any([attribute == expected_attribute for attribute in attributes]) def test_json_attributes(neo4j_driver): @@ -279,7 +278,8 @@ def test_props_customization(neo4j_driver): edges = output["knowledge_graph"]["edges"] assert len(edges) == 1 attributes = list(edges.values())[0]["attributes"] - assert len(attributes) == 0 + assert not any([attribute["attribute_type_id"] == "biolink:publications" for attribute in attributes]) + assert not any([attribute["attribute_type_id"] == "publications" for attribute in attributes]) # reset the skip list set_custom_attribute_skip_list([]) @@ -295,11 +295,10 @@ def test_props_customization(neo4j_driver): edges = output["knowledge_graph"]["edges"] assert len(edges) == 1 attributes = list(edges.values())[0]["attributes"] - assert len(attributes) == 1 - assert attributes[0] == { + assert any([attribute == { "original_attribute_name": "publications", "attribute_type_id": "transpiler:custom_attribute_type", "value": ["xxx"], "value_type_id": "transpiler:custom_value_type" - } + } for attribute in attributes]) reset_custom_attribute_types() diff --git a/tests/test_sources.py b/tests/test_sources.py index ae8ea9b..27fe704 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -34,10 +34,41 @@ def test_primary_source(neo4j_driver): assert not any(['primary_knowledge_source' in edge for edge in output["knowledge_graph"]["edges"].values()]) assert edge_sources["metformin_treats_t2d"] == [ {'resource_id': 'infores:test', 'resource_role': 'primary_knowledge_source'}, - {'resource_id': 'ctd', 'resource_role': 'aggregator_knowledge_source', + {'resource_id': 'infores:ctd', 'resource_role': 'aggregator_knowledge_source', 'upstream_resource_ids': ['infores:test']}, {'resource_id': 'reasoner-transpiler', 'resource_role': 'aggregator_knowledge_source', - 'upstream_resource_ids': ['ctd']}] - # if the attributes are not set expect an empty sources list - assert edge_sources["bezafibrate_treats_t2d"] == [] - assert edge_sources["anagliptin_treats_t2d"] == [] + 'upstream_resource_ids': ['infores:ctd']}] + assert edge_sources["bezafibrate_treats_t2d"] == [ + {'resource_id': 'infores:test', 'resource_role': 'primary_knowledge_source'}, + {'resource_id': 'reasoner-transpiler', 'resource_role': 'aggregator_knowledge_source', + 'upstream_resource_ids': ['infores:test']}] + assert edge_sources["anagliptin_treats_t2d"] == [ + {'resource_id': 'infores:test2', 'resource_role': 'primary_knowledge_source'}, + {'resource_id': 'reasoner-transpiler', 'resource_role': 'aggregator_knowledge_source', + 'upstream_resource_ids': ['infores:test2']}] + + +# if there is no primary knowledge source on the edge, provenance will be just the transpiler/plater +def test_missing_primary_source(neo4j_driver): + qgraph = { + "nodes": { + "n0": { + "ids": ["MESH:D014612"] + }, + "n1": { + "categories": ["biolink:Disease"], + "ids": ["MONDO:0005260"] + } + + }, + "edges": { + "e0": { + "subject": "n0", + "object": "n1", + "predicate": "biolink:causes" + } + } + } + output = neo4j_driver.run(get_query(qgraph), convert_to_trapi=True, qgraph=qgraph) + edge_sources = output["knowledge_graph"]["edges"]["invalid_provenance"]["sources"] + assert edge_sources == [{'resource_id': 'reasoner-transpiler', 'resource_role': 'primary_knowledge_source'}]