Merge branch 'master' of https://github.com/biolink/ontobio
sierra-moxon committed Dec 9, 2024
2 parents c959fdc + 103cbcd commit 85dce88
Showing 7 changed files with 6,019 additions and 62 deletions.
6 changes: 3 additions & 3 deletions bin/README.md
@@ -16,9 +16,9 @@ alternatively, you can run the following commands to test the validate.produce c
Note: snapshot in the URLs below can be changed to any pipeline branch; it's listed here for ease of copy/paste.
```bash
poetry install
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset zfin ZFIN --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology..org/" --only-dataset zfin ZFIN --gpad-gpi-output-version 2.0
```
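
The three invocations above differ only in the `--only-dataset <dataset> <group>` pair; the same pattern works for any dataset listed in the go-site metadata. A hypothetical further example (the wb/WB pair is illustrative, not part of this commit):

```bash
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset wb WB --gpad-gpi-output-version 2.0
```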

To test whether a GAF file is valid (passes all the GORules):
88 changes: 60 additions & 28 deletions bin/validate.py
@@ -8,7 +8,8 @@
import urllib
import shutil
import logging
import traceback

from ontobio.model.association import GoAssociation
from ontobio.model.association import Curie, ExtensionUnit
from ontobio.io.entityparser import GpiParser
from ontobio.ontol_factory import OntologyFactory
@@ -26,7 +27,7 @@
from ontobio.validation import tools
from ontobio.validation import rules

from typing import Dict, Set
from typing import Dict, Set, List

# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)

@@ -342,7 +343,7 @@ def make_ttls(dataset, gaf_path, products, ontology_graph):

@tools.gzips
def make_gpads(dataset, gaf_path, products, ontology_graph,
noctua_gpad_file, paint_gaf_src, gpi, gpad_gpi_output_version):
noctua_gpad_file, paint_gaf_src, gpi, gpad_gpi_output_version) -> (List[GoAssociation], List[str]):
"""
Using the gaf files and the noctua gpad file, produce a gpad file that contains both kinds of annotations
without any loss.
@@ -355,74 +356,103 @@ def make_gpads(dataset, gaf_path, products, ontology_graph,
:param paint_gaf_src: The source of the paint gaf file
:param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
to gene annotations in GAF outputs.
:return: The path to the gpad file
:return: (The path to the gpad file, the headers from all the files that contributed to the final GPAD file)
"""
gpad_file_path = os.path.join(os.path.split(gaf_path)[0], f"{dataset}.gpad")

if not products["gpad"]:
return []
noctua_header = None
all_gaf_headers = None
noctua_associations = []
all_gaf_associations = []

# Open the file once and keep it open for all operations within this block
with open(gpad_file_path, "w") as outfile:
gpadwriter = GpadWriter(file=outfile, version=gpad_gpi_output_version)

# If there's a noctua gpad file, process it
headers = []
# If there's a noctua gpad file, process it, return the parsing Report so we can get its headers for
# the final file provenance
if noctua_gpad_file:
click.echo("Making noctua gpad products...{}".format(noctua_gpad_file))
click.echo("Making noctua gpad products...")
# Process noctua gpad file
process_noctua_gpad_file(noctua_gpad_file, gpadwriter, ontology_graph, gpi)

# Process the GAF file
process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src)
(noctua_associations, noctua_header) = process_noctua_gpad_file(noctua_gpad_file, ontology_graph)
headers.append(noctua_header)
# Process the GAF file, store the report object so we can get its headers for the final file provenance
(all_gaf_associations, all_gaf_headers) = process_gaf_file(gaf_path, ontology_graph, paint_gaf_src)

if noctua_header:
for header in noctua_header:
gpadwriter._write("!Header from source noctua GPAD file\n")
gpadwriter._write("!=================================\n")
gpadwriter._write(header)
if all_gaf_headers:
for header in all_gaf_headers:
gpadwriter._write("!Header from source GAF file(s)\n")
gpadwriter._write("!=================================\n")
for header_line in header:
gpadwriter._write(header_line+"\n")

click.echo("Wrote all headers for GPAD, now writing associations...")
if noctua_associations:
for assoc in noctua_associations:
gpadwriter.write_assoc(assoc)
if all_gaf_associations:
for assoc in all_gaf_associations:
gpadwriter.write_assoc(assoc)

# The file will be automatically closed here, after exiting the 'with' block
return [gpad_file_path]
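
A minimal usage sketch of the new make_gpads flow (dataset name and paths are hypothetical; assumes the ontology graph is loaded the same way the rest of this script does):

```python
from ontobio.ontol_factory import OntologyFactory

# Hypothetical inputs; make_gpads returns [] unless products["gpad"] is truthy.
ontology_graph = OntologyFactory().create("go")
gpad_paths = make_gpads(
    "mgi",                         # dataset name
    "groups/mgi/mgi.gaf",          # gaf_path; the .gpad is written beside it
    {"gpad": True},                # products flags
    ontology_graph,
    noctua_gpad_file="noctua_mgi.gpad",
    paint_gaf_src="paint_mgi.gaf",
    gpi="mgi.gpi",
    gpad_gpi_output_version="2.0",
)
```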


def process_noctua_gpad_file(noctua_gpad_file, gpadwriter, ontology_graph, gpi):
def process_noctua_gpad_file(noctua_gpad_file, ontology_graph) -> (List[GoAssociation], List[str]):
"""
Process a noctua gpad file and write the associations to the gpad writer.
:param noctua_gpad_file: The path to the noctua gpad file
:param gpadwriter: The gpad writer to write the associations to
:param ontology_graph: The ontology graph to use for parsing the associations
:param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
"""

processed_associations = []
with open(noctua_gpad_file) as nf:
lines = sum(1 for line in nf)
nf.seek(0) # Reset file pointer to the beginning after counting lines
gpadparser = GpadParser(config=assocparser.AssocParserConfig(ontology=ontology_graph,
paint=False,
rule_set="all"))

click.echo("Making noctua gpad products...")
with click.progressbar(iterable=gpadparser.association_generator(file=nf), length=lines) as associations:
for association in associations:
# If the association is an isoform annotation, convert it to a gene annotation
gpadwriter.write_assoc(association)
processed_associations.append(association)

return processed_associations, gpadparser.report.header
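
A sketch of consuming the refactored helper: it now parses and returns instead of writing, so the caller owns the writer (inputs hypothetical):

```python
noctua_assocs, noctua_header = process_noctua_gpad_file("noctua_mgi.gpad", ontology_graph)
print(f"{len(noctua_assocs)} associations, {len(noctua_header)} header lines")
```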


def process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src):
def process_gaf_file(gaf_path, ontology_graph, paint_gaf_src) -> (List[GoAssociation], List[str]):
"""
Process a gaf file and write the associations to the gpad writer.
:param gaf_path: The path to the gaf file
:param gpadwriter: The gpad writer to write the associations to
:param ontology_graph: The ontology graph to use for parsing the associations
:param paint_gaf_src: The source of the paint gaf file
:return: The headers from the various gaf files in a list of Report objects
"""
headers = []
associations = []
with open(gaf_path) as gf:
lines = sum(1 for line in gf)
gf.seek(0) # Reset file pointer to the beginning after counting lines
gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph,
paint=True,
rule_set="all"))
click.echo("Merging in source gaf to gpad product...")
with click.progressbar(iterable=gafparser.association_generator(file=gf), length=lines) as associations:
for association in associations:
gpadwriter.write_assoc(association)
with click.progressbar(iterable=gafparser.association_generator(file=gf), length=lines) as gaf_assocs:
for association in gaf_assocs:
associations.append(association)
headers.append(gafparser.report.header)

if paint_gaf_src is not None:
with open(paint_gaf_src) as pgf:
@@ -432,10 +462,12 @@ def process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src):
paint=True,
rule_set="all"))
click.echo("Merging in paint gaf to gpad product...")
with click.progressbar(iterable=gafparser.association_generator(file=pgf), length=lines) as associations:
for association in associations:
gpadwriter.write_assoc(association)
with click.progressbar(iterable=gafparser.association_generator(file=pgf), length=lines) as paint_assocs:
for association in paint_assocs:
associations.append(association)
headers.append(gafparser.report.header)

return associations, headers
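
Together the two helpers feed a single GpadWriter, mirroring make_gpads above; a condensed sketch under the same assumptions (hypothetical paths, string version as passed in from the CLI):

```python
gaf_assocs, gaf_headers = process_gaf_file("mgi.gaf", ontology_graph, paint_gaf_src=None)
with open("mgi.gpad", "w") as out:
    writer = GpadWriter(file=out, version="2.0")
    for header in gaf_headers:
        for header_line in header:
            writer._write(header_line + "\n")
    for assoc in gaf_assocs:
        writer.write_assoc(assoc)
```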

@tools.gzips
def produce_gpi(dataset, target_dir, gaf_path, ontology_graph, gpad_gpi_output_version):
@@ -626,7 +658,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
:param metadata_dir: The directory containing the metadata files
:param gpad: Produce GPAD files
:param gpad_gpi_output_version: The version of the GPAD and GPI files to produce
:param ttl: Produce TTL files
:param ttl: TTL files
:param target: The directory to put the files in
:param ontology: The ontology to use for validation
:param exclude: Datasets to exclude
@@ -662,7 +694,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
replace_existing_files=not skip_existing_files,
only_dataset=only_dataset)

click.echo("Downloaded GAF sources: {}".format(downloaded_gaf_sources))
click.echo("Downloaded GAF sources")
# extract the titles for the go rules, this is a dictionary comprehension
rule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules"))
goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
@@ -755,6 +787,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
ontology_graph, noctua_gpad_src, paint_gaf_src,
gpi, gpad_gpi_output_version)


end_gaf = mixin_a_dataset(valid_gaf, [noctua_metadata, paint_metadata],
group_metadata["id"], dataset, absolute_target,
ontology_graph, gpipaths=gpi_list, base_download_url=base_download_url,
@@ -766,8 +799,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
click.echo("Executing the isoform fixing step in validate.produce...")
# run the resulting gaf through one last parse and replace, to handle the isoforms
# see: https://github.com/geneontology/go-site/issues/2291
click.echo("path to end gaf _temp.gaf: {}".format(end_gaf))
click.echo(os.path)
click.echo("path to end gaf _temp.gaf")

click.echo(os.path.split(end_gaf)[0])
temp_output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}_temp.gaf".format(dataset))
68 changes: 39 additions & 29 deletions ontobio/obograph_util.py
@@ -43,17 +43,17 @@ def add_obograph_digraph(
Converts a single obograph to Digraph edges and adds to an existing networkx DiGraph
"""
digraph = self.digraph
logger.info("NODES: {}".format(len(og['nodes'])))
logger.info("NODES: {}".format(len(og.get('nodes', []))))

# if client passes an xref_graph we must parse metadata
# If client passes an xref_graph, we must parse metadata
if xref_graph is not None:
parse_meta = True

for node in og['nodes']:
is_obsolete = 'is_obsolete' in node and node['is_obsolete'] == 'true'
for node in og.get('nodes', []):
is_obsolete = node.get('is_obsolete') == 'true'
if is_obsolete:
continue
if node_type is not None and ('type' not in node or node['type'] != node_type):
if node_type is not None and node.get('type') != node_type:
continue
id = self.contract_uri(node['id'])
digraph.add_node(id, **node)
@@ -64,46 +64,56 @@
node['meta'] = {}
meta = self.transform_meta(node['meta'])
if xref_graph is not None and 'xrefs' in meta:
for x in meta['xrefs']:
xref_graph.add_edge(self.contract_uri(x['val']), id, source=id)
logger.info("EDGES: {}".format(len(og['edges'])))
for edge in og['edges']:
sub = self.contract_uri(edge['sub'])
obj = self.contract_uri(edge['obj'])
pred = self.contract_uri(edge['pred'])
for x in meta.get('xrefs', []):
xref_graph.add_edge(self.contract_uri(x.get('val')), id, source=id)

logger.info("EDGES: {}".format(len(og.get('edges', []))))
for edge in og.get('edges', []):
sub = self.contract_uri(edge.get('sub'))
obj = self.contract_uri(edge.get('obj'))
pred = self.contract_uri(edge.get('pred'))
pred = map_legacy_pred(pred)
if pred == 'is_a':
pred = 'subClassOf'
if predicates is None or pred in predicates:
meta = edge['meta'] if 'meta' in edge else {}
meta = edge.get('meta', {})
if reverse_edges:
digraph.add_edge(obj, sub, pred=pred, **meta)
else:
digraph.add_edge(sub, obj, pred=pred, **meta)

if 'equivalentNodesSets' in og:
nslist = og['equivalentNodesSets']
nslist = og.get('equivalentNodesSets', [])
logger.info("CLIQUES: {}".format(len(nslist)))
for ns in nslist:
equivNodeIds = ns['nodeIds']
for i in ns['nodeIds']:
equivNodeIds = ns.get('nodeIds', [])
for i in equivNodeIds:
ix = self.contract_uri(i)
for j in ns['nodeIds']:
for j in equivNodeIds:
if i != j:
jx = self.contract_uri(j)
digraph.add_edge(ix, jx, pred='equivalentTo')
if logical_definitions is not None and 'logicalDefinitionAxioms' in og:
for a in og['logicalDefinitionAxioms']:
ld = LogicalDefinition(self.contract_uri(a['definedClassId']),
[self.contract_uri(x) for x in a['genusIds']],
[(self.contract_uri(x['propertyId']),
self.contract_uri(x['fillerId'])) for x in a['restrictions'] if x is not None])
logical_definitions.append(ld)
if property_chain_axioms is not None and 'propertyChainAxioms' in og:
for a in og['propertyChainAxioms']:
pca = PropertyChainAxiom(predicate_id=self.contract_uri(a['predicateId']),
chain_predicate_ids=[self.contract_uri(x) for x in a['chainPredicateIds']])
property_chain_axioms.append(pca)

if logical_definitions is not None:
for a in og.get('logicalDefinitionAxioms', []):
defined_class_id = a.get('definedClassId')
genus_ids = [self.contract_uri(x) for x in a.get('genusIds', [])]
restrictions = [
(self.contract_uri(x.get('propertyId')), self.contract_uri(x.get('fillerId')))
for x in a.get('restrictions', []) if x is not None
]
if defined_class_id:
ld = LogicalDefinition(self.contract_uri(defined_class_id), genus_ids, restrictions)
logical_definitions.append(ld)

if property_chain_axioms is not None:
for a in og.get('propertyChainAxioms', []):
predicate_id = a.get('predicateId')
chain_predicate_ids = [self.contract_uri(x) for x in a.get('chainPredicateIds', [])]
if predicate_id:
pca = PropertyChainAxiom(predicate_id=self.contract_uri(predicate_id),
chain_predicate_ids=chain_predicate_ids)
property_chain_axioms.append(pca)
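
The changes above replace direct key indexing with .get() defaults, so sparse obographs (missing 'nodes', 'edges', or axiom lists) no longer raise KeyError. A self-contained sketch of the pattern on a hypothetical graph:

```python
# Hypothetical minimal obograph; any of these keys may be absent in real input.
og = {
    "nodes": [
        {"id": "http://purl.obolibrary.org/obo/GO_0008150", "lbl": "biological_process"},
        {"id": "http://purl.obolibrary.org/obo/GO_0009987", "is_obsolete": "true"},
    ],
    # note: no "edges" key at all
}

for node in og.get("nodes", []):
    if node.get("is_obsolete") == "true":   # tolerates a missing flag
        continue
    print(node["id"], node.get("lbl"))

for edge in og.get("edges", []):            # loop is silently empty when absent
    print(edge.get("sub"), edge.get("pred"), edge.get("obj"))
```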

def transform_meta(self, meta):
if 'basicPropertyValues' in meta:
