From 6e6bd3f9434abd7c1ac42dd3546c27e213b0ae74 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Tue, 7 May 2024 14:50:12 -0700 Subject: [PATCH 1/6] For #676 --- bin/ontobio-parse-assocs.py | 8 ++++++++ bin/validate.py | 26 +++++++++++++++++++++----- ontobio/io/assocparser.py | 2 ++ ontobio/io/qc.py | 15 +++++++++++++++ ontobio/validation/docs.py | 18 ++++++++++++++++++ tests/test_qc.py | 16 +++++++++++++++- 6 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 ontobio/validation/docs.py diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py index f74174da..6f6cdf14 100755 --- a/bin/ontobio-parse-assocs.py +++ b/bin/ontobio-parse-assocs.py @@ -30,6 +30,7 @@ from ontobio.io import gaference from ontobio.slimmer import get_minimal_subgraph from ontobio.validation import metadata +from ontobio.validation import docs import os import sys import json @@ -81,6 +82,8 @@ def main(): help="GPI file") parser.add_argument("-m", "--metadata_dir", type=dir_path, required=False, help="Path to metadata directory") + parser.add_argument("-d", "--docs_dir", type=dir_path, required=False, + help="Path to docs directory") parser.add_argument("-l", "--rule", action="append", required=None, default=[], dest="rule_set", help="Set of rules to be run. Default is no rules to be run, with the exception \ of gorule-0000027 and gorule-0000020. See command line documentation in the \ @@ -149,6 +152,10 @@ def main(): goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") + retracted_pubs = None + if args.docs_dir: + retracted_pubs = docs.retracted_pubs_set(os.path.abspath(args.docs_dir)) + # set configuration filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None config = assocparser.AssocParserConfig( @@ -164,6 +171,7 @@ def main(): gpi_authority_path=args.gpi, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, + retracted_pubs = retracted_pubs, rule_set=rule_set ) p = None diff --git a/bin/validate.py b/bin/validate.py index ed024015..0b03908c 100755 --- a/bin/validate.py +++ b/bin/validate.py @@ -30,10 +30,12 @@ from ontobio.io import gaference from ontobio.rdfgen import assoc_rdfgen from ontobio.rdfgen.gocamgen.gocam_builder import GoCamBuilder, AssocExtractor +from ontobio.validation import docs from ontobio.validation import metadata from ontobio.validation import tools from ontobio.validation import rules + from typing import Dict, Set # logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING) @@ -210,7 +212,7 @@ def create_parser(config, group, dataset, format="gaf"): """ @tools.gzips -def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None, goref_metadata=None, ref_species_metadata=None, db_entities=None, group_idspace=None, format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None, extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2", rule_set=assocparser.RuleSet.ALL): +def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None, goref_metadata=None, ref_species_metadata=None, retracted_pubs=None,db_entities=None, group_idspace=None, format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None, extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2", rule_set=assocparser.RuleSet.ALL): filtered_associations = open(os.path.join(os.path.split(source_gaf)[0], "{}_noiea.gaf".format(dataset)), "w") config = assocparser.AssocParserConfig( ontology=ontology_graph, @@ -221,6 +223,7 @@ def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, rule_metadata=rule_metadata, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, + retracted_pubs=retracted_pubs, entity_idspaces=db_entities, group_idspace=group_idspace, suppress_rule_reporting_tags=suppress_rule_reporting_tags, @@ -481,6 +484,7 @@ def cli(ctx, verbose): @click.pass_context @click.argument("group") @click.option("--metadata", "-m", "metadata_dir", type=click.Path(), required=True) +@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--gpad", default=False, is_flag=True) @click.option("--ttl", default=False, is_flag=True) @click.option("--target", "-t", type=click.Path(), required=True) @@ -493,7 +497,7 @@ def cli(ctx, verbose): @click.option("--only-dataset", default=None) @click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"])) @click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True) -def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set): +def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set, docs_dir): logger.info("Logging is verbose") products = { @@ -507,6 +511,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True) click.echo("Products will go in {}".format(absolute_target)) absolute_metadata = os.path.abspath(metadata_dir) + absolute_docs = os.path.abspath(docs_dir) group_metadata = metadata.dataset_metadata_file(absolute_metadata, group) click.echo("Loading ontology: {}...".format(ontology)) @@ -530,6 +535,8 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base db_entities = metadata.database_entities(absolute_metadata) group_ids = metadata.groups(absolute_metadata) extensions_constraints = metadata.extensions_constraints_file(absolute_metadata) + + retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) gaferences = None if gaferencer_file: @@ -549,6 +556,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base group=group, rule_metadata=rule_metadata, goref_metadata=goref_metadata, + retracted_pubs=retracted_pubs, ref_species_metadata=ref_species_metadata, db_entities=db_entities, group_idspace=group_ids, @@ -627,13 +635,16 @@ def gpad2gocams(ctx, gpad_path, gpi_path, target, ontology, ttl, modelstate): @click.argument("group") @click.argument("dataset") @click.option("--metadata", "-m", type=click.Path(), required=True) +@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--target", type=click.Path(), required=True) @click.option("--ontology", type=click.Path(), required=True) -def paint(group, dataset, metadata, target, ontology): +def paint(group, dataset, metadata, target, ontology, docs_dir): absolute_metadata = os.path.abspath(metadata) absolute_target = os.path.abspath(target) os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True) paint_metadata = metadata.dataset_metadata_file(absolute_metadata, "paint") + absolute_docs = os.path.abspath(docs_dir) + paint_src_gaf = check_and_download_mixin_source(paint_metadata, dataset, absolute_target) click.echo("Loading ontology: {}...".format(ontology)) @@ -641,17 +652,20 @@ def paint(group, dataset, metadata, target, ontology): gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset)) click.echo("Using GPI at {}".format(gpi_path)) - paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path) + retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) + paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path, retracted_pubs=retracted_pubs) @cli.command() @click.option("--metadata", "-m", "metadata_dir", type=click.Path(), required=True) +@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--out", type=click.Path(), required=False) @click.option("--ontology", type=click.Path(), required=True) @click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False, help="Path to Gaferencer output to be used for inferences") -def rule(metadata_dir, out, ontology, gaferencer_file): +def rule(metadata_dir, out, ontology, gaferencer_file, docs_dir): absolute_metadata = os.path.abspath(metadata_dir) + absolute_docs = os.path.abspath(docs_dir) click.echo("Loading ontology: {}...".format(ontology)) ontology_graph = OntologyFactory().create(ontology) @@ -659,6 +673,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file): goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) gorule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") + retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) click.echo("Found {} GO Rules".format(len(gorule_metadata.keys()))) @@ -673,6 +688,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file): ontology=ontology_graph, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, + retracted_pubs=retracted_pubs, entity_idspaces=db_entities, group_idspace=group_ids, annotation_inferences=gaferences, diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py index 8f00c72e..65d3fbc5 100644 --- a/ontobio/io/assocparser.py +++ b/ontobio/io/assocparser.py @@ -234,6 +234,7 @@ def __init__(self, ref_species_metadata=None, group_metadata=None, dbxrefs=None, + retracted_pubs=None, suppress_rule_reporting_tags=[], annotation_inferences=None, extensions_constraints=None, @@ -258,6 +259,7 @@ def __init__(self, self.goref_metadata = goref_metadata self.ref_species_metadata = ref_species_metadata self.group_metadata = group_metadata + self.retracted_pubs = retracted_pubs self.suppress_rule_reporting_tags = suppress_rule_reporting_tags self.annotation_inferences = annotation_inferences self.entity_idspaces = entity_idspaces diff --git a/ontobio/io/qc.py b/ontobio/io/qc.py index f15e73f0..786d41de 100644 --- a/ontobio/io/qc.py +++ b/ontobio/io/qc.py @@ -421,6 +421,20 @@ def test(self, annotation: association.GoAssociation, config: assocparser.AssocP return self._result(bool(withfrom)) else: return self._result(True) + +class GoRule22(GoRule): + + def __init__(self): + super().__init__("GORULE:0000022", "Check for, and filter, annotations made to retracted publications", FailMode.HARD) + + def test(self, annotation: association.GoAssociation, config: assocparser.AssocParserConfig, group=None) -> TestResult: + if config.retracted_pubs is not None: + references = annotation.evidence.has_supporting_reference + for ref in references: + ref = str(ref) + if ref in config.retracted_pubs: + return self._result(False) + return self._result(True) class GoRule26(GoRule): @@ -952,6 +966,7 @@ def test(self, annotation: association.GoAssociation, config: assocparser.AssocP "GoRule16": GoRule16(), "GoRule17": GoRule17(), "GoRule18": GoRule18(), + "GoRule22": GoRule22(), "GoRule26": GoRule26(), "GoRule28": GoRule28(), "GoRule29": GoRule29(), diff --git a/ontobio/validation/docs.py b/ontobio/validation/docs.py new file mode 100644 index 00000000..b693d0b9 --- /dev/null +++ b/ontobio/validation/docs.py @@ -0,0 +1,18 @@ +import click +import os + + +def retracted_pubs_set(abs_docs_path)->set[str]: + retracted_path = os.path.join(abs_docs_path, "europe-pmc-retracted.txt") + try: + retracted_pubs = set() + with open(retracted_path, "r") as f: + for line in f: + li=line.strip() + if not li.startswith("!"): + if "," in li: + li = li.partition(',')[0] + retracted_pubs.add(li) + return retracted_pubs + except Exception as e: + raise click.ClickException("Could not find or read {}: {}".format(retracted_path, str(e))) \ No newline at end of file diff --git a/tests/test_qc.py b/tests/test_qc.py index 90d92368..25fbfa63 100644 --- a/tests/test_qc.py +++ b/tests/test_qc.py @@ -354,6 +354,20 @@ def test_go_rule_18(): test_result = qc.GoRule18().test(assoc, all_rules_config()) assert test_result.result_type == qc.ResultType.PASS +def test_go_rule22(): + config = assocparser.AssocParserConfig( + ontology=ontology, + retracted_pubs={"RETRACTED:1234","PMID:37772366"}, + rule_set=assocparser.RuleSet.ALL + ) + assoc = make_annotation(goid="GO:1234567", evidence="IBA", references="PMID:12345").associations[0] + test_result = qc.GoRule22().test(assoc, config) + assert test_result.result_type == qc.ResultType.PASS + + assoc = make_annotation(goid="GO:1234567", evidence="IBA", references="PMID:37772366").associations[0] + test_result = qc.GoRule22().test(assoc, config) + assert test_result.result_type == qc.ResultType.ERROR + def test_go_rule26(): config = assocparser.AssocParserConfig( @@ -819,7 +833,7 @@ def test_all_rules(): assoc = gafparser.to_association(a).associations[0] test_results = qc.test_go_rules(assoc, config).all_results - assert len(test_results.keys()) == 26 + assert len(test_results.keys()) == 27 assert test_results[qc.GoRules.GoRule26.value].result_type == qc.ResultType.PASS assert test_results[qc.GoRules.GoRule29.value].result_type == qc.ResultType.PASS From 00aa34a4299012aed76fe0bf927ce99acc6335fe Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Wed, 8 May 2024 17:18:15 -0700 Subject: [PATCH 2/6] For #676 --- bin/ontobio-parse-assocs.py | 13 ++++--------- bin/validate.py | 29 +++++++++++------------------ ontobio/io/assocparser.py | 4 ++-- ontobio/io/qc.py | 4 ++-- ontobio/validation/docs.py | 18 ------------------ ontobio/validation/metadata.py | 18 +++++++++++++++++- tests/test_qc.py | 2 +- 7 files changed, 37 insertions(+), 51 deletions(-) delete mode 100644 ontobio/validation/docs.py diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py index 6f6cdf14..7b214bfb 100755 --- a/bin/ontobio-parse-assocs.py +++ b/bin/ontobio-parse-assocs.py @@ -30,7 +30,6 @@ from ontobio.io import gaference from ontobio.slimmer import get_minimal_subgraph from ontobio.validation import metadata -from ontobio.validation import docs import os import sys import json @@ -81,9 +80,7 @@ def main(): parser.add_argument("-g", "--gpi", type=str, required=False, default=None, help="GPI file") parser.add_argument("-m", "--metadata_dir", type=dir_path, required=False, - help="Path to metadata directory") - parser.add_argument("-d", "--docs_dir", type=dir_path, required=False, - help="Path to docs directory") + help="Path to metadata directory") parser.add_argument("-l", "--rule", action="append", required=None, default=[], dest="rule_set", help="Set of rules to be run. Default is no rules to be run, with the exception \ of gorule-0000027 and gorule-0000020. See command line documentation in the \ @@ -147,14 +144,12 @@ def main(): goref_metadata = None ref_species_metadata = None + retracted_pub_set = None if args.metadata_dir: absolute_metadata = os.path.abspath(args.metadata_dir) goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") - - retracted_pubs = None - if args.docs_dir: - retracted_pubs = docs.retracted_pubs_set(os.path.abspath(args.docs_dir)) + retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) # set configuration filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None @@ -171,7 +166,7 @@ def main(): gpi_authority_path=args.gpi, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pubs = retracted_pubs, + retracted_pub_set=retracted_pub_set, rule_set=rule_set ) p = None diff --git a/bin/validate.py b/bin/validate.py index 0b03908c..2b1e8d89 100755 --- a/bin/validate.py +++ b/bin/validate.py @@ -30,7 +30,6 @@ from ontobio.io import gaference from ontobio.rdfgen import assoc_rdfgen from ontobio.rdfgen.gocamgen.gocam_builder import GoCamBuilder, AssocExtractor -from ontobio.validation import docs from ontobio.validation import metadata from ontobio.validation import tools from ontobio.validation import rules @@ -212,7 +211,7 @@ def create_parser(config, group, dataset, format="gaf"): """ @tools.gzips -def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None, goref_metadata=None, ref_species_metadata=None, retracted_pubs=None,db_entities=None, group_idspace=None, format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None, extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2", rule_set=assocparser.RuleSet.ALL): +def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, group="unknown", rule_metadata=None, goref_metadata=None, ref_species_metadata=None, retracted_pub_set=None,db_entities=None, group_idspace=None, format="gaf", suppress_rule_reporting_tags=[], annotation_inferences=None, group_metadata=None, extensions_constraints=None, rule_contexts=[], gaf_output_version="2.2", rule_set=assocparser.RuleSet.ALL): filtered_associations = open(os.path.join(os.path.split(source_gaf)[0], "{}_noiea.gaf".format(dataset)), "w") config = assocparser.AssocParserConfig( ontology=ontology_graph, @@ -223,7 +222,7 @@ def produce_gaf(dataset, source_gaf, ontology_graph, gpipaths=None, paint=False, rule_metadata=rule_metadata, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pubs=retracted_pubs, + retracted_pub_set=retracted_pub_set, entity_idspaces=db_entities, group_idspace=group_idspace, suppress_rule_reporting_tags=suppress_rule_reporting_tags, @@ -484,7 +483,6 @@ def cli(ctx, verbose): @click.pass_context @click.argument("group") @click.option("--metadata", "-m", "metadata_dir", type=click.Path(), required=True) -@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--gpad", default=False, is_flag=True) @click.option("--ttl", default=False, is_flag=True) @click.option("--target", "-t", type=click.Path(), required=True) @@ -497,7 +495,7 @@ def cli(ctx, verbose): @click.option("--only-dataset", default=None) @click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"])) @click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True) -def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set, docs_dir): +def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set): logger.info("Logging is verbose") products = { @@ -511,7 +509,6 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True) click.echo("Products will go in {}".format(absolute_target)) absolute_metadata = os.path.abspath(metadata_dir) - absolute_docs = os.path.abspath(docs_dir) group_metadata = metadata.dataset_metadata_file(absolute_metadata, group) click.echo("Loading ontology: {}...".format(ontology)) @@ -536,7 +533,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base group_ids = metadata.groups(absolute_metadata) extensions_constraints = metadata.extensions_constraints_file(absolute_metadata) - retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) + retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) gaferences = None if gaferencer_file: @@ -556,8 +553,8 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base group=group, rule_metadata=rule_metadata, goref_metadata=goref_metadata, - retracted_pubs=retracted_pubs, ref_species_metadata=ref_species_metadata, + retracted_pub_set=retracted_pub_set, db_entities=db_entities, group_idspace=group_ids, suppress_rule_reporting_tags=suppress_rule_reporting_tag, @@ -635,15 +632,13 @@ def gpad2gocams(ctx, gpad_path, gpi_path, target, ontology, ttl, modelstate): @click.argument("group") @click.argument("dataset") @click.option("--metadata", "-m", type=click.Path(), required=True) -@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--target", type=click.Path(), required=True) @click.option("--ontology", type=click.Path(), required=True) -def paint(group, dataset, metadata, target, ontology, docs_dir): +def paint(group, dataset, metadata, target, ontology): absolute_metadata = os.path.abspath(metadata) absolute_target = os.path.abspath(target) os.makedirs(os.path.join(absolute_target, "groups"), exist_ok=True) paint_metadata = metadata.dataset_metadata_file(absolute_metadata, "paint") - absolute_docs = os.path.abspath(docs_dir) paint_src_gaf = check_and_download_mixin_source(paint_metadata, dataset, absolute_target) @@ -652,20 +647,18 @@ def paint(group, dataset, metadata, target, ontology, docs_dir): gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset)) click.echo("Using GPI at {}".format(gpi_path)) - retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) - paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path, retracted_pubs=retracted_pubs) + retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) + paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path, retracted_pub_set=retracted_pub_set) @cli.command() @click.option("--metadata", "-m", "metadata_dir", type=click.Path(), required=True) -@click.option("--docs", "-d", "docs_dir", type=click.Path(), required=True) @click.option("--out", type=click.Path(), required=False) @click.option("--ontology", type=click.Path(), required=True) @click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False, help="Path to Gaferencer output to be used for inferences") -def rule(metadata_dir, out, ontology, gaferencer_file, docs_dir): +def rule(metadata_dir, out, ontology, gaferencer_file): absolute_metadata = os.path.abspath(metadata_dir) - absolute_docs = os.path.abspath(docs_dir) click.echo("Loading ontology: {}...".format(ontology)) ontology_graph = OntologyFactory().create(ontology) @@ -673,7 +666,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file, docs_dir): goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) gorule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") - retracted_pubs = docs.retracted_pubs_set(os.path.abspath(absolute_docs)) + retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) click.echo("Found {} GO Rules".format(len(gorule_metadata.keys()))) @@ -688,7 +681,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file, docs_dir): ontology=ontology_graph, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pubs=retracted_pubs, + retracted_pub_set=retracted_pub_set, entity_idspaces=db_entities, group_idspace=group_ids, annotation_inferences=gaferences, diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py index 65d3fbc5..bf0a490e 100644 --- a/ontobio/io/assocparser.py +++ b/ontobio/io/assocparser.py @@ -234,7 +234,7 @@ def __init__(self, ref_species_metadata=None, group_metadata=None, dbxrefs=None, - retracted_pubs=None, + retracted_pub_set=None, suppress_rule_reporting_tags=[], annotation_inferences=None, extensions_constraints=None, @@ -259,7 +259,7 @@ def __init__(self, self.goref_metadata = goref_metadata self.ref_species_metadata = ref_species_metadata self.group_metadata = group_metadata - self.retracted_pubs = retracted_pubs + self.retracted_pub_set = retracted_pub_set self.suppress_rule_reporting_tags = suppress_rule_reporting_tags self.annotation_inferences = annotation_inferences self.entity_idspaces = entity_idspaces diff --git a/ontobio/io/qc.py b/ontobio/io/qc.py index 786d41de..f08668a2 100644 --- a/ontobio/io/qc.py +++ b/ontobio/io/qc.py @@ -428,11 +428,11 @@ def __init__(self): super().__init__("GORULE:0000022", "Check for, and filter, annotations made to retracted publications", FailMode.HARD) def test(self, annotation: association.GoAssociation, config: assocparser.AssocParserConfig, group=None) -> TestResult: - if config.retracted_pubs is not None: + if config.retracted_pub_set is not None: references = annotation.evidence.has_supporting_reference for ref in references: ref = str(ref) - if ref in config.retracted_pubs: + if ref in config.retracted_pub_set: return self._result(False) return self._result(True) diff --git a/ontobio/validation/docs.py b/ontobio/validation/docs.py deleted file mode 100644 index b693d0b9..00000000 --- a/ontobio/validation/docs.py +++ /dev/null @@ -1,18 +0,0 @@ -import click -import os - - -def retracted_pubs_set(abs_docs_path)->set[str]: - retracted_path = os.path.join(abs_docs_path, "europe-pmc-retracted.txt") - try: - retracted_pubs = set() - with open(retracted_path, "r") as f: - for line in f: - li=line.strip() - if not li.startswith("!"): - if "," in li: - li = li.partition(',')[0] - retracted_pubs.add(li) - return retracted_pubs - except Exception as e: - raise click.ClickException("Could not find or read {}: {}".format(retracted_path, str(e))) \ No newline at end of file diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py index 8b370635..9a164d13 100644 --- a/ontobio/validation/metadata.py +++ b/ontobio/validation/metadata.py @@ -182,7 +182,23 @@ def yaml_set(metadata, yaml_file_name, field) -> Set[str]: except Exception as e: raise click.ClickException("Could not find or read {}: {}".format(yaml_path, str(e))) - return set([yaml[field] for yaml in yaml_list]) + return set([yaml[field] for yaml in yaml_list]) + + +def retracted_pub_set(metadata)->set[str]: + retracted_path = os.path.join(metadata, "retracted.txt") + try: + retracted_pubs = set() + with open(retracted_path, "r") as f: + for line in f: + li=line.strip() + if not li.startswith("!"): + if "," in li: + li = li.partition(',')[0] + retracted_pubs.add(li) + return retracted_pubs + except Exception as e: + raise click.ClickException("Could not find or read {}: {}".format(retracted_path, str(e))) diff --git a/tests/test_qc.py b/tests/test_qc.py index 25fbfa63..ef4bd800 100644 --- a/tests/test_qc.py +++ b/tests/test_qc.py @@ -357,7 +357,7 @@ def test_go_rule_18(): def test_go_rule22(): config = assocparser.AssocParserConfig( ontology=ontology, - retracted_pubs={"RETRACTED:1234","PMID:37772366"}, + retracted_pub_set={"RETRACTED:1234","PMID:37772366"}, rule_set=assocparser.RuleSet.ALL ) assoc = make_annotation(goid="GO:1234567", evidence="IBA", references="PMID:12345").associations[0] From 468b7b7e8100e004a52a09b29eeb8dde5ed8bc35 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Thu, 9 May 2024 13:44:34 -0700 Subject: [PATCH 3/6] For #676 --- ontobio/validation/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py index 9a164d13..28d7dbba 100644 --- a/ontobio/validation/metadata.py +++ b/ontobio/validation/metadata.py @@ -186,7 +186,7 @@ def yaml_set(metadata, yaml_file_name, field) -> Set[str]: def retracted_pub_set(metadata)->set[str]: - retracted_path = os.path.join(metadata, "retracted.txt") + retracted_path = os.path.join(metadata, "retracted-publications.txt") try: retracted_pubs = set() with open(retracted_path, "r") as f: From 14919f2459ebd6becea90826ea3cfea077781473 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Fri, 10 May 2024 15:20:38 -0700 Subject: [PATCH 4/6] For #676 --- bin/ontobio-parse-assocs.py | 12 +++++++++--- bin/validate.py | 22 ++++++++++++---------- ontobio/validation/metadata.py | 19 ++++++++++++++----- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py index 7b214bfb..b465af85 100755 --- a/bin/ontobio-parse-assocs.py +++ b/bin/ontobio-parse-assocs.py @@ -81,6 +81,8 @@ def main(): help="GPI file") parser.add_argument("-m", "--metadata_dir", type=dir_path, required=False, help="Path to metadata directory") + parser.add_argument("--retracted_pub_set", type=argparse.FileType('r'), required=False, + help="Path to retracted publications file") parser.add_argument("-l", "--rule", action="append", required=None, default=[], dest="rule_set", help="Set of rules to be run. Default is no rules to be run, with the exception \ of gorule-0000027 and gorule-0000020. See command line documentation in the \ @@ -143,13 +145,17 @@ def main(): rule_set = assocparser.RuleSet.ALL goref_metadata = None - ref_species_metadata = None - retracted_pub_set = None + ref_species_metadata = None if args.metadata_dir: absolute_metadata = os.path.abspath(args.metadata_dir) goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") - retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) + + retracted_pub_set = None + if args.retracted_pub_set: + retracted_pub_set = metadata.retracted_pub_set(args.retracted_pub_set.name) + elif args.metadata_dir: + retracted_pub_set = metadata.retracted_pub_set_from_meta(absolute_metadata) # set configuration filtered_evidence_file = open(args.filtered_file, "w") if args.filtered_file else None diff --git a/bin/validate.py b/bin/validate.py index 2b1e8d89..1bcee133 100755 --- a/bin/validate.py +++ b/bin/validate.py @@ -495,7 +495,8 @@ def cli(ctx, verbose): @click.option("--only-dataset", default=None) @click.option("--gaf-output-version", default="2.2", type=click.Choice(["2.1", "2.2"])) @click.option("--rule-set", "-l", "rule_set", default=[assocparser.RuleSet.ALL], multiple=True) -def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set): +@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file") +def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base_download_url, suppress_rule_reporting_tag, skip_existing_files, gaferencer_file, only_dataset, gaf_output_version, rule_set, retracted_pub_set): logger.info("Logging is verbose") products = { @@ -531,9 +532,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base db_entities = metadata.database_entities(absolute_metadata) group_ids = metadata.groups(absolute_metadata) - extensions_constraints = metadata.extensions_constraints_file(absolute_metadata) - - retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) + extensions_constraints = metadata.extensions_constraints_file(absolute_metadata) gaferences = None if gaferencer_file: @@ -543,6 +542,12 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base if rule_set == (assocparser.RuleSet.ALL,): rule_set = assocparser.RuleSet.ALL + retracted_pubs = None + if retracted_pub_set: + retracted_pubs = metadata.retracted_pub_set(retracted_pub_set) + else: + retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata) + for dataset_metadata, source_gaf in downloaded_gaf_sources: dataset = dataset_metadata["dataset"] # Set paint to True when the group is "paint". @@ -554,7 +559,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base rule_metadata=rule_metadata, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pub_set=retracted_pub_set, + retracted_pub_set=retracted_pubs, db_entities=db_entities, group_idspace=group_ids, suppress_rule_reporting_tags=suppress_rule_reporting_tag, @@ -646,9 +651,8 @@ def paint(group, dataset, metadata, target, ontology): ontology_graph = OntologyFactory().create(ontology) gpi_path = os.path.join(absolute_target, "groups", dataset, "{}.gpi".format(dataset)) - click.echo("Using GPI at {}".format(gpi_path)) - retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) - paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path, retracted_pub_set=retracted_pub_set) + click.echo("Using GPI at {}".format(gpi_path)) + paint_gaf = produce_gaf("paint_{}".format(dataset), paint_src_gaf, ontology_graph, gpipath=gpi_path) @cli.command() @@ -666,7 +670,6 @@ def rule(metadata_dir, out, ontology, gaferencer_file): goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) gorule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") - retracted_pub_set = metadata.retracted_pub_set(absolute_metadata) click.echo("Found {} GO Rules".format(len(gorule_metadata.keys()))) @@ -681,7 +684,6 @@ def rule(metadata_dir, out, ontology, gaferencer_file): ontology=ontology_graph, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pub_set=retracted_pub_set, entity_idspaces=db_entities, group_idspace=group_ids, annotation_inferences=gaferences, diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py index 28d7dbba..b5be0c1d 100644 --- a/ontobio/validation/metadata.py +++ b/ontobio/validation/metadata.py @@ -184,12 +184,21 @@ def yaml_set(metadata, yaml_file_name, field) -> Set[str]: return set([yaml[field] for yaml in yaml_list]) - -def retracted_pub_set(metadata)->set[str]: +def retracted_pub_set_from_meta(metadata) -> Set: retracted_path = os.path.join(metadata, "retracted-publications.txt") + if os.access(retracted_path, os.R_OK): + return retracted_pub_set_use_abspath(retracted_path) + else: + return set() + +def retracted_pub_set(abspath_retracted_file) -> Set: + return retracted_pub_set_use_abspath(os.path.abspath(abspath_retracted_file)) + +def retracted_pub_set_use_abspath(abspath_retracted_file) -> Set: try: - retracted_pubs = set() - with open(retracted_path, "r") as f: + retracted_pubs = None + with open(abspath_retracted_file, "r") as f: + retracted_pubs = set() for line in f: li=line.strip() if not li.startswith("!"): @@ -198,7 +207,7 @@ def retracted_pub_set(metadata)->set[str]: retracted_pubs.add(li) return retracted_pubs except Exception as e: - raise click.ClickException("Could not find or read {}: {}".format(retracted_path, str(e))) + raise click.ClickException("Could not find or read {}: {}".format(abspath_retracted_file, str(e))) From cd830b98ae030b001f38620612954c38ed44cafd Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Fri, 10 May 2024 16:42:21 -0700 Subject: [PATCH 5/6] For #676 --- bin/validate.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/validate.py b/bin/validate.py index 1bcee133..25cd0f83 100755 --- a/bin/validate.py +++ b/bin/validate.py @@ -559,7 +559,7 @@ def produce(ctx, group, metadata_dir, gpad, ttl, target, ontology, exclude, base rule_metadata=rule_metadata, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, - retracted_pub_set=retracted_pubs, + retracted_pub_set=retracted_pubs, db_entities=db_entities, group_idspace=group_ids, suppress_rule_reporting_tags=suppress_rule_reporting_tag, @@ -661,7 +661,8 @@ def paint(group, dataset, metadata, target, ontology): @click.option("--ontology", type=click.Path(), required=True) @click.option("--gaferencer-file", "-I", type=click.Path(exists=True), default=None, required=False, help="Path to Gaferencer output to be used for inferences") -def rule(metadata_dir, out, ontology, gaferencer_file): +@click.option("--retracted_pub_set", type=click.Path(exists=True), default=None, required=False, help="Path to retracted publications file") +def rule(metadata_dir, out, ontology, gaferencer_file, retracted_pub_set): absolute_metadata = os.path.abspath(metadata_dir) click.echo("Loading ontology: {}...".format(ontology)) @@ -670,6 +671,12 @@ def rule(metadata_dir, out, ontology, gaferencer_file): goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) gorule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") + retracted_pubs = None + if retracted_pub_set: + retracted_pubs = metadata.retracted_pub_set(retracted_pub_set) + else: + retracted_pubs = metadata.retracted_pub_set_from_meta(absolute_metadata) + click.echo("Found {} GO Rules".format(len(gorule_metadata.keys()))) @@ -684,6 +691,7 @@ def rule(metadata_dir, out, ontology, gaferencer_file): ontology=ontology_graph, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, + retracted_pub_set=retracted_pubs, entity_idspaces=db_entities, group_idspace=group_ids, annotation_inferences=gaferences, From 44ad77a8f674159d8f00a6ef43ab802feb5c08e9 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Thu, 16 May 2024 10:36:47 -0700 Subject: [PATCH 6/6] Upgrade to v2.8.25 --- ontobio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ontobio/__init__.py b/ontobio/__init__.py index e76ee76b..430adbb8 100644 --- a/ontobio/__init__.py +++ b/ontobio/__init__.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -__version__ = '2.8.24' +__version__ = '2.8.25' from .ontol_factory import OntologyFactory