merge main

bokulich-lab · Oct 9, 2024 · 2aab907 · 2aab907
2 parents 4f76e2e + 8ad824c
commit 2aab907
Show file tree

Hide file tree

Showing 11 changed files with 283 additions and 277 deletions.
diff --git a/q2_amrfinderplus/annotate.py b/q2_amrfinderplus/annotate.py
@@ -1,3 +1,4 @@
+import os
 from typing import Union
 
 from q2_types.feature_data_mag import MAGSequencesDirFmt
@@ -58,10 +59,9 @@ def annotate(
     )
 
     # Set up common parameters for _run_amrfinderplus_analyse
-    common_params = locals().copy()
-    del common_params["sequences"]
-    del common_params["proteins"]
-    del common_params["loci"]
+    common_params = {
+        k: v for k, v in locals().items() if k not in ("sequences", "proteins", "loci")
+    }
 
     # Init output formats
     amr_annotations = AMRFinderPlusAnnotationsDirFmt()
@@ -86,27 +86,30 @@ def annotate(
             sample_id,
         )
 
-        for id, file_fp in files_dict.items():
+        for _id, file_fp in files_dict.items():
             # Construct and validate file input paths for amrfinderplus
             dna_path, protein_path, gff_path = _get_file_paths(
                 sequences,
                 proteins,
                 loci,
-                id,
+                _id,
                 file_fp,
                 sample_id,
             )
 
             # Define paths for output files
-            amr_annotations_path = (
-                amr_annotations.path / sample_id / f"{id}_amr_annotations.tsv"
+            amr_annotations_path = os.path.join(
+                str(amr_annotations), sample_id, f"{_id}_amr_annotations.tsv"
             )
-            amr_genes_path = amr_genes.path / sample_id / f"{id}_amr_genes.fasta"
-            amr_proteins_path = (
-                amr_proteins.path / sample_id / f"{id}_amr_proteins.fasta"
+            amr_genes_path = os.path.join(
+                str(amr_genes), sample_id, f"{_id}_amr_genes.fasta"
             )
-            amr_all_mutations_path = (
-                amr_all_mutations.path / sample_id / f"{id}_amr_all_mutations.tsv"
+
+            amr_proteins_path = os.path.join(
+                str(amr_proteins), sample_id, f"{_id}_amr_proteins.fasta"
+            )
+            amr_all_mutations_path = os.path.join(
+                str(amr_all_mutations), sample_id, f"{_id}_amr_all_mutations.tsv"
             )
 
             # Run amrfinderplus

diff --git a/q2_amrfinderplus/plugin_setup.py b/q2_amrfinderplus/plugin_setup.py
@@ -34,9 +34,10 @@
     version=__version__,
     website="https://github.com/bokulich-lab/q2-amrfinderplus",
     package="q2_amrfinderplus",
-    description="A plugin to find acquired antimicrobial resistance genes and point "
-    "mutations in protein and/or assembled nucleotide sequences with "
-    "NCBI-AMRFinderPlus.",
+    description=(
+        "A plugin to find acquired antimicrobial resistance genes and point mutations "
+        "in protein and/or assembled nucleotide sequences with NCBI-AMRFinderPlus."
+    ),
     short_description="AMR annotation.",
     citations=[],
 )
@@ -58,7 +59,9 @@
 
 organisms = [
     "Acinetobacter_baumannii",
+    "Acinetobacter",
     "Burkholderia_cepacia",
+    "Burkholderia_cepacia_complex",
     "Burkholderia_pseudomallei",
     "Campylobacter",
     "Citrobacter_freundii",
@@ -68,12 +71,15 @@
     "Enterococcus_faecalis",
     "Enterococcus_faecium",
     "Escherichia",
+    "Escherichia_coli_Shigella",
+    "Klebsiella",
     "Klebsiella_oxytoca",
     "Klebsiella_pneumoniae",
     "Neisseria_gonorrhoeae",
     "Neisseria_meningitidis",
     "Pseudomonas_aeruginosa",
     "Salmonella",
+    "Serratia",
     "Serratia_marcescens",
     "Staphylococcus_aureus",
     "Staphylococcus_pseudintermedius",
@@ -83,11 +89,6 @@
     "Vibrio_cholerae",
     "Vibrio_parahaemolyticus",
     "Vibrio_vulnificus",
-    "Acinetobacter",
-    "Burkholderia_cepacia_complex",
-    "Escherichia_coli_Shigella",
-    "Klebsiella",
-    "Serratia",
 ]
 
 
@@ -147,64 +148,83 @@
 }
 
 amrfinderplus_parameter_descriptions = {
-    "organism": "Taxon used for screening known resistance causing point mutations "
-    "and blacklisting of common, non-informative genes. Pathogen Detection "
-    "taxgroup names can also be used.",
-    "plus": "Provide results from 'Plus' genes such as virulence factors, "
-    "stress-response genes, etc.",
-    "report_all_equal": "Report all equally scoring BLAST and HMM matches. This "
-    "will report multiple lines for a single element if there "
-    "are multiple reference proteins that have the same score. "
-    "On those lines the fields Accession of closest sequence "
-    "and Name of closest sequence will be different showing "
-    "each of the database proteins that are equally close to "
-    "the query sequence.",
-    "ident_min": "Minimum identity for a blast-based hit (Methods BLAST or "
-    "PARTIAL). Setting this value to something other than -1 "
-    "will override curated similarity cutoffs. We only recommend "
-    "using this option if you have a specific reason.",
-    "curated_ident": "Use the curated threshold for a blast-based hit, if it "
-    "exists and 0.9 otherwise. This will overwrite the value specified with the "
-    "'ident_min' parameter.",
-    "coverage_min": "Minimum proportion of reference gene covered for a "
-    "BLAST-based hit (Methods BLAST or PARTIAL).",
+    "organism": (
+        "Taxon used for screening known resistance causing point mutations and "
+        "blacklisting of common, non-informative genes. Pathogen Detection taxgroup "
+        "names can also be used."
+    ),
+    "plus": (
+        "Provide results from 'Plus' genes such as virulence factors, stress-response "
+        "genes, etc."
+    ),
+    "report_all_equal": (
+        "Report all equally scoring BLAST and HMM matches. This will report multiple "
+        "lines for a single element if there are multiple reference proteins that have "
+        "the same score. On those lines the fields Accession of closest sequence and "
+        "Name of closest sequence will be different showing each of the database "
+        "proteins that are equally close to the query sequence."
+    ),
+    "ident_min": (
+        "Minimum identity for a blast-based hit (Methods BLAST or PARTIAL). Setting "
+        "this value to something other than -1 will override curated similarity "
+        "cutoffs. We only recommend using this option if you have a specific reason."
+    ),
+    "curated_ident": (
+        "Use the curated threshold for a blast-based hit, if it exists and 0.9 "
+        "otherwise."
+    ),
+    "coverage_min": (
+        "Minimum proportion of reference gene covered for a BLAST-based hit (Methods "
+        "BLAST or PARTIAL)."
+    ),
     "translation_table": "Translation table used for BLASTX.",
+    "annotation_format": (
+        "Specify the format of the GFF file in the loci input. 'standard' refers to "
+        "NCBI resources such as GenBank and RefSeq."
+    ),
     "report_common": "Report proteins common to a taxonomy group.",
-    "threads": "The number of threads to use for processing. AMRFinderPlus "
-    "defaults to 4 on hosts with >= 4 cores. Setting this number higher"
-    " than the number of cores on the running host may cause blastp to "
-    "fail. Using more than 4 threads may speed up searches.",
+    "threads": (
+        "The number of threads to use for processing. AMRFinderPlus defaults to 4 on "
+        "hosts with >= 4 cores. Setting this number higher than the number of cores on "
+        "the running host may cause blastp to fail. Using more than 4 threads may "
+        "speed up searches."
+    ),
 }
 
 amrfinderplus_output_descriptions = {
     "amr_annotations": "Annotated AMR genes and mutations.",
-    "amr_all_mutations": "Report of genotypes at all locations screened for point "
-    "mutations. These files allow you to distinguish between called "
-    "point mutations that were the sensitive variant and the point "
-    "mutations that could not be called because the sequence was not "
-    "found. This file will contain all detected variants from the "
-    "reference sequence, so it could be used as an initial screen for "
-    "novel variants. Note 'Gene symbols' for mutations not in the "
-    "database (identifiable by [UNKNOWN] in the Sequence name field) "
-    "have offsets that are relative to the start of the sequence "
-    "indicated in the field 'Accession of closest sequence' while "
-    "'Gene symbols' from known point-mutation sites have gene symbols "
-    "that match the Pathogen Detection Reference Gene Catalog "
-    "standardized nomenclature for point mutations.",
-    "amr_genes": "Sequences that were identified by AMRFinderPlus as AMR genes. "
-    "This will include the entire region that aligns to the references for "
-    "point mutations.",
-    "amr_proteins": "Protein Sequences that were identified by AMRFinderPlus as "
-    "AMR genes. This will include the entire region that aligns to the references "
-    "for point mutations.",
+    "amr_all_mutations": (
+        "Report of genotypes at all locations screened for point mutations. These "
+        "files allow you to distinguish between called point mutations that were the "
+        "sensitive variant and the point mutations that could not be called because "
+        "the sequence was not found. This file will contain all detected variants from "
+        "the reference sequence, so it could be used as an initial screen for novel "
+        "variants. Note 'Gene symbols' for mutations not in the database (identifiable "
+        "by [UNKNOWN] in the Sequence name field) have offsets that are relative to "
+        "the start of the sequence indicated in the field 'Accession of closest "
+        "sequence' while 'Gene symbols' from known point-mutation sites have gene "
+        "symbols that match the Pathogen Detection Reference Gene Catalog standardized "
+        "nomenclature for point mutations."
+    ),
+    "amr_genes": (
+        "Sequences that were identified by AMRFinderPlus as AMR genes. This will "
+        "include the entire region that aligns to the references for point mutations."
+    ),
+    "amr_proteins": (
+        "Protein Sequences that were identified by AMRFinderPlus as AMR genes. This "
+        "will include the entire region that aligns to the references for point "
+        "mutations."
+    ),
 }
 
 
 amrfinderplus_input_descriptions = {
     "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.",
     "proteins": "Protein sequences to be annotated with AMRFinderPlus.",
-    "loci": "GFF files to give sequence coordinates for proteins input. Required "
-    "for combined searches of protein and DNA sequences.",
+    "loci": (
+        "GFF files to give sequence coordinates for proteins input. Required for "
+        "combined searches of protein and DNA sequences."
+    ),
     "amrfinderplus_db": "AMRFinderPlus Database.",
 }
 
@@ -228,9 +248,10 @@
     parameter_descriptions=amrfinderplus_parameter_descriptions,
     output_descriptions=amrfinderplus_output_descriptions,
     name="Annotate MAGs or contigs with AMRFinderPlus.",
-    description="Annotate sample data MAGs or contigs with antimicrobial resistance "
-    "genes with AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for "
-    "documentation.",
+    description=(
+        "Annotate MAGs or contigs with antimicrobial resistance genes with "
+        "AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for documentation."
+    ),
     citations=[citations["feldgarden2021amrfinderplus"]],
 )
 

diff --git a/q2_amrfinderplus/tests/data/contigs/sample1_contigs.fasta b/q2_amrfinderplus/tests/data/contigs/sample1_contigs.fasta
diff --git a/q2_amrfinderplus/tests/data/feature_data_mag/30ef72ed-84fd-4348-a418-9d68a9b88729.fasta b/q2_amrfinderplus/tests/data/feature_data_mag/30ef72ed-84fd-4348-a418-9d68a9b88729.fasta
diff --git a/q2_amrfinderplus/tests/data/loci_per_sample/sample1/genome1.gff b/q2_amrfinderplus/tests/data/loci_per_sample/sample1/genome1.gff
diff --git a/q2_amrfinderplus/tests/data/proteins/sample1.fasta b/q2_amrfinderplus/tests/data/proteins/sample1.fasta
diff --git a/q2_amrfinderplus/tests/data/proteins_per_sample/sample1/genome1.fasta b/q2_amrfinderplus/tests/data/proteins_per_sample/sample1/genome1.fasta
diff --git a/q2_amrfinderplus/tests/data/sample_data_mags/sample1/mag.fasta b/q2_amrfinderplus/tests/data/sample_data_mags/sample1/mag.fasta