Skip to content

Commit

Permalink
Merge pull request #143 from BenKaehler/run-time-analysis
Browse files Browse the repository at this point in the history
Run time analysis
  • Loading branch information
gregcaporaso authored Jul 11, 2017
2 parents 857c9ee + aab2c59 commit ad15dd6
Show file tree
Hide file tree
Showing 5 changed files with 742 additions and 1,491 deletions.
1,872 changes: 563 additions & 1,309 deletions ipynb/mock-community/evaluate-classification-accuracy-nb-extra.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate Commands for `nb-extra` Mock Sweeps\n",
"This script is for performing parameter sweeps using the naive Bayes classifier that is used in `q2-feature-classifier`. The number of parameters was large so it was useful to be able to run the commands on a larger system.\n",
"\n",
"This script creates a temporary directory (`results_dir`) and generates two sets of shell commands. The commands end up in `classifier_commands.sh` and `classify_commands.sh` in that directory. The former must complete before the latter is started. The commands rely on the Python modules in https://github.com/BenKaehler/q2-extra-classifier.\n",
"\n",
"Once the commands are generated in the \"Classifier fitting scripts\" section, they can be run anywhere by copying the whole `results_dir` directory. The `results_dir` directory should then be synchronised back to its original location. The remainder of the script then copies the results into `tax-credit`."
]
},
{
"cell_type": "code",
"execution_count": 1,
Expand Down
245 changes: 133 additions & 112 deletions ipynb/runtime/analysis.ipynb

Large diffs are not rendered by default.

102 changes: 33 additions & 69 deletions ipynb/runtime/compute-runtimes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
"\n",
"from tax_credit.framework_functions import (runtime_make_test_data,\n",
" runtime_make_commands,\n",
" clock_runtime,\n",
" )"
" clock_runtime)"
]
},
{
Expand All @@ -37,19 +36,19 @@
"source": [
"## project_dir should be the directory where you've downloaded (or cloned) the \n",
"## tax-credit repository. \n",
"project_dir = expandvars(\"$HOME/Desktop/projects/tax-credit\")\n",
"project_dir = '../..'\n",
"data_dir = join(project_dir, \"data\")\n",
"\n",
"results_dir = expandvars(\"$HOME/Desktop/projects/tax-credit-runtime\")\n",
"results_dir = join(project_dir, 'temp_results_runtime')\n",
"runtime_results = join(results_dir, 'runtime_results.txt')\n",
"tmpdir = join(results_dir, 'tmp')\n",
"\n",
"ref_db_dir = expandvars(\"$HOME/Desktop/ref_dbs/\")\n",
"ref_seqs = join(ref_db_dir, 'gg_13_8_otus/rep_set/99_otus/dna-sequences.fasta')\n",
"ref_taxa = join(ref_db_dir, 'gg_13_8_otus/taxonomy/99_otu_taxonomy.txt')\n",
"ref_db_dir = join(project_dir, 'data/ref_dbs/gg_13_8_otus')\n",
"ref_seqs = join(ref_db_dir, '99_otus_clean.fasta')\n",
"ref_taxa = join(ref_db_dir, '99_otu_taxonomy_clean.tsv')\n",
"\n",
"num_iters = 1\n",
"sampling_depths = [1, 10, 100, 1000, 10000]"
"sampling_depths = [1] + list(range(2000,10001,2000))"
]
},
{
Expand All @@ -62,10 +61,8 @@
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"runtime_make_test_data(ref_seqs, tmpdir, sampling_depths)"
Expand All @@ -80,24 +77,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mQIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/100.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.nb.qza\u001b[0m\n"
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/1.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/2000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/4000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/6000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/8000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/10000.fna.nb.qza\u001b[0m\n"
]
}
],
"source": [
"! qiime tools import --input-path {ref_taxa} --output-path {ref_taxa}.qza --type \"FeatureData[Taxonomy]\"\n",
"! qiime tools import --input-path {ref_taxa} --output-path {ref_taxa}.qza --type \"FeatureData[Taxonomy]\" --source-format HeaderlessTSVTaxonomyFormat\n",
"\n",
"for depth in sampling_depths:\n",
" tmpfile = join(tmpdir, str(depth)) + '.fna'\n",
Expand Down Expand Up @@ -125,14 +123,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"qiime1_template = ('source activate qiime1; source ~/.bashrc; '\n",
" 'assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}')\n",
"qiime1_setup = join(results_dir, '.bashrc')\n",
"qiime1_template = ('bash -c \"source activate qiime1; source ' + qiime1_setup + '; '\n",
" 'assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}\"')\n",
"blast_template = ('qiime feature-classifier classify-consensus-blast --i-query {1}.qza --o-classification '\n",
" '{0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}')\n",
"vsearch_template = ('qiime feature-classifier classify-consensus-vsearch --i-query {1}.qza '\n",
Expand All @@ -146,7 +145,9 @@
" 'uclust': (qiime1_template, '--min_consensus_fraction 0.51 --similarity 0.8 --uclust_max_accepts 3'),\n",
" 'sortmerna': (qiime1_template, '--sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 '\n",
" '--sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8'),\n",
" 'blast' : (qiime1_template, '-e 0.001'),\n",
" 'blast' : (qiime1_template, '-e 0.001')\n",
" }\n",
"qiime2_methods = {\n",
" 'blast+' : (blast_template, '--p-evalue 0.001'),\n",
" 'vsearch' : (vsearch_template, '--p-perc-identity 0.90'),\n",
" 'naive-bayes': (naive_bayes_template, '--p-confidence 0.7')\n",
Expand All @@ -169,7 +170,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {
"collapsed": true
},
Expand All @@ -188,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 5,
"metadata": {
"collapsed": true
},
Expand All @@ -207,16 +208,16 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n",
"('qiime feature-classifier classify-consensus-vsearch --i-query /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1.fna.qza --o-classification /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/assign.tmp --i-reference-reads /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10.fna.qza --i-reference-taxonomy /Users/nbokulich/Desktop/projects/tax-credit/data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv.qza --p-perc-identity 0.90', 'vsearch', '1', '10', 0)\n",
"('qiime feature-classifier classify-consensus-blast --i-query /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.qza --o-classification /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/assign.tmp --i-reference-reads /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.qza --i-reference-taxonomy /Users/nbokulich/Desktop/projects/tax-credit/data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv.qza --p-evalue 0.001', 'blast+', '10000', '10000', 0)\n"
"48\n",
"('bash -c \"source activate qiime1; source ../../temp_results_runtime/.bashrc; assign_taxonomy.py -i ../../temp_results_runtime/tmp/1.fna -o ../../temp_results_runtime/tmp -r ../../temp_results_runtime/tmp/2000.fna -t ../../data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv -m blast -e 0.001\"', 'blast', '1', '2000', 0)\n",
"('bash -c \"source activate qiime1; source ../../temp_results_runtime/.bashrc; assign_taxonomy.py -i ../../temp_results_runtime/tmp/10000.fna -o ../../temp_results_runtime/tmp -r ../../temp_results_runtime/tmp/10000.fna -t ../../data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv -m sortmerna --sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 --sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8\"', 'sortmerna', '10000', '10000', 0)\n"
]
}
],
Expand All @@ -226,53 +227,16 @@
"print(commands_b[-1])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"[None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Parallel(n_jobs=4)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"Parallel(n_jobs=21)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))))"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
packages=find_packages(),
install_requires=['biom-format', 'pandas', 'statsmodels', 'bokeh',
'scipy', 'jupyter', 'scikit-bio', 'seaborn',
'scikit-learn'],
'scikit-learn', 'joblib'],
author="Nicholas Bokulich",
author_email="[email protected]",
description="Systematic benchmarking of taxonomic classification methods",
Expand Down

0 comments on commit ad15dd6

Please sign in to comment.