Skip to content

Commit

Permalink
Merge pull request #143 from BenKaehler/run-time-analysis
Browse files Browse the repository at this point in the history
Run time analysis
  • Loading branch information
gregcaporaso authored Jul 11, 2017
2 parents 857c9ee + aab2c59 commit ad15dd6
Show file tree
Hide file tree
Showing 5 changed files with 742 additions and 1,491 deletions.
1,872 changes: 563 additions & 1,309 deletions ipynb/mock-community/evaluate-classification-accuracy-nb-extra.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate Commands for `nb-extra` Mock Sweeps\n",
"This script is for performing parameter sweeps using the naive Bayes classifier that is used in `q2-feature-classifier`. The number of parameters was large so it was useful to be able to run the commands on a larger system.\n",
"\n",
"This script creates a temporary directory (`results_dir`) and generates two sets of shell commands. The commands end up in `classifier_commands.sh` and `classify_commands.sh` in that directory. The former must complete before the latter is started. The commands rely on the Python modules in https://github.com/BenKaehler/q2-extra-classifier.\n",
"\n",
"Once the commands are generated in the \"Classifier fitting scripts\" section, they can be run anywhere by copying the whole `results_dir` directory. The `results_dir` directory should then be synchronised back to its original location. The remainder of the script then copies the results into `tax-credit`."
]
},
{
"cell_type": "code",
"execution_count": 1,
Expand Down
245 changes: 133 additions & 112 deletions ipynb/runtime/analysis.ipynb

Large diffs are not rendered by default.

102 changes: 33 additions & 69 deletions ipynb/runtime/compute-runtimes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
"\n",
"from tax_credit.framework_functions import (runtime_make_test_data,\n",
" runtime_make_commands,\n",
" clock_runtime,\n",
" )"
" clock_runtime)"
]
},
{
Expand All @@ -37,19 +36,19 @@
"source": [
"## project_dir should be the directory where you've downloaded (or cloned) the \n",
"## tax-credit repository. \n",
"project_dir = expandvars(\"$HOME/Desktop/projects/tax-credit\")\n",
"project_dir = '../..'\n",
"data_dir = join(project_dir, \"data\")\n",
"\n",
"results_dir = expandvars(\"$HOME/Desktop/projects/tax-credit-runtime\")\n",
"results_dir = join(project_dir, 'temp_results_runtime')\n",
"runtime_results = join(results_dir, 'runtime_results.txt')\n",
"tmpdir = join(results_dir, 'tmp')\n",
"\n",
"ref_db_dir = expandvars(\"$HOME/Desktop/ref_dbs/\")\n",
"ref_seqs = join(ref_db_dir, 'gg_13_8_otus/rep_set/99_otus/dna-sequences.fasta')\n",
"ref_taxa = join(ref_db_dir, 'gg_13_8_otus/taxonomy/99_otu_taxonomy.txt')\n",
"ref_db_dir = join(project_dir, 'data/ref_dbs/gg_13_8_otus')\n",
"ref_seqs = join(ref_db_dir, '99_otus_clean.fasta')\n",
"ref_taxa = join(ref_db_dir, '99_otu_taxonomy_clean.tsv')\n",
"\n",
"num_iters = 1\n",
"sampling_depths = [1, 10, 100, 1000, 10000]"
"sampling_depths = [1] + list(range(2000,10001,2000))"
]
},
{
Expand All @@ -62,10 +61,8 @@
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"runtime_make_test_data(ref_seqs, tmpdir, sampling_depths)"
Expand All @@ -80,24 +77,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mQIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/100.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.nb.qza\u001b[0m\n"
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/1.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/2000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/4000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/6000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/8000.fna.nb.qza\u001b[0m\n",
"\u001b[32mSaved TaxonomicClassifier to: ../../temp_results_runtime/tmp/10000.fna.nb.qza\u001b[0m\n"
]
}
],
"source": [
"! qiime tools import --input-path {ref_taxa} --output-path {ref_taxa}.qza --type \"FeatureData[Taxonomy]\"\n",
"! qiime tools import --input-path {ref_taxa} --output-path {ref_taxa}.qza --type \"FeatureData[Taxonomy]\" --source-format HeaderlessTSVTaxonomyFormat\n",
"\n",
"for depth in sampling_depths:\n",
" tmpfile = join(tmpdir, str(depth)) + '.fna'\n",
Expand Down Expand Up @@ -125,14 +123,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"qiime1_template = ('source activate qiime1; source ~/.bashrc; '\n",
" 'assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}')\n",
"qiime1_setup = join(results_dir, '.bashrc')\n",
"qiime1_template = ('bash -c \"source activate qiime1; source ' + qiime1_setup + '; '\n",
" 'assign_taxonomy.py -i {1} -o {0} -r {2} -t {3} -m {4} {5}\"')\n",
"blast_template = ('qiime feature-classifier classify-consensus-blast --i-query {1}.qza --o-classification '\n",
" '{0}/assign.tmp --i-reference-reads {2}.qza --i-reference-taxonomy {3}.qza {5}')\n",
"vsearch_template = ('qiime feature-classifier classify-consensus-vsearch --i-query {1}.qza '\n",
Expand All @@ -146,7 +145,9 @@
" 'uclust': (qiime1_template, '--min_consensus_fraction 0.51 --similarity 0.8 --uclust_max_accepts 3'),\n",
" 'sortmerna': (qiime1_template, '--sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 '\n",
" '--sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8'),\n",
" 'blast' : (qiime1_template, '-e 0.001'),\n",
" 'blast' : (qiime1_template, '-e 0.001')\n",
" }\n",
"qiime2_methods = {\n",
" 'blast+' : (blast_template, '--p-evalue 0.001'),\n",
" 'vsearch' : (vsearch_template, '--p-perc-identity 0.90'),\n",
" 'naive-bayes': (naive_bayes_template, '--p-confidence 0.7')\n",
Expand All @@ -169,7 +170,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {
"collapsed": true
},
Expand All @@ -188,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 5,
"metadata": {
"collapsed": true
},
Expand All @@ -207,16 +208,16 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n",
"('qiime feature-classifier classify-consensus-vsearch --i-query /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/1.fna.qza --o-classification /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/assign.tmp --i-reference-reads /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10.fna.qza --i-reference-taxonomy /Users/nbokulich/Desktop/projects/tax-credit/data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv.qza --p-perc-identity 0.90', 'vsearch', '1', '10', 0)\n",
"('qiime feature-classifier classify-consensus-blast --i-query /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.qza --o-classification /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/assign.tmp --i-reference-reads /Users/nbokulich/Desktop/projects/tax-credit-runtime/tmp/10000.fna.qza --i-reference-taxonomy /Users/nbokulich/Desktop/projects/tax-credit/data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv.qza --p-evalue 0.001', 'blast+', '10000', '10000', 0)\n"
"48\n",
"('bash -c \"source activate qiime1; source ../../temp_results_runtime/.bashrc; assign_taxonomy.py -i ../../temp_results_runtime/tmp/1.fna -o ../../temp_results_runtime/tmp -r ../../temp_results_runtime/tmp/2000.fna -t ../../data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv -m blast -e 0.001\"', 'blast', '1', '2000', 0)\n",
"('bash -c \"source activate qiime1; source ../../temp_results_runtime/.bashrc; assign_taxonomy.py -i ../../temp_results_runtime/tmp/10000.fna -o ../../temp_results_runtime/tmp -r ../../temp_results_runtime/tmp/10000.fna -t ../../data/ref_dbs/gg_13_8_otus/99_otu_taxonomy_clean.tsv -m sortmerna --sortmerna_e_value 0.001 --min_consensus_fraction 0.51 --similarity 0.8 --sortmerna_best_N_alignments 3 --sortmerna_coverage 0.8\"', 'sortmerna', '10000', '10000', 0)\n"
]
}
],
Expand All @@ -226,53 +227,16 @@
"print(commands_b[-1])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"[None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None,\n",
" None]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Parallel(n_jobs=4)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"Parallel(n_jobs=21)(delayed(clock_runtime)(command, runtime_results, force=False) for command in (list(set(commands_a + commands_b))))"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
packages=find_packages(),
install_requires=['biom-format', 'pandas', 'statsmodels', 'bokeh',
'scipy', 'jupyter', 'scikit-bio', 'seaborn',
'scikit-learn'],
'scikit-learn', 'joblib'],
author="Nicholas Bokulich",
author_email="[email protected]",
description="Systematic benchmarking of taxonomic classification methods",
Expand Down

0 comments on commit ad15dd6

Please sign in to comment.