diff --git a/docs/_static/homology_workflow.png b/docs/_static/homology_workflow.png new file mode 100644 index 0000000..36400e9 Binary files /dev/null and b/docs/_static/homology_workflow.png differ diff --git a/docs/_static/prediction_workflow.png b/docs/_static/prediction_workflow.png new file mode 100644 index 0000000..211024a Binary files /dev/null and b/docs/_static/prediction_workflow.png differ diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..bbd2345 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,3 @@ +.wy-nav-content { + max-width: 80%; +} diff --git a/docs/_static/transcriptome_workflow.png b/docs/_static/transcriptome_workflow.png new file mode 100644 index 0000000..daf46f7 Binary files /dev/null and b/docs/_static/transcriptome_workflow.png differ diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000..6fb5ebb --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,4 @@ +{% extends "!layout.html" %} +{% block extrahead %} + +{% endblock %} diff --git a/docs/conf.py b/docs/conf.py index c484a67..36e4e9b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,7 +6,7 @@ import subprocess from annotation import VERSION -import sphinx_pdj_theme +import sphinx_rtd_theme # -- Path setup -------------------------------------------------------------- @@ -32,6 +32,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + 'sphinx_rtd_theme' ] # Add any paths that contain templates here, relative to this directory. @@ -48,8 +49,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_pdj_theme' -html_theme_path = [sphinx_pdj_theme.get_html_theme_path()] +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, @@ -59,8 +59,12 @@ # Generate a full CLI help for the transcriptome command transcriptome_help = subprocess.run(['reat', 'transcriptome', '--help'], capture_output=True) -print(transcriptome_help.stdout.decode(), file=open('transcriptome_help.txt', 'w')) +print(transcriptome_help.stdout.decode(), file=open('modules/transcriptome/transcriptome_help.txt', 'w')) # Generate a full CLI help for the homology command homology_help = subprocess.run(['reat', 'homology', '--help'], capture_output=True) -print(homology_help.stdout.decode(), file=open('homology_help.txt', 'w')) +print(homology_help.stdout.decode(), file=open('modules/homology/homology_help.txt', 'w')) + +# Generate a full CLI help for the prediction command +# prediction_help = subprocess.run(['reat', 'prediction', '--help'], capture_output=True) +# print(prediction_help.stdout.decode(), file=open('modules/prediction/prediction_help.txt', 'w')) diff --git a/docs/index.rst b/docs/index.rst index 3414868..ae9ee30 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,10 +3,6 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -.. toctree:: - :maxdepth: 2 - :caption: Contents: - REAT - Robust and Extendable eukaryotic Annotation Toolkit =========================================================== @@ -35,95 +31,8 @@ These commands will download the cromwell binary required to execute the workflo conda activate reat -Each task in the workflow is configured with default resource requirements appropriate for most tasks, but these can be overriden by user provided ones. 
-For an example of this file see:: - - { - "ei_annotation.wf_align.long_read_alignment_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.long_read_assembly_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.long_read_indexing_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_alignment_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_alignment_sort_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_merge_resources": { - "cpu_cores": 4, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_scallop_assembly_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_stringtie_assembly_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_align.short_read_stats_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 8 - }, - "ei_annotation.wf_main_mikado.homology_alignment_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_main_mikado.homology_index_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 8 - }, - "ei_annotation.wf_main_mikado.orf_calling_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 8 - }, - "ei_annotation.wf_main_mikado.protein_alignment_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - }, - "ei_annotation.wf_main_mikado.protein_index_resources": - { - "cpu_cores": 6, - "max_retries": 1, - "mem_gb": 16 - } - } - +Each task in the workflow is configured with default resource requirements appropriate for most tasks, but these can be +overridden by user provided ones. For examples of resource configuration files, refer to each module's description. 
To configure the cromwell engine, there are two relevant files, the cromwell runtime options and the workflow options files. @@ -139,107 +48,14 @@ The workflow options can be used to activate the caching behaviour in cromwell, .. include:: ../workflow_options/options.json :literal: -Running REAT -============= - -There are several workflows that make REAT, here we will describe 'transcriptome' and 'homology'. - - -Transcriptome Workflow ------------------------ - -The intention of the transcriptome workflow is to use a variety of data types, from short reads to long reads of varied quality and length. - -The data input for the workflow can be defined through the use of comma separated files one for short read samples and another for long read samples. These samples are then processed in several steps, first they are aligned to the genome, then assembled into transcripts, junctions are determined from the data and finally they are combined into a consolidated set of gene models. - -The aligner and assembly programs used for short and long read samples can be selected through command line arguments. There are also command line arguments to select extra options to be applied at each step. - -In case an annotation is available, this can be provided for junctions and reference models to be extracted and these can then be augmented using the evidence present in the data. - -.. highlight:: none -.. include:: ./transcriptome_help.txt - :literal: - -Sample files -^^^^^^^^^^^^^ - -The way samples are organised in the input files reflects how the files that correspond to the sample will be processed. -Data can be combined or kept separate at different stages of the workflow in accordance with the configuration provided -and the characteristics of the data. - - -Short read data -"""""""""""""""" - -Each line corresponds to a sample. -There are four required fields: Sample name, strandness, RNA-seq paired data, merge. 
-Followed by three optional fields: score, is_reference, exclude_redundant. -Previous fields to an optional field must be present in the line. -Files within a pair are separated by semi-colon and where there are multiple pairs in a sample, these are separated by spaces. - -.. code-block:: bash - - Ara0,fr-firststrand,data/Ara1.1.fastq.gz;data/Ara1.2.fastq.gz,true,20 - Ara1,fr-firststrand,data/Ara1.1.fastq.gz;data/Ara1.2.fastq.gz data/Ara2.1.fastq.gz;data/Ara2.2.fastq.gz,true,20 - Ara2,fr-firststrand,data/Ara3.1.fastq.gz;data/Ara3.2.fastq.gz data/Ara5.1.fastq.gz;data/Ara5.2.fastq.gz data/Ara6.1.fastq.gz;data/Ara6.2.fastq.gz,false +.. toctree:: + :maxdepth: 2 + :caption: Contents: -Sample RNA-seq data can be merged in different places, the options for controlling when the merging happens are as follows: -All transcripts assembled from paired reads within a sample are combined after assembling, paired read alignments can be merged before assembly using the 'merge' parameter in the CSV file. - -Junctions -++++++++++ - -Junctions from RNA-seq data can be determined in several ways. -By default junctions are collected for all the RNA-seq fastq pair as defined in the 'RNA-seq paired data' section of the CSV file for each sample. -Alternatively, samples can be combined where appropriate using the 'ei_annotation.wf_align.group_to_samples' parameter in the input.json file. -This parameter will define arbitrary groupings of the samples present in the short read CSV, with the following format:: - - "ei_annotation.wf_align.group_to_samples": { - "group1": ["Sample1", "Sample2"], - "group2": ["Sample3", "Sample4"] - } - -These groups will be validated against the samples in the CSV files, group names should be unique, samples can only belong to a single group and all samples should be part of a group. - -Long read data -""""""""""""""" - -Each line corresponds to a sample. -There are four required fields: Sample name, strandness, RNA-seq long read data, merge. 
-Followed by three optional fields: score, is_reference and exclude_redundant. -Previous fields to an optional field must be present in the line. -Where multiple read files correspond to a single sample (this implies they result in a single set of transcripts), the third column will contain all the files separated by spaces. - -.. code-block:: bash - - A01_1,fr-firststrand,data/A1_1.fastq.gz,low - A01_2,fr-firststrand,data/A1_2.fastq.gz,low - B01,fr-firststrand,data/B1.fastq.gz,low,10,true,true - C01,fr-firststrand,data/C1.fastq.gz,low - ALL,fr-firststrand,data/D1_1.fastq.gz data/D1_2.fastq.gz data/D1_3.fastq.gz data/D1_4.fastq.gz,low - CCS,fr-firststrand,data/CCS.fastq.gz,high - polished,fr-firststrand,data/polished.fastq.gz,high - - -.. warning:: - - The 'reference' sample name is reserved for internal use. - If this name is being used in any of the sample input CSV files, you will be notified with an error message. - - -Homology workflow ------------------ - -When there is protein evidence available from related species, the homology workflow can be used to generate gene models based on this evidence. -This is achieved by aligning the proteins provided through a set of related species annotations and evaluating these alignments to generate a score. -Protein alignments are evaluated in two ways: Coherence of the alignment structure with respect to the original model's structure and consensus structure from the multiple species. -These scores are then used by Mikado to group and filter models, generating a set of predicted models. - -.. highlight:: none -.. 
include:: ./homology_help.txt - :literal: - + modules/transcriptome/index + modules/homology/index + modules/prediction/index Indices and tables diff --git a/docs/modules/homology/index.rst b/docs/modules/homology/index.rst new file mode 100644 index 0000000..d0973b3 --- /dev/null +++ b/docs/modules/homology/index.rst @@ -0,0 +1,13 @@ +Homology Workflow +----------------- + +When there is protein evidence available from related species, the homology workflow can be used to generate gene models based on this evidence. This is achieved by aligning the proteins provided through a set of related species annotations and evaluating these alignments to generate a score. + +Protein alignments are evaluated in two ways, coherence of the alignment structure with respect to the original model's structure and consensus structure from the multiple species. These scores are then used by Mikado to group and filter models, generating a set of predicted models. + +.. highlight:: none +.. include:: ./homology_help.txt + :literal: + +.. image:: /_static/homology_workflow.png + :alt: Homology workflow diagram diff --git a/docs/modules/prediction/index.rst b/docs/modules/prediction/index.rst new file mode 100644 index 0000000..57b466f --- /dev/null +++ b/docs/modules/prediction/index.rst @@ -0,0 +1,13 @@ +Prediction Workflow +======================== + +The intention of the prediction workflow is to use a variety of transcript evidence, from short reads and long reads +based gene assemblies, protein alignments, homology alignments and other evidence such as expression, introns and +repeats to generate gene predictions ab initio and evidence based gene predictions. + +.. highlight:: none +.. include:: ./prediction_help.txt + :literal: + +.. 
image:: /_static/prediction_workflow.png + :alt: Prediction workflow diagram diff --git a/docs/modules/transcriptome/index.rst b/docs/modules/transcriptome/index.rst new file mode 100644 index 0000000..343b2f7 --- /dev/null +++ b/docs/modules/transcriptome/index.rst @@ -0,0 +1,173 @@ +Transcriptome Workflow +======================== + +The intention of the transcriptome workflow is to use a variety of data types, from short reads to long reads of varied quality and length. + +The data input for the workflow can be defined through the use of comma-separated files, one for short read samples and another for long read samples. These samples are then processed in several steps: first they are aligned to the genome, then assembled into transcripts, junctions are determined from the data and finally they are combined into a consolidated set of gene models. + +The aligner and assembly programs used for short and long read samples can be selected through command line arguments. There are also command line arguments to select extra options to be applied at each step. + +In case an annotation is available, this can be provided for junctions and reference models to be extracted and these can then be augmented using the evidence present in the data. + +.. highlight:: none +.. include:: ./transcriptome_help.txt + :literal: + +Sample files +------------------ + +The way samples are organised in the input files reflects how the files that correspond to the sample will be processed. +Data can be combined or kept separate at different stages of the workflow in accordance with the configuration provided +and the characteristics of the data. + + +Short read data +^^^^^^^^^^^^^^^^^^^^ + +Each line corresponds to a sample. +There are four required fields: Sample name, strandness, RNA-seq paired data, merge. +Followed by three optional fields: score, is_reference, exclude_redundant. +Previous fields to an optional field must be present in the line. 
+Files within a pair are separated by a semicolon and, where there are multiple pairs in a sample, these are separated by spaces. + +.. code-block:: bash + + Ara0,fr-firststrand,data/Ara1.1.fastq.gz;data/Ara1.2.fastq.gz,true,20 + Ara1,fr-firststrand,data/Ara1.1.fastq.gz;data/Ara1.2.fastq.gz data/Ara2.1.fastq.gz;data/Ara2.2.fastq.gz,true,20 + Ara2,fr-firststrand,data/Ara3.1.fastq.gz;data/Ara3.2.fastq.gz data/Ara5.1.fastq.gz;data/Ara5.2.fastq.gz data/Ara6.1.fastq.gz;data/Ara6.2.fastq.gz,false + + +Sample RNA-seq data can be merged in different places, the options for controlling when the merging happens are as follows: +All transcripts assembled from paired reads within a sample are combined after assembling, paired read alignments can be merged before assembly using the 'merge' parameter in the CSV file. + +Junctions +++++++++++ + +Junctions from RNA-seq data can be determined in several ways. +By default junctions are collected for all the RNA-seq fastq pairs as defined in the 'RNA-seq paired data' section of the CSV file for each sample. +Alternatively, samples can be combined where appropriate using the 'ei_annotation.wf_align.group_to_samples' parameter in the input.json file. +This parameter will define arbitrary groupings of the samples present in the short read CSV, with the following format:: + + "ei_annotation.wf_align.group_to_samples": { + "group1": ["Sample1", "Sample2"], + "group2": ["Sample3", "Sample4"] + } + +These groups will be validated against the samples in the CSV files, group names should be unique, samples can only belong to a single group and all samples should be part of a group. + +Long read data +^^^^^^^^^^^^^^^^^^ + +Each line corresponds to a sample. +There are four required fields: Sample name, strandness, RNA-seq long read data, merge. +Followed by three optional fields: score, is_reference and exclude_redundant. +Previous fields to an optional field must be present in the line. 
+Where multiple read files correspond to a single sample (this implies they result in a single set of transcripts), the third column will contain all the files separated by spaces. + +.. code-block:: bash + + A01_1,fr-firststrand,data/A1_1.fastq.gz,low + A01_2,fr-firststrand,data/A1_2.fastq.gz,low + B01,fr-firststrand,data/B1.fastq.gz,low,10,true,true + C01,fr-firststrand,data/C1.fastq.gz,low + ALL,fr-firststrand,data/D1_1.fastq.gz data/D1_2.fastq.gz data/D1_3.fastq.gz data/D1_4.fastq.gz,low + CCS,fr-firststrand,data/CCS.fastq.gz,high + polished,fr-firststrand,data/polished.fastq.gz,high + + +.. warning:: + + The 'reference' sample name is reserved for internal use. + If this name is being used in any of the sample input CSV files, you will be notified with an error message. + +.. image:: /_static/transcriptome_workflow.png + :alt: Transcriptome workflow diagram + +Configurable computational resources available:: + + { + "ei_annotation.wf_align.long_read_alignment_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.long_read_assembly_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.long_read_indexing_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_alignment_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_alignment_sort_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_merge_resources": { + "cpu_cores": 4, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_scallop_assembly_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_stringtie_assembly_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_align.short_read_stats_resources": + { + "cpu_cores": 6, + 
"max_retries": 1, + "mem_gb": 8 + }, + "ei_annotation.wf_main_mikado.homology_alignment_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_main_mikado.homology_index_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 8 + }, + "ei_annotation.wf_main_mikado.orf_calling_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 8 + }, + "ei_annotation.wf_main_mikado.protein_alignment_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + }, + "ei_annotation.wf_main_mikado.protein_index_resources": + { + "cpu_cores": 6, + "max_retries": 1, + "mem_gb": 16 + } + } + diff --git a/setup.py b/setup.py index 9f7c97f..ea3111b 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ extras_require={ 'docs': [ 'sphinx', - 'sphinx_pdj_theme', + 'sphinx_rtd_theme', ] }, package_data={