diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f5f7702..db10044 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,9 +1,9 @@ --- name: Bug report about: Create a report to help us improve -title: '' -labels: '' -assignees: 'slsevilla' +title: "" +labels: "" +assignees: "slsevilla" --- **Describe the bug** @@ -11,10 +11,11 @@ A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior; if applicable, add screenshots to help explain your problem: + 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** -A clear and concise description of what you expected to happen. \ No newline at end of file +A clear and concise description of what you expected to happen. diff --git a/.github/ISSUE_TEMPLATE/enchancement_request.md b/.github/ISSUE_TEMPLATE/enchancement_request.md index e9a608c..36a0fa7 100644 --- a/.github/ISSUE_TEMPLATE/enchancement_request.md +++ b/.github/ISSUE_TEMPLATE/enchancement_request.md @@ -1,14 +1,13 @@ --- name: Enhancement request about: Suggest an additional component to an already existing feature -title: '' -labels: '' -assignees: 'slsevilla' - +title: "" +labels: "" +assignees: "slsevilla" --- **Describe what enhancement you'd like to see.** A clear and concise description of what you'd like to add, specifically to what already existing feature. Ex. I am using a new reference, and I'd love to see this reference included [...] **Additional context** -Add any other context or screenshots about the feature request here. \ No newline at end of file +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index e4f6e3f..69ef716 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,13 +1,13 @@ --- name: Feature request about: Suggest an idea for this project -title: '' -labels: '' -assignees: 'slsevilla' +title: "" +labels: "" +assignees: "slsevilla" --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** -A clear and concise description of what you want to happen. If there are alternative solutions or features you've considered, please include these! \ No newline at end of file +A clear and concise description of what you want to happen. If there are alternative solutions or features you've considered, please include these! 
diff --git a/.github/workflows/build_mkdocs.yaml b/.github/workflows/build_mkdocs.yaml index c01e496..e41aadf 100644 --- a/.github/workflows/build_mkdocs.yaml +++ b/.github/workflows/build_mkdocs.yaml @@ -3,7 +3,7 @@ on: workflow_dispatch: push: paths: - - 'docs/**' + - "docs/**" jobs: deploy: runs-on: ubuntu-latest @@ -14,4 +14,4 @@ jobs: python-version: 3.9 - run: pip install --upgrade pip - run: pip install -r docs/requirements.txt - - run: mkdocs gh-deploy --force \ No newline at end of file + - run: mkdocs gh-deploy --force diff --git a/.github/workflows/lintr.yaml b/.github/workflows/lintr.yaml index 77fe300..ac0b525 100644 --- a/.github/workflows/lintr.yaml +++ b/.github/workflows/lintr.yaml @@ -13,18 +13,18 @@ jobs: Lintr: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: docker://snakemake/snakemake:v7.19.1 - - name: Lint Workflow - continue-on-error: true - run: | - docker run -v $PWD:/opt2 snakemake/snakemake:v7.19.1 /bin/bash -c \ - "mkdir -p /opt2/output_carlisle/config /opt2/output_carlisle/annotation && \ - cp -r /opt2/workflow/scripts/ /opt2/output_carlisle/ && \ - cp /opt2/resources/cluster_biowulf.yaml /opt2/output_carlisle/config/cluster.yaml && \ - cp /opt2/resources/tools_biowulf.yaml /opt2/output_carlisle/config/tools.yaml && \ - cd /opt2/output_carlisle/annotation && \ - touch hg38.fa genes.gtf hg38.bed hg38.tss.bed hg38_refseq.ucsc Ecoli_GCF_000005845.2_ASM584v2_genomic.fna adapters.fa && \ - snakemake --lint -s /opt2/workflow/Snakefile \ - -d /opt2/output_carlisle --configfile /opt2/.test/config_lint.yaml || \ - echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'" \ No newline at end of file + - uses: actions/checkout@v2 + - uses: docker://snakemake/snakemake:v7.19.1 + - name: Lint Workflow + continue-on-error: true + run: | + docker run -v $PWD:/opt2 snakemake/snakemake:v7.19.1 /bin/bash -c \ + "mkdir -p /opt2/output_carlisle/config /opt2/output_carlisle/annotation && \ + cp -r /opt2/workflow/scripts/ /opt2/output_carlisle/ && \ + cp /opt2/resources/cluster_biowulf.yaml /opt2/output_carlisle/config/cluster.yaml && \ + cp /opt2/resources/tools_biowulf.yaml /opt2/output_carlisle/config/tools.yaml && \ + cd /opt2/output_carlisle/annotation && \ + touch hg38.fa genes.gtf hg38.bed hg38.tss.bed hg38_refseq.ucsc Ecoli_GCF_000005845.2_ASM584v2_genomic.fna adapters.fa && \ + snakemake --lint -s /opt2/workflow/Snakefile \ + -d /opt2/output_carlisle --configfile /opt2/.test/config_lint.yaml || \ + echo 'There may have been a few warnings or errors. 
Please read through the log to determine if its harmless.'" diff --git a/.github/workflows/test_dev.yml b/.github/workflows/test_dev.yml index 33a6a87..d04ba2b 100644 --- a/.github/workflows/test_dev.yml +++ b/.github/workflows/test_dev.yml @@ -13,20 +13,20 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.1.0 - - uses: docker://snakemake/snakemake:v7.19.1 - with: - directory: '.test' - - name: dryrun - shell: bash {0} - run: | - bash ./install.sh ./test-workdir/carlisle-v9999.9999.9999-dev - ./test-workdir/carlisle-v9999.9999.9999-dev/bin/carlisle --runmode=init --workdir=./test-workdir/output_carlisle - cp ./test-workdir/carlisle-v9999.9999.9999-dev/bin/.test/config_lint.yaml ./test-workdir/output_carlisle/config/config.yaml - # TODO: use `carlisle run --mode dryrun` - docker run -v ./test-workdir/:/opt2 snakemake/snakemake:v7.19.1 /bin/bash -c \ - "cd /opt2 && snakemake \ - -s ./carlisle-v9999.9999.9999-dev/bin/workflow/Snakefile \ - --dryrun \ - --configfile carlisle-v9999.9999.9999-dev/bin/.test/config_lint.yaml \ - --directory output_carlisle" + - uses: actions/checkout@v4.1.0 + - uses: docker://snakemake/snakemake:v7.19.1 + with: + directory: ".test" + - name: dryrun + shell: bash {0} + run: | + bash ./install.sh ./test-workdir/carlisle-v9999.9999.9999-dev + ./test-workdir/carlisle-v9999.9999.9999-dev/bin/carlisle --runmode=init --workdir=./test-workdir/output_carlisle + cp ./test-workdir/carlisle-v9999.9999.9999-dev/bin/.test/config_lint.yaml ./test-workdir/output_carlisle/config/config.yaml + # TODO: use `carlisle run --mode dryrun` + docker run -v ./test-workdir/:/opt2 snakemake/snakemake:v7.19.1 /bin/bash -c \ + "cd /opt2 && snakemake \ + -s ./carlisle-v9999.9999.9999-dev/bin/workflow/Snakefile \ + --dryrun \ + --configfile carlisle-v9999.9999.9999-dev/bin/.test/config_lint.yaml \ + --directory output_carlisle" diff --git a/.test/README.md b/.test/README.md index 1dff24a..2714e30 100644 --- a/.test/README.md +++ b/.test/README.md @@ -1,6 +1,9 @@ ################################################ + # CHANGELOG + ### samples retired 2/6/24 + + ### 12/16/22 + New test data is added to pipeline ################################################ + # HG38 + # sample location + /data/CCBR/projects/ccbr1155/CS031014/fastq # sample ids mapping; original to renamed + VS586.R1.fastq.gz,HN6_IgG_rabbit_negative_control_1.R1.fastq.gz VS586.R2.fastq.gz,HN6_IgG_rabbit_negative_control_1.R2.fastq.gz VS591.R1.fastq.gz,53_H3K4me3_1.R1.fastq.gz @@ -28,7 +37,8 @@ VS592.R2.fastq.gz,53_H3K4me3_2.R2.fastq.gz VS588.R1.fastq.gz,HN6_H3K4me3_1.R1.fastq.gz VS588.R2.fastq.gz,HN6_H3K4me3_1.R2.fastq.gz -# subset each of the samples +# subset each of the samples + fq_original_loc="/data/CCBR/projects/ccbr1155/CS031014/fastq";\ test_loc="/data/CCBR_Pipeliner/Pipelines/CARLISLE_dev/.test";\ fq_list=("HN6_IgG_rabbit_negative_control_1.R1.fastq.gz" "HN6_IgG_rabbit_negative_control_1.R2.fastq.gz" "HN6_H3K4me3_1.R1.fastq.gz" "HN6_H3K4me3_1.R2.fastq.gz" "HN6_H3K4me3_2.R1.fastq.gz" "HN6_H3K4me3_2.R2.fastq.gz" "53_H3K4me3_1.R1.fastq.gz" "53_H3K4me3_1.R2.fastq.gz" "53_H3K4me3_2.R1.fastq.gz" "53_H3K4me3_2.R2.fastq.gz" "HN6_H4K20me3_1.R1.fastq.gz" "HN6_H4K20me3_1.R2.fastq.gz" "HN6_H4K20me3_2.R1.fastq.gz" "HN6_H4K20me3_2.R2.fastq.gz" "53_H4K20m3_1.R1.fastq.gz" "53_H4K20m3_1.R2.fastq.gz" "53_H4K20m3_2.R1.fastq.gz" "53_H4K20m3_2.R2.fastq.gz");\ diff --git a/.test/config_lint.yaml b/.test/config_lint.yaml index 5f7c9e5..dbe1c13 100644 --- a/.test/config_lint.yaml +++ b/.test/config_lint.yaml @@ -12,20 
+12,20 @@ samplemanifest: "/opt2/.test/samples.test_lintr.tsv" ##################################################################################### # run sample contrasts run_contrasts: true -contrasts: "/opt2/.test/contrasts.test.tsv" # run_contrasts needs to be "Y" +contrasts: "/opt2/.test/contrasts.test.tsv" # run_contrasts needs to be "Y" contrasts_fdr_cutoff: 0.05 contrasts_lfc_cutoff: 0.59 # FC of 1.5 run_go_enrichment: true run_rose: true # reference -genome: "hg38" # currently supports hg38, hg19 and mm10. Custom genome can be added with appropriate additions to "reference" section below. +genome: "hg38" # currently supports hg38, hg19 and mm10. Custom genome can be added with appropriate additions to "reference" section below. # alignment quality threshold mapping_quality: 2 #only report alignment records with mapping quality of at least N (>= N). # normalization method -## spikein: normalization will be performed based off of spike-in aligned read count; +## spikein: normalization will be performed based off of spike-in aligned read count; ## library: library normalization will be performed ## none: no norm will be performed norm_method: "spikein" # method of normalization to be used; currently supports ["spikein","library","none"] @@ -100,7 +100,7 @@ geneset_id: "GOBP" # ["GOBP" "GOCC" "GOMF" "KEGG"] ##################################################################################### # References -# NOTE: "gtf" is only required if TxDb is not avaiable for the species in +# NOTE: "gtf" is only required if TxDb is not avaiable for the species in # Bioconductor eg. hs1 ##################################################################################### # references: @@ -152,4 +152,4 @@ adapters: "/opt2/output_carlisle/annotation/adapters.fa" # R Packages ##################################################################################### Rlib_dir: "/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_4.3_carlisle/" -Rpkg_config: "/opt2/output_carlisle/config//Rpack.config" \ No newline at end of file +Rpkg_config: "/opt2/output_carlisle/config//Rpack.config" diff --git a/.test/contrasts.test.tsv b/.test/contrasts.test.tsv index c69fab0..2a4fb72 100644 --- a/.test/contrasts.test.tsv +++ b/.test/contrasts.test.tsv @@ -1,2 +1,2 @@ condition1 condition2 -53_H3K4me3 HN6_H3K4me3 \ No newline at end of file +53_H3K4me3 HN6_H3K4me3 diff --git a/.test/samples.test.tsv b/.test/samples.test.tsv index 2abf39f..71acf19 100644 --- a/.test/samples.test.tsv +++ b/.test/samples.test.tsv @@ -2,4 +2,4 @@ sampleName replicateNumber isControl controlName controlReplicateNumber path_to_ 53_H3K4me3 1 N HN6_IgG_rabbit_negative_control 1 PIPELINE_HOME/.test/53_H3K4me3_1.R1.fastq.gz PIPELINE_HOME/.test/53_H3K4me3_1.R2.fastq.gz 53_H3K4me3 2 N HN6_IgG_rabbit_negative_control 1 PIPELINE_HOME/.test/53_H3K4me3_2.R1.fastq.gz PIPELINE_HOME/.test/53_H3K4me3_2.R2.fastq.gz HN6_H3K4me3 1 N HN6_IgG_rabbit_negative_control 1 PIPELINE_HOME/.test/HN6_H3K4me3_1.R1.fastq.gz PIPELINE_HOME/.test/HN6_H3K4me3_1.R2.fastq.gz -HN6_IgG_rabbit_negative_control 1 Y - - PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz \ No newline at end of file +HN6_IgG_rabbit_negative_control 1 Y - - PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz diff --git a/.test/samples.test_lintr.tsv b/.test/samples.test_lintr.tsv index a39af0b..e01483f 100644 --- a/.test/samples.test_lintr.tsv +++ 
b/.test/samples.test_lintr.tsv @@ -2,4 +2,4 @@ sampleName replicateNumber isControl controlName controlReplicateNumber path_to_ 53_H3K4me3 1 N HN6_IgG_rabbit_negative_control 1 /opt2/.test/53_H3K4me3_1.R1.fastq.gz /opt2/.test/53_H3K4me3_1.R2.fastq.gz 53_H3K4me3 2 N HN6_IgG_rabbit_negative_control 1 /opt2/.test/53_H3K4me3_2.R1.fastq.gz /opt2/.test/53_H3K4me3_2.R2.fastq.gz HN6_H3K4me3 1 N HN6_IgG_rabbit_negative_control 1 /opt2/.test/HN6_H3K4me3_1.R1.fastq.gz /opt2/.test/HN6_H3K4me3_1.R2.fastq.gz -HN6_IgG_rabbit_negative_control 1 Y - - /opt2/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz /opt2/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz \ No newline at end of file +HN6_IgG_rabbit_negative_control 1 Y - - /opt2/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz /opt2/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f7623b..e23fcba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,25 +1,32 @@ ## CARLISLE development version -- Bug fixes: (#127, @epehrsson) - - Removes single-sample group check for DESeq. - - Increases memory for DESeq. - - Ensures control replicate number is an integer. - - Fixes FDR cutoff misassigned to log2FC cutoff. - - Fixes `no_dedup` variable names in library normalization scripts. - Fig bug that added nonexistent directories to the singularity bind paths. (#135, @kelly-sovacool) -- New features: - - Containerize rules that require R (`deseq`, `go_enrichment`, and `spikein_assessment`) to fix installation issues with common R library path. (#129, @kelly-sovacool) - - The `Rlib_dir` and `Rpkg_config` config options have been removed as they are no longer needed. - - New visualizations: (#132, @epehrsson) - - New rules `cov_correlation`, `homer_enrich`, `combine_homer`, `count_peaks` - - Add peak caller to MACS2 peak xls filename - - New parameters in the config file to make certain rules optional: (#133, @kelly-sovacool) - - GO enrichment is controlled by `run_go_enrichment` (default: `false`) - - ROSE is controlled by `run_rose` (default: `false`) -- Misc: - - The singularity version is no longer specified, per request of the biowulf admins. (#139, @kelly-sovacool) +### Bug fixes + +- Bug fixes for DESeq (#127, @epehrsson) + - Removes single-sample group check for DESeq. + - Increases memory for DESeq. + - Ensures control replicate number is an integer. + - Fixes FDR cutoff misassigned to log2FC cutoff. + - Fixes `no_dedup` variable names in library normalization scripts. +- Fix bug that added nonexistent directories to the singularity bind paths. (#135, @kelly-sovacool) + - Containerize rules that require R (`deseq`, `go_enrichment`, and `spikein_assessment`) to fix installation issues with common R library path. (#129, @kelly-sovacool) + - The `Rlib_dir` and `Rpkg_config` config options have been removed as they are no longer needed. + +### New features + +- New visualizations: (#132, @epehrsson) + - New rules `cov_correlation`, `homer_enrich`, `combine_homer`, `count_peaks` + - Add peak caller to MACS2 peak xls filename +- New parameters in the config file to make certain rules optional: (#133, @kelly-sovacool) + - GO enrichment is controlled by `run_go_enrichment` (default: `false`) + - ROSE is controlled by `run_rose` (default: `false`) + +### Misc + +- The singularity version is no longer specified, per request of the biowulf admins.
(#139, @kelly-sovacool) ## CARLISLE v2.5.0 + - Refactors R packages to a common source location (#118, @slsevilla) - Adds a --force flag to allow for re-initialization of a workdir (#97, @slsevilla) - Fixes error with testrun in DESEQ2 (#113, @slsevilla) @@ -30,7 +37,8 @@ - Minor documentation improvements. (#100, @kelly-sovacool) - Fix: allow printing the version or help message even if singularity is not in the path. (#110, @kelly-sovacool) -## CARLISLE v2.4.1 +## CARLISLE v2.4.1 + - Add GitHub Action to add issues/PRs to personal project boards by @kelly-sovacool in #95 - Create install script by @kelly-sovacool in #93 - feat: use summits bed for homer input; save temporary files; fix deseq2 bug by @slsevilla in #108 @@ -38,12 +46,14 @@ - Test a dryrun with GitHub Actions by @kelly-sovacool in #94 ## CARLISLE v2.4.0 + - Feature- Merged Yue's fork, adding DEEPTOOLS by @slsevilla in #85 - Feature- Added tracking features from SPOOK by @slsevilla in #88 - Feature - Dev test run completed by @slsevilla in #89 - Bug - Fixed bugs related to Biowulf transition ## CARLISLE v2.1.0 + - enhancement - update gopeaks resources - change SEACR to run "norm" without spikein controls, "non" with spikein controls @@ -51,10 +61,12 @@ - fix GoEnrich bug for failed plots ## CARLISLE v2.0.1 + - fix error when contrasts set to "N" - adjust goenrich resources to be more efficient ## CARLISLE 2.0.0 + - Add a MAPQ filter to samtools (rule align) - Add GoPeaks MultiQC module - Allow for library normalization to occur during first pass @@ -71,16 +83,20 @@ - moved ~12% of all jobs to local deployment (within SLURM submission) ## CARLISLE 1.2.0 + - merge increases to resources; update workflow img, contributions ## CARLISLE 1.1.1 + - patch for gz bigbed bug ## CARLISLE 1.1.0 + - add broad-cutoff to macs2 broad peaks param settings - add non.stringent and non.relaxed to annotation options - merge DESEQ and DESEQ2 rules together - identify some files as temp ## CARLISLE 1.0.1 + - contains patch for DESEQ error with non hs1 reference samples diff --git a/annotation/README.md b/annotation/README.md index 09240c1..a73fec3 100755 --- a/annotation/README.md +++ b/annotation/README.md @@ -1,9 +1,11 @@ ## mm10 download + https://bitbucket.org/young_computation/rose/src/master/annotation/mm10_refseq.ucsc `create_hs1_ucsc.sh` script was used to create the hs1 ucsc file. 
#### hg19_refseq.ucsc and hg38_refseq.ucsc source + /data/khanlab3/gb_omics/processed_DATA/YueHu/cutrun_120822# hg19_refseq.ucsc and hg38_refseq.ucsc source # info on these resources is outlined in: https://github.com/CCBR/CARLISLE/issues/27 diff --git a/annotation/create_hs1_uscs.sh b/annotation/create_hs1_uscs.sh index d4eb35e..52db593 100644 --- a/annotation/create_hs1_uscs.sh +++ b/annotation/create_hs1_uscs.sh @@ -3,4 +3,4 @@ module load ucsc gtfToGenePred -genePredExt -ignoreGroupsWithoutExons /data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf hs1.genes.genePred head -n1 hg19_refseq.ucsc > hs1.ucsc -awk -F"\t" -v OFS="\t" '{print $NF,$_}' hs1.genes.genePred >> hs1.ucsc \ No newline at end of file +awk -F"\t" -v OFS="\t" '{print $NF,$_}' hs1.genes.genePred >> hs1.ucsc diff --git a/bin/carlisle b/bin/carlisle index 1ac07e9..22194d6 100755 --- a/bin/carlisle +++ b/bin/carlisle @@ -5,4 +5,4 @@ SCRIPTDIRNAME=$(readlink -f $(dirname "$SCRIPTNAME")) TOOLDIR=$(dirname "$SCRIPTDIRNAME") TOOLNAME=$(basename "$SCRIPTNAME") -${TOOLDIR}/carlisle "$@" || true \ No newline at end of file +${TOOLDIR}/carlisle "$@" || true diff --git a/carlisle b/carlisle index eb12d8d..367c2ea 100755 --- a/carlisle +++ b/carlisle @@ -46,7 +46,7 @@ function get_version (){ echo "Version: $(get_version_from_path $PIPELINE_HOME)" } -function usage() { +function usage() { # This function prints generic usage of the wrapper script. echo "${SCRIPTBASENAME} --> run CARLISLE @@ -64,13 +64,13 @@ function usage() { *) runlocal : run without submitting to sbatch *) runtest: run on cluster with included hg38 test dataset 2. WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions. - - Optional Arguments: + + Optional Arguments: *) -f / --force: Force flag will re-initialize a previously initialized workdir " } -function err() { +function err() { # This function print a error message (argument 1), the usage and then exits with non-zero status cat <<< " # @@ -80,7 +80,7 @@ function err() { # # # - " && usage && exit 1 1>&2; + " && usage && exit 1 1>&2; } function init() { @@ -89,7 +89,7 @@ function init() { # 2. copying essential files like config.yaml and samples.tsv into the workdir # 3. setting up logs and stats folders # create output folder - if [[ -d $WORKDIR ]] && [[ -z $FORCEFLAG ]]; then + if [[ -d $WORKDIR ]] && [[ -z $FORCEFLAG ]]; then err "Folder $WORKDIR already exists! If you'd like to re-initialize use the -f/--force flag $FORCEFLAG|" fi mkdir -p $WORKDIR @@ -116,7 +116,7 @@ function init() { if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi #create links for rose file - if [[ ! -d $WORKDIR/annotation ]]; then + if [[ ! -d $WORKDIR/annotation ]]; then mkdir -p $WORKDIR/annotation cp ${PIPELINE_HOME}/annotation/* $WORKDIR/annotation/ fi @@ -165,9 +165,9 @@ function controlcheck(){ control_list=`awk '$3 ~ /Y/' ${WORKDIR}/config/samples.tsv | awk '{print $1}'` check1=`awk '{print $1}' ${WORKDIR}/config/contrasts.tsv` check2=`awk '{print $2}' ${WORKDIR}/config/contrasts.tsv` - + for sample_id in ${control_list[@]}; do - if [[ $check1 == $sample_id || $check2 == $sample_id ]]; then + if [[ $check1 == $sample_id || $check2 == $sample_id ]]; then echo "Controls ($sample_id) cannot be listed in contrast.csv - update and re-run" echo "$check1 okkk $check2" exit 0 @@ -179,7 +179,7 @@ function dryrun() { # Dry-run runcheck controlcheck - + if [ ! 
-d ${WORKDIR}/logs/dryrun/ ]; then mkdir ${WORKDIR}/logs/dryrun/; fi if [ -f ${WORKDIR}/dryrun.log ]; then @@ -192,7 +192,7 @@ function dryrun() { function unlock() { # Unlock the workdir if previous snakemake run ended abruptly runcheck - run "--unlock" + run "--unlock" } function dag() { @@ -234,18 +234,18 @@ function preruncleanup() { echo "Running..." # check initialization - check_essential_files + check_essential_files cd $WORKDIR ## Archive previous run files - if [ -f ${WORKDIR}/logs/snakemake.log ];then + if [ -f ${WORKDIR}/logs/snakemake.log ];then modtime=$(stat ${WORKDIR}/logs/snakemake.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g") mv ${WORKDIR}/logs/snakemake.log ${WORKDIR}/stats/snakemake.${modtime}.log - if [ -f ${WORKDIR}/logs/snakemake.log.HPC_summary.txt ];then + if [ -f ${WORKDIR}/logs/snakemake.log.HPC_summary.txt ];then mv ${WORKDIR}/logs/snakemake.log.HPC_summary.txt ${WORKDIR}/stats/snakemake.${modtime}.log.HPC_summary.txt fi - if [ -f ${WORKDIR}/logs/snakemake.stats ];then + if [ -f ${WORKDIR}/logs/snakemake.stats ];then mv ${WORKDIR}/logs/snakemake.stats ${WORKDIR}/stats/snakemake.${modtime}.stats fi fi @@ -284,11 +284,11 @@ function run() { snakemake -s $SNAKEFILE \ --report ${WORKDIR}/logs/runlocal_snakemake_report.html \ --directory $WORKDIR \ - --configfile ${WORKDIR}/config/config.yaml + --configfile ${WORKDIR}/config/config.yaml fi elif [ "$1" == "slurm" ];then - + preruncleanup cat > ${WORKDIR}/submit_script.sbatch << EOF @@ -326,7 +326,7 @@ function run() { snakemake -s $SNAKEFILE \ --directory $WORKDIR \ --report ${WORKDIR}/logs/runslurm_snakemake_report.html \ - --configfile ${WORKDIR}/config/config.yaml + --configfile ${WORKDIR}/config/config.yaml fi bash <(curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/gather_cluster_stats.sh 2>/dev/null) ${WORKDIR}/logs/snakemake.log > ${WORKDIR}/logs/snakemake.log.HPC_summary.txt @@ -334,7 +334,7 @@ EOF sbatch ${WORKDIR}/submit_script.sbatch - else # for unlock and dryrun + else # for unlock and dryrun snakemake $1 -s $SNAKEFILE \ --directory $WORKDIR \ --use-envmodules \ @@ -356,7 +356,7 @@ function reset() { echo "Working Dir: $WORKDIR" if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi - + echo "Deleting $WORKDIR" rm -rf $WORKDIR echo "Re-Initializing $WORKDIR" diff --git a/config/config.yaml b/config/config.yaml index f2c5bee..f6407d0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,167 +1,167 @@ -##################################################################################### -# Folders / Paths -##################################################################################### -# The working dir... output will be in the results subfolder of the workdir -workdir: "WORKDIR" - -# scripts directory -# by default, use the scripts copied to the working directory. -# alternatively, use the scripts from the pipeline source. -scriptsdir: "WORKDIR/scripts" -#scriptsdir: "PIPELINE_HOME/workflow/scripts" - -# tab delimited samples file .. 
see samplefile for format details -samplemanifest: "WORKDIR/config/samples.tsv" - -##################################################################################### -# User parameters -##################################################################################### -# run sample contrasts -run_contrasts: true # true or false, no quotes -contrasts: "WORKDIR/config/contrasts.tsv" # run_contrasts needs to be `true` -contrasts_fdr_cutoff: 0.05 -contrasts_lfc_cutoff: 0.59 # FC of 1.5 - -# these steps are long-running. use `true` if you would like to run them -run_go_enrichment: false -run_rose: false - -# reference -genome: "hg38" # currently supports hg38, hg19 and mm10. Custom genome can be added with appropriate additions to "reference" section below. - -# alignment quality threshold -mapping_quality: 2 #only report alignment records with mapping quality of at least N (>= N). - -# normalization method -## spikein: normalization will be performed based off of spike-in aligned read count; -## library: library normalization will be performed -## none: no norm will be performed -norm_method: "spikein" # method of normalization to be used; currently supports ["spikein","library","none"] -## if norm_method ="spikein" -spikein_genome: "ecoli" # must be species found in spikein_reference below -spikein_scale: 1000000 - -# user parameters for alignment -bowtie2_parameters: "--dovetail --phred33 --very-sensitive" -fragment_len_filter: "1000" - -# duplication status -## users can select duplicated peaks (dedup) or non-deduplicated peaks (no_dedup) -### dupstatus: "dedup" # means run deduplicated analysis only -### dupstatus: "no_dedup" # means run non-deduplicated analysis only -## complete list: -### dupstatus: "dedup, no_dedup" -dupstatus: "dedup, no_dedup" - -# which peaktypes to consider for differential analysis: -# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | -# | Macs2 | AVAILABLE | AVAILABLE | NA | NA | NA | NA | -## macs2 options: macs2_narrow, macs2_broad -### NOTE: DESeq step generally fails for broadPeak; generally has too many calls. - -# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent| Non-Normalized, Relaxed | -# | SEACR | NA | NA | AVAILABLE w/o SPIKEIN | AVAILABLE w/o SPIKEIN | AVAILABLE w/ SPIKEIN | AVAILABLE w/ SPIKEIN | -## seacr options: seacr_stringent, seacr_relaxed - -# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | -# | GoPeaks | AVAILABLE | AVAILABLE | NA | NA | NA | NA | -## gopeaks options: gopeaks_narrow, gopeaks_broad - -# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | -# | Macs2 | AVAILABLE | AVAILABLE | NA | NA | NA | NA | -# | SEACR | NA | NA | AVAILABLE w/o SPIKEIN | AVAILABLE w/o SPIKEIN | AVAILABLE w/ SPIKEIN | AVAILABLE w/ SPIKEIN | -# | GoPeaks | AVAILABLE | AVAILABLE | NA | NA | NA | NA | -## complete list: -### peaktype: "macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad" -peaktype: "macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad" - -## macs2 additional option -### macs2 can be run with or without the control. 
adding a control will increase peak specificity -### default is "N"; selecting "Y" will run the paired control sample provided in the sample manifest -macs2_control: "N" - -# qvalues -## thresholds to be used for peak callers -## must be a list of comma separated values. minimum of numeric value required. -### default MACS2 qvalue is 0.05 https://manpages.ubuntu.com/manpages/xenial/man1/macs2_callpeak.1.html -### default GOPEAKS qvalue is 0.05 https://github.com/maxsonBraunLab/gopeaks/blob/main/README.md; [link to issue] -### default SEACR FDR threshold 1 https://github.com/FredHutch/SEACR/blob/master/README.md -# quality_thresholds: "0.1, 0.05" -quality_thresholds: "0.05" - -## MACS2, broad-peaks specific, quality threshold -### if broadPeak is seleted as a 'peaktype', an additional quality threshold can be used -macs2_broad_peak_threshold: "0.01" - -# annotations -## rose parameters -stitch_distance: 12500 -tss_distance: 2500 - -## homer -motif_size: 1000 -preparsedDir: "/data/CCBR_Pipeliner/db/PipeDB/homer/preparsedDir" - -## GO Enrichment -## enrichment analysis can be performed on hg19 or hg38 samples -## one option may be chosen for each project -geneset_id: "GOBP" # ["GOBP" "GOCC" "GOMF" "KEGG"] - -##################################################################################### -# References -# NOTE: "gtf" is only required if TxDb is not avaiable for the species in -# Bioconductor eg. hs1 -##################################################################################### -# references: -reference: - hg38: - fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/hg38.fa" - gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/genes.gtf" - blacklist: "PIPELINE_HOME/resources/blacklistbed/hg38.bed" - regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" - macs2_g: "hs" - tss_bed: "PIPELINE_HOME/resources/tss_bed/hg38.tss.bed" - rose: "WORKDIR/annotation/hg38_refseq.ucsc" - hg19: - fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/hg19.fa" - gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/genes.gtf" - blacklist: "PIPELINE_HOME/resources/blacklistbed/hg19.bed" - regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" - macs2_g: "hs" - tss_bed: "PIPELINE_HOME/resources/tss_bed/hg19.tss.bed" - rose: "WORKDIR/annotation/hg19_refseq.ucsc" - mm10: - fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/mm10.fa" - gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/genes.gtf" - blacklist: "PIPELINE_HOME/resources/blacklistbed/mm10.bed" - regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY" - macs2_g: "mm" - tss_bed: "PIPELINE_HOME/resources/tss_bed/mm10.tss.bed" - rose: "WORKDIR/annotation/mm10_refseq.ucsc" - hs1: - fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/hs1.fa" - gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf" - blacklist: "PIPELINE_HOME/resources/blacklistbed/hs1.bed" - tss_bed: "PIPELINE_HOME/resources/tss_bed/hs1.tss.bed" - regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" - macs2_g: "3.1e+8" - rose: "WORKDIR/annotation/hs1_refseq.ucsc" -# ref: https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html -# used faCount from http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/ 
to get 3.1e+8 value above - -spikein_reference: - ecoli: - fa: "PIPELINE_HOME/resources/spikein/Ecoli_GCF_000005845.2_ASM584v2_genomic.fna" - drosophila: - fa: "/fdb/igenomes/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - saccharomyces: - fa: "PIPELINE_HOME/resources/spikein/S_cer_S288C_R64.fna" - -adapters: "PIPELINE_HOME/resources/other/adapters.fa" - -##################################################################################### -# CONTAINERS -##################################################################################### -containers: - base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" - carlisle_r: "docker://nciccbr/carlisle_r:v2" \ No newline at end of file +##################################################################################### +# Folders / Paths +##################################################################################### +# The working dir... output will be in the results subfolder of the workdir +workdir: "WORKDIR" + +# scripts directory +# by default, use the scripts copied to the working directory. +# alternatively, use the scripts from the pipeline source. +scriptsdir: "WORKDIR/scripts" +#scriptsdir: "PIPELINE_HOME/workflow/scripts" + +# tab delimited samples file .. see samplefile for format details +samplemanifest: "WORKDIR/config/samples.tsv" + +##################################################################################### +# User parameters +##################################################################################### +# run sample contrasts +run_contrasts: true # true or false, no quotes +contrasts: "WORKDIR/config/contrasts.tsv" # run_contrasts needs to be `true` +contrasts_fdr_cutoff: 0.05 +contrasts_lfc_cutoff: 0.59 # FC of 1.5 + +# these steps are long-running. use `true` if you would like to run them +run_go_enrichment: false +run_rose: false + +# reference +genome: "hg38" # currently supports hg38, hg19 and mm10. Custom genome can be added with appropriate additions to "reference" section below. + +# alignment quality threshold +mapping_quality: 2 #only report alignment records with mapping quality of at least N (>= N). + +# normalization method +## spikein: normalization will be performed based off of spike-in aligned read count; +## library: library normalization will be performed +## none: no norm will be performed +norm_method: "spikein" # method of normalization to be used; currently supports ["spikein","library","none"] +## if norm_method ="spikein" +spikein_genome: "ecoli" # must be species found in spikein_reference below +spikein_scale: 1000000 + +# user parameters for alignment +bowtie2_parameters: "--dovetail --phred33 --very-sensitive" +fragment_len_filter: "1000" + +# duplication status +## users can select duplicated peaks (dedup) or non-deduplicated peaks (no_dedup) +### dupstatus: "dedup" # means run deduplicated analysis only +### dupstatus: "no_dedup" # means run non-deduplicated analysis only +## complete list: +### dupstatus: "dedup, no_dedup" +dupstatus: "dedup, no_dedup" + +# which peaktypes to consider for differential analysis: +# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | +# | Macs2 | AVAILABLE | AVAILABLE | NA | NA | NA | NA | +## macs2 options: macs2_narrow, macs2_broad +### NOTE: DESeq step generally fails for broadPeak; generally has too many calls. 
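The config above exposes `bowtie2_parameters` and `mapping_quality` without showing how they are consumed downstream. As a hedged sketch only (this is not the pipeline's actual align rule; the index path and FASTQ names are placeholders), these two settings typically map onto a command of this shape:

```bash
# Illustrative sketch only: bowtie2_parameters are passed straight to the aligner,
# and mapping_quality becomes a MAPQ filter (keep records with MAPQ >= 2).
# The index path and FASTQ names are placeholders, not pipeline outputs.
bowtie2 --dovetail --phred33 --very-sensitive \
  -x /path/to/hg38_index \
  -1 sample.R1.fastq.gz -2 sample.R2.fastq.gz |
  samtools view -b -q 2 -o sample.filtered.bam -
```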
+ +# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent| Non-Normalized, Relaxed | +# | SEACR | NA | NA | AVAILABLE w/o SPIKEIN | AVAILABLE w/o SPIKEIN | AVAILABLE w/ SPIKEIN | AVAILABLE w/ SPIKEIN | +## seacr options: seacr_stringent, seacr_relaxed + +# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | +# | GoPeaks | AVAILABLE | AVAILABLE | NA | NA | NA | NA | +## gopeaks options: gopeaks_narrow, gopeaks_broad + +# | Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | +# | Macs2 | AVAILABLE | AVAILABLE | NA | NA | NA | NA | +# | SEACR | NA | NA | AVAILABLE w/o SPIKEIN | AVAILABLE w/o SPIKEIN | AVAILABLE w/ SPIKEIN | AVAILABLE w/ SPIKEIN | +# | GoPeaks | AVAILABLE | AVAILABLE | NA | NA | NA | NA | +## complete list: +### peaktype: "macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad" +peaktype: "macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad" + +## macs2 additional option +### macs2 can be run with or without the control. adding a control will increase peak specificity +### default is "N"; selecting "Y" will run the paired control sample provided in the sample manifest +macs2_control: "N" + +# qvalues +## thresholds to be used for peak callers +## must be a list of comma separated values. minimum of numeric value required. +### default MACS2 qvalue is 0.05 https://manpages.ubuntu.com/manpages/xenial/man1/macs2_callpeak.1.html +### default GOPEAKS qvalue is 0.05 https://github.com/maxsonBraunLab/gopeaks/blob/main/README.md; [link to issue] +### default SEACR FDR threshold 1 https://github.com/FredHutch/SEACR/blob/master/README.md +# quality_thresholds: "0.1, 0.05" +quality_thresholds: "0.05" + +## MACS2, broad-peaks specific, quality threshold +### if broadPeak is seleted as a 'peaktype', an additional quality threshold can be used +macs2_broad_peak_threshold: "0.01" + +# annotations +## rose parameters +stitch_distance: 12500 +tss_distance: 2500 + +## homer +motif_size: 1000 +preparsedDir: "/data/CCBR_Pipeliner/db/PipeDB/homer/preparsedDir" + +## GO Enrichment +## enrichment analysis can be performed on hg19 or hg38 samples +## one option may be chosen for each project +geneset_id: "GOBP" # ["GOBP" "GOCC" "GOMF" "KEGG"] + +##################################################################################### +# References +# NOTE: "gtf" is only required if TxDb is not avaiable for the species in +# Bioconductor eg. 
hs1 +##################################################################################### +# references: +reference: + hg38: + fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/hg38.fa" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/genes.gtf" + blacklist: "PIPELINE_HOME/resources/blacklistbed/hg38.bed" + regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" + macs2_g: "hs" + tss_bed: "PIPELINE_HOME/resources/tss_bed/hg38.tss.bed" + rose: "WORKDIR/annotation/hg38_refseq.ucsc" + hg19: + fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/hg19.fa" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hg19_basic/genes.gtf" + blacklist: "PIPELINE_HOME/resources/blacklistbed/hg19.bed" + regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" + macs2_g: "hs" + tss_bed: "PIPELINE_HOME/resources/tss_bed/hg19.tss.bed" + rose: "WORKDIR/annotation/hg19_refseq.ucsc" + mm10: + fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/mm10.fa" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/genes.gtf" + blacklist: "PIPELINE_HOME/resources/blacklistbed/mm10.bed" + regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY" + macs2_g: "mm" + tss_bed: "PIPELINE_HOME/resources/tss_bed/mm10.tss.bed" + rose: "WORKDIR/annotation/mm10_refseq.ucsc" + hs1: + fa: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/hs1.fa" + gtf: "/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf" + blacklist: "PIPELINE_HOME/resources/blacklistbed/hs1.bed" + tss_bed: "PIPELINE_HOME/resources/tss_bed/hs1.tss.bed" + regions: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" + macs2_g: "3.1e+8" + rose: "WORKDIR/annotation/hs1_refseq.ucsc" +# ref: https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html +# used faCount from http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/ to get 3.1e+8 value above + +spikein_reference: + ecoli: + fa: "PIPELINE_HOME/resources/spikein/Ecoli_GCF_000005845.2_ASM584v2_genomic.fna" + drosophila: + fa: "/fdb/igenomes/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" + saccharomyces: + fa: "PIPELINE_HOME/resources/spikein/S_cer_S288C_R64.fna" + +adapters: "PIPELINE_HOME/resources/other/adapters.fa" + +##################################################################################### +# CONTAINERS +##################################################################################### +containers: + base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" + carlisle_r: "docker://nciccbr/carlisle_r:v2" diff --git a/config/contrasts.tsv b/config/contrasts.tsv index e9ba67e..3d93b0f 100644 --- a/config/contrasts.tsv +++ b/config/contrasts.tsv @@ -1,2 +1,2 @@ condition1 condition2 -sample1 sample2 \ No newline at end of file +sample1 sample2 diff --git a/config/fqscreen_config.conf b/config/fqscreen_config.conf index 4101904..c4839c4 100644 --- a/config/fqscreen_config.conf +++ b/config/fqscreen_config.conf @@ -3,10 +3,10 @@ ###################### ## Bowtie or Bowtie2 # ###################### -## If the Bowtie1/2 binary is not in your PATH then you can +## If the Bowtie1/2 binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. 
-## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BOWTIE /usr/local/bin/bowtie/bowtie @@ -16,10 +16,10 @@ ########################################### ## Bismark (for bisulfite sequencing only)# ########################################### -## If the Bismark binary is not in your PATH then you can +## If the Bismark binary is not in your PATH then you can ## set this value to tell the program where to find it. ## Uncomment the line below and set the appropriate location. -## Please note, this path should include the executable +## Please note, this path should include the executable ## filename. #BISMARK /usr/local/bin/bismark/bismark @@ -40,26 +40,26 @@ THREADS 24 ## This section allows you to configure multiple databases ## to search against in your screen. For each database ## you need to provide a database name (which can't contain -## spaces) and the location of the bowtie indices which +## spaces) and the location of the bowtie indices which ## you created for that database. -## -## The entries shown below are only suggested examples, you +## +## The entries shown below are only suggested examples, you ## can add as many DATABASE sections as required, and you ## can comment out or remove as many of the existing entries ## as desired. ## ## Either the original bowtie or bowtie2 may be used for the -## mapping. Specify the aligner to use with the command line -## flag --aligner with arguments 'bowtie' or +## mapping. Specify the aligner to use with the command line +## flag --aligner with arguments 'bowtie' or ## 'bowtie2' (default). -## -## The configuration file may list paths to both bowtie and +## +## The configuration file may list paths to both bowtie and ## bowtie2 indices. FastQ Screen automatically detects whether -## a specified index is compatible with bowtie or bowtie2. +## a specified index is compatible with bowtie or bowtie2. ## -## Although the configuration file may list paths to both -## bowtie and bowtie2 indices, only one aligner will be used -## for the mapping, as specified by the --aligner flag. +## Although the configuration file may list paths to both +## bowtie and bowtie2 indices, only one aligner will be used +## for the mapping, as specified by the --aligner flag. 
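The comments above note that the aligner is chosen with the `--aligner` flag and that this conf file supplies the DATABASE entries. A minimal illustrative invocation against this configuration might look like the following; the input FASTQ name is an assumed placeholder, and the pipeline itself drives FastQ Screen through its own rule:

```bash
# Illustrative only: screen one FASTQ against the databases defined in this conf,
# explicitly selecting bowtie2 as the aligner. The FASTQ name is a placeholder.
fastq_screen --conf config/fqscreen_config.conf --aligner bowtie2 sample.R1.fastq.gz
```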
## ## The path to the index files SHOULD INCLUDE THE BASENAME of ## the index, e.g: @@ -72,8 +72,8 @@ THREADS 24 ## ## Human - sequences available from ## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ -DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg38/hg38 -DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm10/mm10 +DATABASE Human /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/hg38/hg38 +DATABASE Mouse /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/mm10/mm10 DATABASE Bacteria /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Bacteria/bacteria DATABASE Fungi /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Fungi/fungi -DATABASE Virus /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Virus/virus \ No newline at end of file +DATABASE Virus /data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/Virus/virus diff --git a/config/multiqc_config.yaml b/config/multiqc_config.yaml index adb6d41..292be60 100644 --- a/config/multiqc_config.yaml +++ b/config/multiqc_config.yaml @@ -1,10 +1,10 @@ #Intro title information title: "CUTRUN Analysis Report" report_header_info: - - FASTQ Analysis: 'Analysis includes basic QC metrics' - - FASTQScreen Analysis: 'Analysis includes human, mouse, and bacteria' - - SAMStats Analysis: 'Analysis included flagstats and idxstats' - - GoPeaks: 'Analysis includes the number of peaks called per sample via the general table and the bar plot' + - FASTQ Analysis: "Analysis includes basic QC metrics" + - FASTQScreen Analysis: "Analysis includes human, mouse, and bacteria" + - SAMStats Analysis: "Analysis included flagstats and idxstats" + - GoPeaks: "Analysis includes the number of peaks called per sample via the general table and the bar plot" #include fastqscreen fastqscreen_simpleplot: true @@ -14,14 +14,14 @@ show_analysis_paths: False #find samples sp: - fastq_screen: - fn: '*_screen.txt' - gopeaks: - fn: '*gopeaks.json' + fastq_screen: + fn: "*_screen.txt" + gopeaks: + fn: "*gopeaks.json" module_order: -# Sample Quality control - - 'fastqc' - - 'fastq_screen' - - 'samtools' - - 'gopeaks' \ No newline at end of file + # Sample Quality control + - "fastqc" + - "fastq_screen" + - "samtools" + - "gopeaks" diff --git a/config/samples.tsv b/config/samples.tsv index 56c314d..4cc4e66 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -4,4 +4,4 @@ sample1 2 N igg1 2 sample2 1 N igg2 1 igg1 1 Y - - igg1 2 Y - - -igg2 1 Y - - \ No newline at end of file +igg2 1 Y - - diff --git a/docs/requirements.txt b/docs/requirements.txt index e63c63e..9d5ddac 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -7,4 +7,4 @@ mkdocs-git-revision-date-plugin==0.3.2 #https://pypi.org/project/mkdocs-material/ mkdocs-material==9.1.6 #https://pypi.org/project/mkdocs-material-extensions/ -mkdocs-material-extensions==1.1.1 \ No newline at end of file +mkdocs-material-extensions==1.1.1 diff --git a/docs/user-guide/contributions.md b/docs/user-guide/contributions.md index a05202e..ed0b68f 100644 --- a/docs/user-guide/contributions.md +++ b/docs/user-guide/contributions.md @@ -1,4 +1,5 @@ # Contributions + The following members contributed to the development of the CARLISLE pipeline: - [Samantha Sevilla](https://github.com/slsevilla) @@ -8,4 +9,4 @@ The following members contributed to the development of the CARLISLE pipeline: - [Yue Hu](https://github.com/YueHu-002) - Vassiliki Saloura -VK, SS, SK, HC contributed to the generating the source code and all members contributed to the main concepts and analysis. 
\ No newline at end of file +VK, SS, SK, HC contributed to generating the source code and all members contributed to the main concepts and analysis. diff --git a/docs/user-guide/getting-started.md b/docs/user-guide/getting-started.md index 1274913..329bda2 100644 --- a/docs/user-guide/getting-started.md +++ b/docs/user-guide/getting-started.md @@ -1,9 +1,11 @@ # Overview + The CARLISLE github repository is stored locally, and will be used for project deployment. Multiple projects can be deployed from this one point simultaneously, without concern. ## 1. Getting Started ## 1.1 Introduction + The CARLISLE Pipeline begins with raw FASTQ files and performs trimming followed by alignment using [BOWTIE2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). Data is then normalized through either the use of a user-specified species (IE E.Coli) spike-in control or through the determined library size. Peaks are then called using [MACS2](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html), [SEACR](https://github.com/FredHutch/SEACR), and [GoPEAKS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02707-w) with various options selected by the user. Peaks are then annotated, and summarized into reports. If designated, differential analysis is performed using [DESEQ2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html). QC reports are also generated with each project using [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MULTIQC](https://multiqc.info/). Annotations are added using [HOMER](http://homer.ucsd.edu/homer/ngs/annotation.html) and [ROSE](https://github.com/stjude/ROSE). GSEA Enrichment analysis predictions are added using [CHIPENRICH](https://bioconductor.org/packages/devel/bioc/vignettes/chipenrich/inst/doc/chipenrich-vignette.html). The following are sub-commands used within CARLISLE: @@ -19,6 +21,7 @@ The following are sub-commands used within CARLISLE: - runtest: copies test manifests and files to WORKDIR ## 1.2 Setup Dependencies + CARLISLE has several dependencies listed below. These dependencies can be installed by a sysadmin. All dependencies will be automatically loaded if running from Biowulf. - bedtools: "bedtools/2.30.0" @@ -42,13 +45,16 @@ CARLISLE has several dependencies listed below. These dependencies can be instal - ucsc: "ucsc/407" ## 1.3 Login to the cluster + CARLISLE has been exclusively tested on Biowulf HPC. Login to the cluster's head node and move into the pipeline location. + ``` # ssh into cluster's head node ssh -Y $USER@biowulf.nih.gov ``` -## 1.4 Load an interactive session +## 1.4 Load an interactive session + An interactive session should be started before performing any of the pipeline sub-commands, even if the pipeline is to be executed on the cluster. ``` diff --git a/docs/user-guide/output.md b/docs/user-guide/output.md index 0f8de72..7e48447 100644 --- a/docs/user-guide/output.md +++ b/docs/user-guide/output.md @@ -1,18 +1,19 @@ # 4.
Expected Outputs + The following directories are created under the WORKDIR/results directory: - alignment_stats: this directory include information on the alignment of each sample - bam: this directory includes BAM files, statistics on samples, statistics on spike-in controls for each sample - bedgraph: this directory includes BEDGRAPH files and statistic summaries for each sample -- bigwig: this directory includes the bigwig files for each sample +- bigwig: this directory includes the bigwig files for each sample - peaks: this directory contains a sub-directory that relates to the quality threshold used. - - quality threshold - - contrasts: this directory includes the contrasts for each line listed in the contrast manifest - - peak_caller: this directory includes all peak calls from each peak_caller (SEACR, MACS2, GOPEAKS) for each sample - - annotation - - go_enrichment: this directory includes gene set enrichment pathway predictions when `run_go_enrichment` is set to `true` in the config file. - - homer: this directory includes the annotation output from HOMER - - rose: this directory includes the annotation output from ROSE when `run_rose` is set to `true` in the config file. + - quality threshold + - contrasts: this directory includes the contrasts for each line listed in the contrast manifest + - peak_caller: this directory includes all peak calls from each peak_caller (SEACR, MACS2, GOPEAKS) for each sample + - annotation + - go_enrichment: this directory includes gene set enrichment pathway predictions when `run_go_enrichment` is set to `true` in the config file. + - homer: this directory includes the annotation output from HOMER + - rose: this directory includes the annotation output from ROSE when `run_rose` is set to `true` in the config file. - qc: this directory includes MULTIQC reports and spike-in control reports (when applicable) ``` @@ -117,4 +118,4 @@ The following directories are created under the WORKDIR/results directory: └── qc ├── fastqc_raw └── fqscreen_raw -``` \ No newline at end of file +``` diff --git a/docs/user-guide/preparing-files.md b/docs/user-guide/preparing-files.md index 547738a..93be694 100644 --- a/docs/user-guide/preparing-files.md +++ b/docs/user-guide/preparing-files.md @@ -1,7 +1,9 @@ # 2. Preparing Files + The pipeline is controlled through editing configuration and manifest files. Defaults are found in the /WORKDIR/config and /WORKDIR/manifest directories, after initialization. ## 2.1 Configs + The configuration files control parameters and software of the pipeline. These files are listed below: - config/config.yaml @@ -9,26 +11,32 @@ The configuration files control parameters and software of the pipeline. These f - resources/tools.yaml ### 2.1.1 Cluster Config + The cluster configuration file dictates the resouces to be used during submission to Biowulf HPC. There are two differnt ways to control these parameters - first, to control the default settings, and second, to create or edit individual rules. These parameters should be edited with caution, after significant testing. ### 2.1.2 Tools Config + The tools configuration file dictates the version of each software or program that is being used in the pipeline. ### 2.1.3 Config YAML + There are several groups of parameters that are editable for the user to control the various aspects of the pipeline. These are : - Folders and Paths - - These parameters will include the input and ouput files of the pipeline, as well as list all manifest names. 
+ - These parameters will include the input and ouput files of the pipeline, as well as list all manifest names. - User parameters - - These parameters will control the pipeline features. These include thresholds and whether to perform processes. + - These parameters will control the pipeline features. These include thresholds and whether to perform processes. - References - - These parameters will control the location of index files, spike-in references, adaptors and species calling information. + - These parameters will control the location of index files, spike-in references, adaptors and species calling information. + +#### 2.1.3.1 User Parameters -#### 2.1.3.1 User Parameters ##### 2.1.3.1.1 (Spike in Controls) + The pipeline allows for the use of a species specific spike-in control, or the use of normalization via library size. The parameter `spikein_genome` should be set to the species term used in `spikein_reference`. For example for ecoli spike-in: + ``` run_contrasts: true norm_method: "spikein" @@ -40,6 +48,7 @@ spikein_reference: ``` For example for drosophila spike-in: + ``` run_contrasts: true norm_method: "spikein" @@ -51,39 +60,49 @@ spikein_reference: ``` If it's determined that the amount of spike-in is not sufficient for the run, a library normaliaztion can be performed. + 1. Complete a CARLISLE run with spike-in set to "Y". This will allow for the complete assessment of the spike-in. 2. Run inital QC analysis on the output data 3. Add the alignment_stats dir to the configuration file. -4. Re-run the CARLISLE pipeline +4. Re-run the CARLISLE pipeline ##### 2.1.3.1.2 Duplication Status + Users can select duplicated peaks (dedup) or non-deduplicated peaks (no_dedup) through the user parameter. + ``` -dupstatus: "dedup, no_dedup" +dupstatus: "dedup, no_dedup" ``` ##### 2.1.3.1.3 Peak Caller + Three peak callers are available for deployment within the pipeline, with different settings deployed for each caller. 1. [MACS2](https://github.com/macs3-project/MACS) is available with two peak calling options: narrowPeak or broadPeak. NOTE: DESeq step generally fails for broadPeak; generally has too many calls. + ``` peaktype: "macs2_narrow, macs2_broad," ``` + 2. [SEACR](https://github.com/FredHutch/SEACR) is available with four peak calling options: stringent or relaxed parameters, to be paired with "norm" for samples without a spike-in control and "non" for samples with a spikein control + ``` peaktype: "seacr_stringent, seacr_relaxed" ``` + 3. 
[GOPEAKS](https://github.com/maxsonBraunLab/gopeaks) is available with two peak calling options: narrowpeaks or broadpeaks + ``` peaktype: "gopeaks_narrow, gopeaks_broad" ``` + A complete list of the available peak calling parameters and the recommended list of parameters is provided below: -| Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | -| --- | --- | --- |--- |--- |--- |--- | -| Macs2 | AVAIL | AVAIL | NA | NA | NA | NA | -| SEACR | NA | NA | AVAIL w/o SPIKEIN | AVAIL w/o SPIKEIN | AVAIL w/ SPIKEIN | AVAIL w/ SPIKEIN | -| GoPeaks | AVAIL | AVAIL | NA | NA | NA | NA | +| Peak Caller | Narrow | Broad | Normalized, Stringent | Normalized, Relaxed | Non-Normalized, Stringent | Non-Normalized, Relaxed | +| ----------- | ------ | ----- | --------------------- | ------------------- | ------------------------- | ----------------------- | +| Macs2 | AVAIL | AVAIL | NA | NA | NA | NA | +| SEACR | NA | NA | AVAIL w/o SPIKEIN | AVAIL w/o SPIKEIN | AVAIL w/ SPIKEIN | AVAIL w/ SPIKEIN | +| GoPeaks | AVAIL | AVAIL | NA | NA | NA | NA | ``` # Recommended list @@ -94,22 +113,26 @@ A complete list of the available peak calling parameters and the recommended lis ``` ##### 2.1.3.1.3.1 Macs2 additional option + MACS2 can be run with or without the control. adding a control will increase peak specificity Selecting "Y" for the `macs2_control` will run the paired control sample provided in the sample manifest ##### 2.1.3.1.4 Quality Tresholds + Thresholds for quality can be controled through the `quality_tresholds` parameter. This must be a list of comma separated values. minimum of numeric value required. - default MACS2 qvalue is 0.05 https://manpages.ubuntu.com/manpages/xenial/man1/macs2_callpeak.1.html - default GOPEAKS pvalue is 0.05 https://github.com/maxsonBraunLab/gopeaks/blob/main/README.md - default SEACR FDR threshold 1 https://github.com/FredHutch/SEACR/blob/master/README.md + ``` #default values quality_thresholds: "0.1, 0.05, 0.01" ``` #### 2.1.3.2 References -Additional reference files may be added to the pipeline, if other species were to be used. + +Additional reference files may be added to the pipeline, if other species were to be used. The absolute file paths which must be included are: @@ -119,15 +142,17 @@ The absolute file paths which must be included are: The following information must be included: 1. regions: "list of regions to be included; IE chr1 chr2 chr3" -2. macs2_g: "macs2 genome shorthand; IE mm IE hs" +2. macs2_g: "macs2 genome shorthand; IE mm IE hs" ## 2.2 Preparing Manifests + There are two manifests, one which required for all pipeliens and one that is only required if running a differential analysis. These files describe information on the samples and desired contrasts. The paths of these files are defined in the snakemake_config.yaml file. These files are: - samplemanifest - contrasts ### 2.2.1 Samples Manifest (REQUIRED) + This manifest will include information to sample level information. It includes the following column headers: - sampleName: the sample name WITHOUT replicate number (IE "SAMPLE") @@ -140,23 +165,22 @@ This manifest will include information to sample level information. 
It includes An example sampleManifest file is shown below: - -| sampleName| replicateNumber| isControl| controlName| controlReplicateNumber| path_to_R1| path_to_R2 -| --- |--- |--- |--- |--- |--- |--- | -| 53_H3K4me3| 1| N| HN6_IgG_rabbit_negative_control| 1| PIPELINE_HOME/.test/53_H3K4me3_1.R1.fastq.gz| PIPELINE_HOME/.test/53_H3K4me3_1.R2.fastq.gz -| 53_H3K4me3| 2| N| HN6_IgG_rabbit_negative_control| 1| PIPELINE_HOME/.test/53_H3K4me3_2.R1.fastq.gz| PIPELINE_HOME/.test/53_H3K4me3_2.R2.fastq.gz -| HN6_H3K4me3| 1| N| HN6_IgG_rabbit_negative_control| 1| PIPELINE_HOME/.test/HN6_H3K4me3_1.R1.fastq.gz| PIPELINE_HOME/.test/HN6_H3K4me3_1.R2.fastq.gz -| HN6_H3K4me3| 2| N| HN6_IgG_rabbit_negative_control| 1| PIPELINE_HOME/.test/HN6_H3K4me3_2.R1.fastq.gz| PIPELINE_HOME/.test/HN6_H3K4me3_2.R2.fastq.gz -| HN6_IgG_rabbit_negative_control| 1| Y| -| -| PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz| PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz - +| sampleName | replicateNumber | isControl | controlName | controlReplicateNumber | path_to_R1 | path_to_R2 | +| ------------------------------- | --------------- | --------- | ------------------------------- | ---------------------- | ----------------------------------------------------------------- | ----------------------------------------------------------------- | +| 53_H3K4me3 | 1 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/53_H3K4me3_1.R1.fastq.gz | PIPELINE_HOME/.test/53_H3K4me3_1.R2.fastq.gz | +| 53_H3K4me3 | 2 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/53_H3K4me3_2.R1.fastq.gz | PIPELINE_HOME/.test/53_H3K4me3_2.R2.fastq.gz | +| HN6_H3K4me3 | 1 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/HN6_H3K4me3_1.R1.fastq.gz | PIPELINE_HOME/.test/HN6_H3K4me3_1.R2.fastq.gz | +| HN6_H3K4me3 | 2 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/HN6_H3K4me3_2.R1.fastq.gz | PIPELINE_HOME/.test/HN6_H3K4me3_2.R2.fastq.gz | +| HN6_IgG_rabbit_negative_control | 1 | Y | - | - | PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz | PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz | ### 2.2.2 Contrast Manifest (OPTIONAL) + This manifest will include the sample information needed to perform differential comparisons. An example contrast file: -| condition1 | condition2 | -| --- | --- | +| condition1 | condition2 | +| ----------------------- | -------------------- | | MOC1_siSmyd3_2m_25_HCHO | MOC1_siNC_2m_25_HCHO | **Note**: you must have more than one sample per condition in order to perform differential analysis with DESeq2 diff --git a/docs/user-guide/run.md b/docs/user-guide/run.md index e08f634..bd03a45 100644 --- a/docs/user-guide/run.md +++ b/docs/user-guide/run.md @@ -1,7 +1,9 @@ # 3. Running the Pipeline ## 3.1 Pipeline Overview + The Snakemake workflow has multiple options: + ``` Usage: bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle -m/--runmode= -w/--workdir= 1. RUNMODE: [Type: String] Valid options: @@ -16,30 +18,34 @@ Usage: bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle -m/--runmode= NOTE: Previously, the following files were used as the tss BEDs but these: -> * are 1-based (BEDs are 0-based) -> * have 5 columns (BEDs have 3 or 6 or 12 columns) -> * column 5 is strand (BEDs have column 6 reserved for strand) -> -> Hence, we have replaced these with the new BED files created using the above commands.
+> +> - are 1-based (BEDs are 0-based) +> - have 5 columns (BEDs have 3 or 6 or 12 columns) +> - column 5 is strand (BEDs have column 6 reserved for strand) +> +> Hence, we have replaced these with the new BED files created using the above commands. +> > #### hg38.tss.bed source +> > https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/ref/hg38.tss.bed +> > #### hg19.tss.bed source +> > https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/ref/hg19.tss.bed diff --git a/resources/tss_bed/gtf2tssBed.py b/resources/tss_bed/gtf2tssBed.py index 6dbbdf5..4568d0d 100755 --- a/resources/tss_bed/gtf2tssBed.py +++ b/resources/tss_bed/gtf2tssBed.py @@ -2,46 +2,58 @@ import argparse import os + def get_gene_name(s): - s=list(map(lambda x:x.replace(";","").replace("\"",""), s.strip().split(" "))) - for i,v in enumerate(s): - if v == "gene_name": - gene_name = s[i+1] - return gene_name - elif v == "gene": - gene_name = s[i+1] - return gene_name - else: - return "Unknown" + s = list(map(lambda x: x.replace(";", "").replace('"', ""), s.strip().split(" "))) + for i, v in enumerate(s): + if v == "gene_name": + gene_name = s[i + 1] + return gene_name + elif v == "gene": + gene_name = s[i + 1] + return gene_name + else: + return "Unknown" + parser = argparse.ArgumentParser() parser.add_argument("--gtf", help="GTF input file", required=True) parser.add_argument("--bed", help="TSS BED output file", required=True) args = parser.parse_args() -def get_gene_start(start,end,strand): - if strand == "+": - return int(start)-1 - else: - return int(end)-1 + +def get_gene_start(start, end, strand): + if strand == "+": + return int(start) - 1 + else: + return int(end) - 1 + if not os.path.exists(args.gtf): - print("%s : GTF File is missing!"%(args.gtf)) - exit() + print("%s : GTF File is missing!" 
% (args.gtf)) + exit() -gtf = open(args.gtf,'r') -bed = open(args.bed,'w') +gtf = open(args.gtf, "r") +bed = open(args.bed, "w") for l in gtf.readlines(): - if l.startswith("#"): continue - l = l.split("\t") - if l[2] == "gene": - gene_name = get_gene_name(l[8]) - chrom = l[0] - start = l[3] - end = l[4] - strand = l[6] - gene_start = get_gene_start(start,end,strand) - bed_line = [chrom, str(gene_start), str(gene_start+1),gene_name,str(0),strand] - bed.write("%s\n"%("\t".join(bed_line))) + if l.startswith("#"): + continue + l = l.split("\t") + if l[2] == "gene": + gene_name = get_gene_name(l[8]) + chrom = l[0] + start = l[3] + end = l[4] + strand = l[6] + gene_start = get_gene_start(start, end, strand) + bed_line = [ + chrom, + str(gene_start), + str(gene_start + 1), + gene_name, + str(0), + strand, + ] + bed.write("%s\n" % ("\t".join(bed_line))) gtf.close() bed.close() diff --git a/workflow/Snakefile b/workflow/Snakefile index 9654dd4..75232e7 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -99,7 +99,7 @@ def run_contrasts(wildcards): files=[] if config["run_contrasts"]: files.append(join(RESULTSDIR,"replicate_sample.tsv")) - + # inputs for matrix inputs=expand(join(RESULTSDIR,"peaks","{qthresholds}","contrasts","{contrast_list}.{dupstatus}","{contrast_list}.{dupstatus}.{peak_caller_type}.txt"),qthresholds=QTRESHOLDS, contrast_list=CONTRAST_LIST,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE) files.extend(inputs) @@ -159,7 +159,7 @@ def get_enrich(wildcards): def get_combined(wildcards): files = [] - if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) files.extend(combined_m) return files @@ -213,7 +213,7 @@ rule all: ########################################## # manifests unpack(run_pipe_prep), - + # ALIGN / deeptools_bw unpack(run_deeptools_bw), @@ -231,7 +231,7 @@ rule all: # ALIGN / alignstats expand(join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"),replicate=REPLICATES), - + # ALIGN / gather_alignstats join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"), @@ -247,7 +247,7 @@ rule all: # DIFF / create_contrast_data_files unpack(run_contrasts), - + # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment unpack(get_motifs), unpack(get_enrich), @@ -302,11 +302,11 @@ onerror: # ALIGN / alignstats join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"), - + # ALIGN / gather_alignstats join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"), - # ALIGN / create_library_norm_scales + # ALIGN / create_library_norm_scales join(RESULTSDIR,"alignment_stats","library_scale.tsv"), # ALIGN / bam2bg diff --git a/workflow/envs/dummy_env.yaml b/workflow/envs/dummy_env.yaml index 8b9de65..8724513 100644 --- a/workflow/envs/dummy_env.yaml +++ b/workflow/envs/dummy_env.yaml @@ -1,3 +1,3 @@ # conda environments needed for the workflow should be stored here. # An alternative to conda environments, is containers (singularity or docker) -# Check https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html?highlight=conda#containerization-of-conda-based-workflows for details. 
\ No newline at end of file +# Check https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html?highlight=conda#containerization-of-conda-based-workflows for details. diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index ca3e533..c20451e 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -37,7 +37,7 @@ rule trim: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -62,7 +62,7 @@ rule trim: rule align: """ Align using bowtie: - * use --dovetail option via "bowtie2_parameters" in config.yaml. This is recommended for + * use --dovetail option via "bowtie2_parameters" in config.yaml. This is recommended for Cut and Run where overlapping R1 and R2 alignments are expected BAM is sorted and indexed, and stats are collected using flagstat and idxstats """ @@ -88,7 +88,7 @@ rule align: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -127,7 +127,7 @@ rule filter: bamidxstats=join(RESULTSDIR,"bam","{replicate}.{dupstatus}.bam.idxstats"), params: replicate = "{replicate}", - dupstatus = "{dupstatus}", # can be "dedup" or "no_dedup" + dupstatus = "{dupstatus}", # can be "dedup" or "no_dedup" fragment_len_filter = config["fragment_len_filter"], pyscript = join(SCRIPTSDIR,"_filter_bam.py"), memg = getmemg("filter") @@ -141,7 +141,7 @@ rule filter: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -158,7 +158,7 @@ rule filter: --TMP_DIR ${{TMPDIR}}/{params.replicate}_{params.dupstatus}_picardtmp \\ --CREATE_INDEX true \\ --METRICS_FILE {output.bam}.dupmetrics - + python {params.pyscript} --inputBAM ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.filtered.tmp1.bam \\ --outputBAM ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.filtered.bam \\ --fragmentlength {params.fragment_len_filter} \\ @@ -168,12 +168,12 @@ rule filter: # deduplicate only the spikeins genome_regions=$(cut -f1 {input.genome_len} | tr '\\n' ' ') samtools view -@{threads} -b {input.bam} $genome_regions | \\ - samtools sort -@{threads} -T $TMPDIR -o ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.genome.bam - + samtools sort -@{threads} -T $TMPDIR -o ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.genome.bam - spikein_regions=$(cut -f1 {input.spikein_len} | tr '\\n' ' ') samtools view -@{threads} -b {input.bam} $spikein_regions | \\ - samtools sort -@{threads} -T $TMPDIR -o ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.spikein.bam - + samtools sort -@{threads} -T $TMPDIR -o ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.spikein.bam + mkdir -p ${{TMPDIR}}/{params.replicate}_{params.dupstatus}_picardtmp java -Xmx{params.memg} -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ --INPUT ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.spikein.bam \\ @@ -321,7 +321,7 @@ rule bam2bg: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -333,16 +333,16 @@ rule bam2bg: echo "No spike-in scale was used" spikein_scale=1 elif [[ "{params.spikein}" == "LIBRARY" ]];then - spikein_scale=`cat {input.library_file} | grep {params.replicate} 
| grep {params.dupstatus} | cut -f2 -d" " | head -n1` + spikein_scale=`cat {input.library_file} | grep {params.replicate} | grep {params.dupstatus} | cut -f2 -d" " | head -n1` echo "The spike-in is generated from the library size" else spikein_readcount=$(while read a b;do awk -v a=$a '{{if ($1==a) {{print $3}}}}' {input.bamidxstats};done < {input.spikein_len} | awk '{{sum=sum+$1}}END{{print sum}}') - + # if the spikein_readcount is below threshold, then there is not enough of the spikein control to use total_count=$(awk '{{sum+=$3; sum+=$4;}}END{{print sum;}}' {input.bamidxstats}) spikein_percent=`echo "scale=6 ; $spikein_readcount / $total_count * 100" | bc`;\ - if [[ $spikein_percent < 0.001 ]]; then + if [[ $spikein_percent < 0.001 ]]; then echo "The spikein percentage of {input.bam} was below the threshold (0.001%) at $spikein_percent%. The spikein_scale was set to 1." spikein_scale=1 else @@ -353,12 +353,12 @@ rule bam2bg: # create fragments file samtools view -b -@{threads} {input.bam} {params.regions} | samtools sort -n -@{threads} -T $TMPDIR -o ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.bam - bedtools bamtobed -bedpe -i ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.bam > ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.bed - + awk -v fl={params.fragment_len_filter} '{{ if ($1==$4 && $6-$2 < fl) {{print $0}}}}' ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.bed > ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.clean.bed - + cut -f 1,2,6 ${{TMPDIR}}/{params.replicate}.{params.dupstatus}.clean.bed | \\ LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n -k3,3n > {output.fragments_bed} - + # run bedtools bedtools genomecov -bg -scale $spikein_scale -i {output.fragments_bed} -g {input.genome_len} > {output.bg} @@ -483,7 +483,7 @@ rule cov_correlation: pearson_plot=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.png"), pca=join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"), hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"), - pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png") + pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png") params: rscript=join(SCRIPTSDIR,"_plot_correlation.R"), dupstatus="{dupstatus}" @@ -506,7 +506,7 @@ rule cov_correlation: -o {output.pearson_plot} \ --removeOutliers \ --outFileCorMatrix {output.pearson_corr} - + # Plot PCA plotPCA -in {output.counts} --outFileNameData {output.pca} diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index 97b966a..3ead6d0 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -28,14 +28,14 @@ rule create_contrast_data_files: """ Output file should include 6 columns 1)condition 2)sample - 53_H3K4me3 53_H3K4me3_1 - + 53_H3K4me3 53_H3K4me3_1 + 3)bed /results/peaks/gopeaks/53_H3K4me3_1_vs_HN6_IgG_rabbit_negative_control_1.dedup.broadGo_peaks.bed - + 4)bedgraph 5)scaling factor - /results/bedgraph/53_H3K4me3_1.dedup.bedgraph 86.32596685082872928176 - + /results/bedgraph/53_H3K4me3_1.dedup.bedgraph 86.32596685082872928176 + 6)bed /results/fragments/53_H3K4me3_1.dedup.fragments.bed """ @@ -130,7 +130,7 @@ rule make_counts_matrix: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -172,7 +172,7 @@ rule DESeq: """ set -exo pipefail dirname=$(basename $(mktemp)) - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d 
"/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID/$dirname" else TMPDIR="/dev/shm/$dirname" @@ -281,7 +281,7 @@ rule diffbb: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -290,7 +290,7 @@ rule diffbb: fi ## add check to ensure that DESEQ2 ran to completion - ## mainly used in tinytest scenarios, but also used if + ## mainly used in tinytest scenarios, but also used if ## Nsamples/group is 1 check=`wc -c {input.results} | cut -f1 -d" "` if [[ $check == 0 ]]; then diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk index e4e729a..8e99fdd 100644 --- a/workflow/rules/init.smk +++ b/workflow/rules/init.smk @@ -12,7 +12,7 @@ pp = pprint.PrettyPrinter(indent=4) ######################################################### ######################################################### -# FILE-ACTION FUNCTIONS +# FILE-ACTION FUNCTIONS ######################################################### def check_existence(filename): if not os.path.exists(filename): @@ -42,7 +42,7 @@ def get_file_size(filename): ######################################################### CONFIGFILE = str(workflow.overwrite_configfiles[0]) -# set memory limit +# set memory limit # used for sambamba sort, etc MEMORYG="100G" @@ -121,7 +121,7 @@ for i,t in enumerate(list(df[df['isControl']=="N"]['replicateName'].unique())): if not c in REPLICATES: print("# Control NOT found for sampleName_replicateNumber:"+t) print("# "+config["samplemanifest"]+" has no entry for sample:"+crow.controlName+" replicateNumber:"+str(crow.controlReplicateNumber)) - exit() + exit() print("## "+str(i+1)+") "+t+" "+c) process_replicates.extend([t,c]) TREATMENTS.append(t) @@ -138,7 +138,7 @@ if len(process_replicates)!=len(REPLICATES): REPLICATES = process_replicates print("# Contrast manifest is confirmed!") -# write out the treatment:cntrl +# write out the treatment:cntrl fpath=join(RESULTSDIR,"treatment_control_list.txt") originalDF = pd.DataFrame(TREAT_to_CONTRL_DICT.items()).rename(columns={0: 'key', 1: 'val'}) split_keysDF = pd.DataFrame(originalDF['key'].str.split(':').tolist()) @@ -366,7 +366,7 @@ rule create_replicate_sample_table: print("# %s file created!"%(rg_file)) rule create_reference: - input: + input: genomefa_source=GENOMEFA, blacklist_source=GENOMEBLACKLIST, output: @@ -384,7 +384,7 @@ rule create_reference: spiked_source=SPIKED_GENOMEFA, spiked_output=join(BOWTIE2_INDEX,"spikein.fa"), refdata=refdata, - envmodules: + envmodules: TOOLS["bowtie2"], TOOLS["samtools"], TOOLS["bedtools"], @@ -392,7 +392,7 @@ rule create_reference: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -401,7 +401,7 @@ rule create_reference: fi # create dir and links - if [[ -d {params.bowtie2_dir} ]]; then rm -r {params.bowtie2_dir}; fi + if [[ -d {params.bowtie2_dir} ]]; then rm -r {params.bowtie2_dir}; fi mkdir -p {params.bowtie2_dir}/ref ln -s {input.genomefa_source} {output.genomefa} ln -s {input.blacklist_source} {output.blacklist} diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index 173fd12..add11a6 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -1,10 +1,10 @@ # TSV file should include 6 columns # 1)condition 2)sample -# 53_H3K4me3 53_H3K4me3_1 +# 53_H3K4me3 53_H3K4me3_1 # 3)bed # 
/results/peaks/gopeaks/53_H3K4me3_1_vs_HN6_IgG_rabbit_negative_control_1.dedup.broadGo_peaks.bed # 4)bedgraph 5)scaling factor -# /results/bedgraph/53_H3K4me3_1.dedup.bedgraph 86.32596685082872928176 +# /results/bedgraph/53_H3K4me3_1.dedup.bedgraph 86.32596685082872928176 # 6)bed # /results/fragments/53_H3K4me3_1.dedup.fragments.bed @@ -63,7 +63,7 @@ rule macs2_narrow: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -75,13 +75,13 @@ rule macs2_narrow: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set frag file treat_bed={params.frag_bed_path}/${{treatment}}.{params.dupstatus}.fragments.bed cntrl_bed={params.frag_bed_path}/${{control}}.{params.dupstatus}.fragments.bed # run with or without control - if [[ {params.control_flag} == "Y" ]]; then + if [[ {params.control_flag} == "Y" ]]; then file_base="${{treatment}}_vs_${{control}}.{params.dupstatus}" macs2 callpeak \\ @@ -98,7 +98,7 @@ rule macs2_narrow: --nomodel else file_base="${{treatment}}_vs_${{control}}.{params.dupstatus}" - + macs2 callpeak \\ -t ${{treat_bed}} \\ -f BED \\ @@ -111,12 +111,12 @@ rule macs2_narrow: --call-summits \\ --nomodel fi - + # mv output and rename for consistency mv $TMPDIR/${{file_base}}_peaks.narrowPeak {output.peak_file} mv $TMPDIR/${{file_base}}_summits.bed {output.summit_file} mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file} - + # create bigbed files, zip cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/narrow.bed bedToBigBed -type=bed3 ${{TMPDIR}}/narrow.bed {input.genome_len} ${{TMPDIR}}/narrow.bigbed @@ -153,7 +153,7 @@ rule macs2_broad: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -165,15 +165,15 @@ rule macs2_broad: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set frag file treat_bed={params.frag_bed_path}/${{treatment}}.{params.dupstatus}.fragments.bed cntrl_bed={params.frag_bed_path}/${{control}}.{params.dupstatus}.fragments.bed # run with or without control - if [[ {params.control_flag} == "Y" ]]; then + if [[ {params.control_flag} == "Y" ]]; then file_base="${{treatment}}_vs_${{control}}.{params.dupstatus}" - + macs2 callpeak \\ -t ${{treat_bed}} \\ -c ${{cntrl_bed}} \\ @@ -188,7 +188,7 @@ rule macs2_broad: --nomodel else file_base="${{treatment}}_vs_nocontrol.{params.dupstatus}" - + macs2 callpeak \\ -t ${{treat_bed}} \\ -f BED \\ @@ -201,7 +201,7 @@ rule macs2_broad: --broad --broad-cutoff {params.broadtreshold} \\ --nomodel fi - + # mv output and rename for consistency mv $TMPDIR/${{file_base}}_peaks.broadPeak {output.peak_file} mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file} @@ -237,7 +237,7 @@ rule seacr_stringent: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -249,7 +249,7 @@ rule seacr_stringent: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo 
{params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set frag file treat_bed={params.bg_path}/${{treatment}}.{params.dupstatus}.bedgraph cntrl_bed={params.bg_path}/${{control}}.{params.dupstatus}.bedgraph @@ -262,7 +262,7 @@ rule seacr_stringent: --mode stringent \\ --threshold {params.qthresholds} \\ --output norm - + # mv output and rename for consistency mv $TMPDIR/norm.stringent.bed {output.peak_file} else @@ -308,7 +308,7 @@ rule seacr_relaxed: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -320,7 +320,7 @@ rule seacr_relaxed: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set frag file treat_bed={params.bg_path}/${{treatment}}.{params.dupstatus}.bedgraph cntrl_bed={params.bg_path}/${{control}}.{params.dupstatus}.bedgraph @@ -332,7 +332,7 @@ rule seacr_relaxed: --normalize norm \\ --mode relaxed \\ --threshold {params.qthresholds} \\ - --output norm + --output norm # mv output and rename for consistency mv $TMPDIR/norm.relaxed.bed {output.peak_file} @@ -344,7 +344,7 @@ rule seacr_relaxed: --mode relaxed \\ --threshold {params.qthresholds} \\ --output non - + mv $TMPDIR/non.relaxed.bed {output.peak_file} fi @@ -378,7 +378,7 @@ rule gopeaks_narrow: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -389,7 +389,7 @@ rule gopeaks_narrow: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set bam file treat_bam={params.bam_path}/${{treatment}}.{params.dupstatus}.bam cntrl_bam={params.bam_path}/${{control}}.{params.dupstatus}.bam @@ -402,7 +402,7 @@ rule gopeaks_narrow: mv $TMPDIR/narrow_gopeaks.json {output.json} # create bigbed files - cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/narrow_peaks.bed + cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/narrow_peaks.bed bedToBigBed -type=bed3 ${{TMPDIR}}/narrow_peaks.bed {input.genome_len} ${{TMPDIR}}/narrow_peaks.bigbed bgzip -c ${{TMPDIR}}/narrow_peaks.bigbed > {output.bg_file} """ @@ -431,7 +431,7 @@ rule gopeaks_broad: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -442,7 +442,7 @@ rule gopeaks_broad: # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` - + # set bam file treat_bam={params.bam_path}/${{treatment}}.{params.dupstatus}.bam cntrl_bam={params.bam_path}/${{control}}.{params.dupstatus}.bam @@ -455,7 +455,7 @@ rule gopeaks_broad: mv $TMPDIR/broad_gopeaks.json {output.json} # create bigbed files - cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed + cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} 
--temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed bedToBigBed -type=bed3 ${{TMPDIR}}/broad.bed {input.genome_len} ${{TMPDIR}}/broad.bigbed bgzip -c ${{TMPDIR}}/broad.bigbed > {output.bg_file} """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 143f770..1ead23d 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -39,7 +39,7 @@ rule qc_fastqc: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -49,7 +49,7 @@ rule qc_fastqc: # run FASTQC fastqc {input.R1} {input.R2} -o {params.base} - + # Gzip input files gunzip -c {input.R1} > ${{TMPDIR}}/{params.R1} gunzip -c {input.R2} > ${{TMPDIR}}/{params.R2} @@ -62,7 +62,7 @@ rule qc_fastqc: --subset 1000000 \\ --aligner bowtie2 \\ --force - + # Run FastQ Validator mkdir -p {params.base_val} {params.fastq_val} \\ @@ -104,12 +104,12 @@ rule spikein_assessment: shell: """ if [[ {params.spikein} == "ecoli" ]]; then species_name="NC_000913.3"; else species_name=""; fi - + # get sample list sample_list="{input.bams}" clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"` - # rum script + # rum script Rscript {params.rscript_wrapper} \\ --rmd {params.rmd} \\ --carlisle_functions {params.carlisle_functions} \\ @@ -146,7 +146,7 @@ if ("gopeaks_narrow" in PEAKTYPE) or ("gopeaks_broad" in PEAKTYPE): shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -190,7 +190,7 @@ else: shell: """ set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) @@ -207,4 +207,4 @@ else: -o $TMPDIR/qc mv $TMPDIR/qc/multiqc_report.html {output.report} - """ \ No newline at end of file + """ diff --git a/workflow/scripts/_carlisle_functions.R b/workflow/scripts/_carlisle_functions.R index 0f3c28b..9aee339 100644 --- a/workflow/scripts/_carlisle_functions.R +++ b/workflow/scripts/_carlisle_functions.R @@ -2,8 +2,8 @@ # LIBRARY ######################################################################## library(tidyverse) -load_packages <- function(){ - pkgs <- 'BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0 +load_packages <- function() { + pkgs <- "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0 chipenrich ChIPseeker DESeq2 @@ -28,39 +28,43 @@ tidyverse TxDb.Hsapiens.UCSC.hg19.knownGene TxDb.Hsapiens.UCSC.hg38.knownGene TxDb.Mmusculus.UCSC.mm10.knownGene -yaml' - package_vctr <- pkgs %>% stringr::str_split('\n') %>% unlist() +yaml" + package_vctr <- pkgs %>% + stringr::str_split("\n") %>% + unlist() invisible(lapply(package_vctr, library, character.only = TRUE)) } ######################################################################## # SPIKE IN PLOT ######################################################################## -GENERATE_SPIKEIN_PLOT<-function(input_df,spike_type){ - spike_df=data.frame() - for (rowid in rownames(input_df,spike_type)){ +GENERATE_SPIKEIN_PLOT <- function(input_df, spike_type) { + spike_df <- data.frame() + for (rowid in rownames(input_df, spike_type)) { # spike_type="NC_000913.3" - + # read in file - stats=read.table(input_df[rowid,"bam"]) - stats=stats[,c("V1","V3")] - colnames(stats)=c("location","read_count") - + stats <- read.table(input_df[rowid, "bam"]) + stats <- stats[, c("V1", "V3")] + colnames(stats) <- c("location", 
"read_count") + # add metadata - stats$sampleid=input_df[rowid,"repid"] - stats$groupid=input_df[rowid,"sampleid"] - - if(nrow(stats)==0){ - spike_df=subset(stats,location==spike_type) - } else{ - spike_df=rbind(subset(stats,location==spike_type), - spike_df) + stats$sampleid <- input_df[rowid, "repid"] + stats$groupid <- input_df[rowid, "sampleid"] + + if (nrow(stats) == 0) { + spike_df <- subset(stats, location == spike_type) + } else { + spike_df <- rbind( + subset(stats, location == spike_type), + spike_df + ) } } - - p=ggplot(data=spike_df,aes(x=sampleid,y=read_count,fill=groupid)) + - geom_bar(stat="identity") - p_final=p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + + p <- ggplot(data = spike_df, aes(x = sampleid, y = read_count, fill = groupid)) + + geom_bar(stat = "identity") + p_final <- p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + ggtitle(paste0("Spike-in control values\n", spike_type)) print(p_final) @@ -70,178 +74,197 @@ GENERATE_SPIKEIN_PLOT<-function(input_df,spike_type){ ######################################################################## # GO ENRICHMENT ######################################################################## -READ_PEAK_FILE<-function(peak_file_in){ - peak_df=read.csv(peak_file_in,sep="\t",header=FALSE)[,c("V1","V2","V3")] - colnames(peak_df)=c("chrom","start","end") - +READ_PEAK_FILE <- function(peak_file_in) { + peak_df <- read.csv(peak_file_in, sep = "\t", header = FALSE)[, c("V1", "V2", "V3")] + colnames(peak_df) <- c("chrom", "start", "end") + return(peak_df) } -SET_LOCUST_DEF<-function(locus_loc_in){ +SET_LOCUST_DEF <- function(locus_loc_in) { # depending on where the majority of peaks fall, ID locus choice to use - if (locus_loc_in=="<0.1" | locus_loc_in=="0.1 - 1"){ - locus_loc_in="1kb" - } else if (locus_loc_in=="1 - 5"){ - locus_loc_in="5kb" - } else if (locus_loc_in=="5 - 10" ) { - locus_loc_in="10kb" + if (locus_loc_in == "<0.1" | locus_loc_in == "0.1 - 1") { + locus_loc_in <- "1kb" + } else if (locus_loc_in == "1 - 5") { + locus_loc_in <- "5kb" + } else if (locus_loc_in == "5 - 10") { + locus_loc_in <- "10kb" } else { - locus_loc_in="not applicable given distribution" + locus_loc_in <- "not applicable given distribution" } - - print(paste0("The locust defintion is determined to be: ",locus_loc)) + + print(paste0("The locust defintion is determined to be: ", locus_loc)) return(locus_loc) } -SET_LOCUST_LIST<-function(locus_loc_in){ +SET_LOCUST_LIST <- function(locus_loc_in) { # depending on where the majority of peaks fall, ID locus choice to use - if (locus_loc_in=="<0.1" | locus_loc_in=="0.1 - 1"){ - locusdf_list=c("1kb","1kb_outside","1kb_outside_upstream") - } else if (locus_loc_in=="1 - 5"){ - locusdf_list=c("5kb","5kb_outside","5kb_outside_upstream") - } else if (locus_loc_in=="5 - 10" ) { - locusdf_list=c("10kb","10kb_outside","10kb_outside_upstream") + if (locus_loc_in == "<0.1" | locus_loc_in == "0.1 - 1") { + locusdf_list <- c("1kb", "1kb_outside", "1kb_outside_upstream") + } else if (locus_loc_in == "1 - 5") { + locusdf_list <- c("5kb", "5kb_outside", "5kb_outside_upstream") + } else if (locus_loc_in == "5 - 10") { + locusdf_list <- c("10kb", "10kb_outside", "10kb_outside_upstream") } else { - locusdf_list="none" + locusdf_list <- "none" } - + return(locusdf_list) } -PLOT_QC_FUNCTIONS<-function(function_in,rowid_in,l_id){ - - if (function_in=="polyenrich"){ - p=plot_polyenrich_spline(peaks = READ_PEAK_FILE(peak_df[rowid_in,"peak_bed"]), - locusdef = l_id, genome = speciesID) 
- } else if (function_in=="spline"){ - p=plot_chipenrich_spline(peaks = READ_PEAK_FILE(peak_df[rowid_in,"peak_bed"]), - locusdef = l_id, genome = speciesID) - } else if (function_in=="cov"){ - p=plot_gene_coverage(peaks = READ_PEAK_FILE(peak_df[rowid_in,"peak_bed"]), - locusdef = l_id, genome = speciesID) - } - - else{ - print(paste0("Missing function",function_in)) +PLOT_QC_FUNCTIONS <- function(function_in, rowid_in, l_id) { + if (function_in == "polyenrich") { + p <- plot_polyenrich_spline( + peaks = READ_PEAK_FILE(peak_df[rowid_in, "peak_bed"]), + locusdef = l_id, genome = speciesID + ) + } else if (function_in == "spline") { + p <- plot_chipenrich_spline( + peaks = READ_PEAK_FILE(peak_df[rowid_in, "peak_bed"]), + locusdef = l_id, genome = speciesID + ) + } else if (function_in == "cov") { + p <- plot_gene_coverage( + peaks = READ_PEAK_FILE(peak_df[rowid_in, "peak_bed"]), + locusdef = l_id, genome = speciesID + ) + } else { + print(paste0("Missing function", function_in)) } return(p) } -PLOT_QC_MAIN<-function(function_in,rowid_in){ - locusdf_list=c(strsplit(peak_df[1,"locusdf_list"],",")[[1]],"nearest_tss","nearest_gene") - locusdf_list=locusdf_list[locusdf_list != "none"] - +PLOT_QC_MAIN <- function(function_in, rowid_in) { + locusdf_list <- c(strsplit(peak_df[1, "locusdf_list"], ",")[[1]], "nearest_tss", "nearest_gene") + locusdf_list <- locusdf_list[locusdf_list != "none"] + # create plots, catching for errors - counter=1 - legend_text="" - row_count=1; col_count=1 - p=list() - for (l_id in locusdf_list){ - plot_test = tryCatch({ - PLOT_QC_FUNCTIONS(function_in,rowid_in,l_id) - }, warning = function(w) { - "warning" - }, error = function(e) { - "failed" - }) - + counter <- 1 + legend_text <- "" + row_count <- 1 + col_count <- 1 + p <- list() + for (l_id in locusdf_list) { + plot_test <- tryCatch( + { + PLOT_QC_FUNCTIONS(function_in, rowid_in, l_id) + }, + warning = function(w) { + "warning" + }, + error = function(e) { + "failed" + } + ) + # save successful plots in plot_list # save successful plots legend - if (class(plot_test) == "trellis"){ - p[[counter]]=plot_test - counter=counter+1 - + if (class(plot_test) == "trellis") { + p[[counter]] <- plot_test + counter <- counter + 1 + # if true, this is a new row - if (col_count==1){ - legend_text=paste0(legend_text,"Row ",row_count,": | Col 1 (",l_id,")") - col_count=2 - } else if (col_count==2 & row_count==1){ - legend_text=paste0(legend_text," | Col 2 (",l_id,")\n") - row_count=row_count+1 - col_count=1 - } else if (col_count==2 & row_count>1){ - legend_text=paste0(legend_text," | Col 2 (",l_id,")") - col_count=col_count+1 - } else{ - legend_text=paste0(legend_text," | Col 3 (",l_id,")\n") - col_count=1 - row_count=row_count+1 + if (col_count == 1) { + legend_text <- paste0(legend_text, "Row ", row_count, ": | Col 1 (", l_id, ")") + col_count <- 2 + } else if (col_count == 2 & row_count == 1) { + legend_text <- paste0(legend_text, " | Col 2 (", l_id, ")\n") + row_count <- row_count + 1 + col_count <- 1 + } else if (col_count == 2 & row_count > 1) { + legend_text <- paste0(legend_text, " | Col 2 (", l_id, ")") + col_count <- col_count + 1 + } else { + legend_text <- paste0(legend_text, " | Col 3 (", l_id, ")\n") + col_count <- 1 + row_count <- row_count + 1 } } } - + # print with data key cat(legend_text) - n=length(p) - if (n==0){ + n <- length(p) + if (n == 0) { print("All plots failed for this sample") - } else if (n==1){ + } else if (n == 1) { plot(c(p[[1]])) - } else if (n==2){ - plot(c(p[[1]],p[[2]])) - } else if (n==3){ - 
plot(c(p[[1]],p[[2]],p[[3]])) - } else if (n==4){ - plot(c(p[[1]],p[[2]],p[[3]],p[[4]])) - } else if (n==5){ - plot(c(p[[1]],p[[2]],p[[3]],p[[4]],p[[5]])) - } else if (n==6){ - plot(c(p[[1]],p[[2]],p[[3]],p[[4]],p[[5]],p[[6]])) - } else{ - plot(paste0("Missing N",n)) + } else if (n == 2) { + plot(c(p[[1]], p[[2]])) + } else if (n == 3) { + plot(c(p[[1]], p[[2]], p[[3]])) + } else if (n == 4) { + plot(c(p[[1]], p[[2]], p[[3]], p[[4]])) + } else if (n == 5) { + plot(c(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]])) + } else if (n == 6) { + plot(c(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], p[[6]])) + } else { + plot(paste0("Missing N", n)) } } -GO_ANALYSIS_MAIN<-function(rowid,peak_enrichment){ - sampleid=peak_df[rowid,"sampleid"] - peakcaller=peak_df[rowid,"peak_caller"] - peaktype=peak_df[rowid,"peak_type"] - locus_def=peak_df[rowid,"locus_loc_short"] - - print(paste0("** ",sampleid," | ", peakcaller, " | ", peaktype," **")) - print(paste0("-- Peak analysis: ",peak_enrichment)) - - if (locus_def=="none"){ - locus_def="nearest_tss" +GO_ANALYSIS_MAIN <- function(rowid, peak_enrichment) { + sampleid <- peak_df[rowid, "sampleid"] + peakcaller <- peak_df[rowid, "peak_caller"] + peaktype <- peak_df[rowid, "peak_type"] + locus_def <- peak_df[rowid, "locus_loc_short"] + + print(paste0("** ", sampleid, " | ", peakcaller, " | ", peaktype, " **")) + print(paste0("-- Peak analysis: ", peak_enrichment)) + + if (locus_def == "none") { + locus_def <- "nearest_tss" } - - if (peak_enrichment=="hybrid"){ - results = hybridenrich(peaks = READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - genome = speciesID, genesets = geneset_id, - locusdef = locus_def, qc_plots = F, #randomization = 'complete', - out_name = NULL, n_cores = 1, min_geneset_size=10) - } else if (peak_enrichment=="broad"){ - results = broadenrich(peaks = READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - genome = speciesID, genesets = geneset_id, - locusdef = locus_def, qc_plots = FALSE, #randomization = 'complete', - out_name = NULL, n_cores=1, min_geneset_size=10) - } else if (peak_enrichment=="enrich"){ - results = chipenrich(peaks = READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - genome = speciesID, genesets = geneset_id, - locusdef = locus_def, qc_plots = FALSE, #randomization = 'complete', - out_name = NULL, n_cores=1) - } else if (peak_enrichment=="poly"){ - results = polyenrich(peaks = READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - genome = speciesID, genesets = geneset_id, - method = 'polyenrich', locusdef = locus_def, - qc_plots = FALSE, out_name = NULL, n_cores = 1) - } else if (peak_enrichment=="poly_weighted"){ - results = polyenrich(peaks = READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - genome = speciesID, genesets = geneset_id, - method="polyenrich_weighted", locusdef = "enhancer_plus5kb", - qc_plots = FALSE, out_name = NULL, n_cores = 1) - } else if (peak_enrichment=="reglocation"){ - results = proxReg(READ_PEAK_FILE(peak_df[rowid,"peak_bed"]), - reglocation = 'tss', genome = speciesID, - genesets=geneset_id, out_name=NULL) + + if (peak_enrichment == "hybrid") { + results <- hybridenrich( + peaks = READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + genome = speciesID, genesets = geneset_id, + locusdef = locus_def, qc_plots = F, # randomization = 'complete', + out_name = NULL, n_cores = 1, min_geneset_size = 10 + ) + } else if (peak_enrichment == "broad") { + results <- broadenrich( + peaks = READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + genome = speciesID, genesets = geneset_id, + locusdef = locus_def, qc_plots = FALSE, # randomization = 'complete', + out_name = 
NULL, n_cores = 1, min_geneset_size = 10 + ) + } else if (peak_enrichment == "enrich") { + results <- chipenrich( + peaks = READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + genome = speciesID, genesets = geneset_id, + locusdef = locus_def, qc_plots = FALSE, # randomization = 'complete', + out_name = NULL, n_cores = 1 + ) + } else if (peak_enrichment == "poly") { + results <- polyenrich( + peaks = READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + genome = speciesID, genesets = geneset_id, + method = "polyenrich", locusdef = locus_def, + qc_plots = FALSE, out_name = NULL, n_cores = 1 + ) + } else if (peak_enrichment == "poly_weighted") { + results <- polyenrich( + peaks = READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + genome = speciesID, genesets = geneset_id, + method = "polyenrich_weighted", locusdef = "enhancer_plus5kb", + qc_plots = FALSE, out_name = NULL, n_cores = 1 + ) + } else if (peak_enrichment == "reglocation") { + results <- proxReg(READ_PEAK_FILE(peak_df[rowid, "peak_bed"]), + reglocation = "tss", genome = speciesID, + genesets = geneset_id, out_name = NULL + ) } # set results, sig - result.out = results$results - alpha = sum(result.out$P.value < 0.05) / nrow(result.out) - print(paste0("--sig: ",alpha)) - + result.out <- results$results + alpha <- sum(result.out$P.value < 0.05) / nrow(result.out) + print(paste0("--sig: ", alpha)) + # write out - fpath=paste0(output_dir,"/",sampleid,".",peakcaller,".",dedup_status,".",peaktype,".",peak_enrichment,"_",geneset_id,".csv") - write.csv(result.out,fpath) -} \ No newline at end of file + fpath <- paste0(output_dir, "/", sampleid, ".", peakcaller, ".", dedup_status, ".", peaktype, ".", peak_enrichment, "_", geneset_id, ".csv") + write.csv(result.out, fpath) +} diff --git a/workflow/scripts/_diff_markdown.Rmd b/workflow/scripts/_diff_markdown.Rmd index 66e9d05..54a7355 100644 --- a/workflow/scripts/_diff_markdown.Rmd +++ b/workflow/scripts/_diff_markdown.Rmd @@ -21,7 +21,7 @@ params: elbowlimits: "~/../../../Volumes/ccbr1155/CS030666/analysis/results/peaks/contrasts/siNC_H4K20me3_vs_siSmyd3_H4K20me3__dedup__narrowGo_peaks.bed/siNC_H4K20me3_vs_siSmyd3_H4K20me3__dedup__narrowGo_peaks.bed_AUCbased_diffanalysis_elbowlimits.yaml" species: "hg38" gtf: "" -editor_options: +editor_options: chunk_output_type: console --- @@ -35,71 +35,72 @@ load_packages() ## Loading SampleInfo and Counts ```{r sampleinfo, include=TRUE, echo=FALSE} -sampleinfo = read.csv(params$coldata,header = TRUE,sep="\t",strip.white = TRUE,check.names = FALSE,colClasses = "character") +sampleinfo <- read.csv(params$coldata, header = TRUE, sep = "\t", strip.white = TRUE, check.names = FALSE, colClasses = "character") # filter based off of params -sampleinfo = sampleinfo[sampleinfo$group == params$condition1 | sampleinfo$group == params$condition2,] -sampleinfo$group = relevel(as.factor(sampleinfo$group),params$condition2) -rawcounts = read.csv(params$rawcountsmatrix, - header = TRUE,sep="\t", - comment.char = "#", - strip.white = TRUE, - check.names = FALSE, - colClasses = "character") -rawcounts = as.data.frame(rawcounts) -rawcounts %>% column_to_rownames(var="peakID") -> rawcounts +sampleinfo <- sampleinfo[sampleinfo$group == params$condition1 | sampleinfo$group == params$condition2, ] +sampleinfo$group <- relevel(as.factor(sampleinfo$group), params$condition2) +rawcounts <- read.csv(params$rawcountsmatrix, + header = TRUE, sep = "\t", + comment.char = "#", + strip.white = TRUE, + check.names = FALSE, + colClasses = "character" +) +rawcounts <- as.data.frame(rawcounts) +rawcounts %>% 
column_to_rownames(var = "peakID") -> rawcounts # filter based off of sampleinfo -rawcounts = rawcounts[,colnames(rawcounts)==sampleinfo$samplename] +rawcounts <- rawcounts[, colnames(rawcounts) == sampleinfo$samplename] # convert character to numeric to integer -x = matrix(as.numeric(as.matrix(rawcounts)),ncol=ncol(rawcounts)) -x = matrix(mapply(x,FUN=as.integer),ncol=ncol(rawcounts)) -x = as.data.frame(x) -colnames(x) = colnames(rawcounts) -rownames(x) = rownames(rawcounts) +x <- matrix(as.numeric(as.matrix(rawcounts)), ncol = ncol(rawcounts)) +x <- matrix(mapply(x, FUN = as.integer), ncol = ncol(rawcounts)) +x <- as.data.frame(x) +colnames(x) <- colnames(rawcounts) +rownames(x) <- rownames(rawcounts) # if lib size is greater than max integer size allowed, handle samples # cant replace with the original value since max integer is 2147483647 # anything larger will be replaced with an NA -x[is.na(x)]=2147483647 - -rawcounts = x -sampleinfo=sampleinfo[sampleinfo$samplename==colnames(rawcounts),] - -#determine lib reduction factor -if (mean(colSums(rawcounts))>1000000000){ - lib_factor=1e8 -}else if (mean(colSums(rawcounts))>100000000){ - lib_factor=1e7 -}else if (mean(colSums(rawcounts))>10000000){ - lib_factor=1e6 -} else if (mean(colSums(rawcounts))>1000000){ - lib_factor=1e5 -} else if (mean(colSums(rawcounts))>100000){ - lib_factor=1e4 -} else if (mean(colSums(rawcounts))>10000){ - lib_factor=1e3 -} else if (mean(colSums(rawcounts))>1000){ - lib_factor=1e2 -} else if (mean(colSums(rawcounts))>100){ - lib_factor=1e1 +x[is.na(x)] <- 2147483647 + +rawcounts <- x +sampleinfo <- sampleinfo[sampleinfo$samplename == colnames(rawcounts), ] + +# determine lib reduction factor +if (mean(colSums(rawcounts)) > 1000000000) { + lib_factor <- 1e8 +} else if (mean(colSums(rawcounts)) > 100000000) { + lib_factor <- 1e7 +} else if (mean(colSums(rawcounts)) > 10000000) { + lib_factor <- 1e6 +} else if (mean(colSums(rawcounts)) > 1000000) { + lib_factor <- 1e5 +} else if (mean(colSums(rawcounts)) > 100000) { + lib_factor <- 1e4 +} else if (mean(colSums(rawcounts)) > 10000) { + lib_factor <- 1e3 +} else if (mean(colSums(rawcounts)) > 1000) { + lib_factor <- 1e2 +} else if (mean(colSums(rawcounts)) > 100) { + lib_factor <- 1e1 } else { - lib_factor=1e1 + lib_factor <- 1e1 } -sampleinfo$library_size=colSums(rawcounts)/lib_factor -sampleinfodf = as.data.frame(sampleinfo) -sampleinfodf$dupstatus = params$dupstatus -rownames(sampleinfo) = sampleinfo$samplename -pander(sampleinfodf,style="rmarkdown") -rawcounts_logcpm = log2(cpm(rawcounts)) -cpm_melt=reshape2::melt(rawcounts_logcpm) -colnames(cpm_melt)=c("peakID","samplename","log2cpm") -fdr_cutoff=as.double(params$fdr_cutoff) -log2fc_cutoff=as.double(params$log2fc_cutoff) +sampleinfo$library_size <- colSums(rawcounts) / lib_factor +sampleinfodf <- as.data.frame(sampleinfo) +sampleinfodf$dupstatus <- params$dupstatus +rownames(sampleinfo) <- sampleinfo$samplename +pander(sampleinfodf, style = "rmarkdown") +rawcounts_logcpm <- log2(cpm(rawcounts)) +cpm_melt <- reshape2::melt(rawcounts_logcpm) +colnames(cpm_melt) <- c("peakID", "samplename", "log2cpm") +fdr_cutoff <- as.double(params$fdr_cutoff) +log2fc_cutoff <- as.double(params$log2fc_cutoff) ``` -Total Peaks: `r nrow(rawcounts)` +Total Peaks: `r nrow(rawcounts)` Total Samples: `r ncol(rawcounts)` ```{r fdr_check} @@ -109,68 +110,75 @@ print(log2fc_cutoff) ```{r cpmplots, echo=FALSE} -ggplot(cpm_melt,aes(x=samplename,y=log2cpm)) + - geom_boxplot(fill=as.factor(as.numeric(as.factor(sampleinfo$group))+1)) + 
+ggplot(cpm_melt, aes(x = samplename, y = log2cpm)) + + geom_boxplot(fill = as.factor(as.numeric(as.factor(sampleinfo$group)) + 1)) + theme_classic() + coord_flip() - # theme(legend.title = element_blank(),axis.text.x = element_text(angle = 90),legend.text=element_text(size=6),legend.position = "none") +# theme(legend.title = element_blank(),axis.text.x = element_text(angle = 90),legend.text=element_text(size=6),legend.position = "none") ``` ## Run DESeq2 ```{r dds, include=TRUE, echo=FALSE} -dds <- DESeqDataSetFromMatrix(countData = as.matrix(rawcounts), - colData = sampleinfo[,c("samplename","group")], - design = ~ group) -if (params$spiked=="SPIKEIN" & !is.null(params$contrast_data)) { - bbpaths_df = read.csv(params$contrast_data, - header = FALSE,sep="\t", - comment.char = "#", - strip.white = TRUE) - colnames(bbpaths_df)=c("sample", - "replicate", - "bedfile", - "bedgraph", - "scalingfactor", - "bed") - sf_df=unique(bbpaths_df[,c("replicate","scalingfactor")]) - dds_cols=colnames(dds) - sfs=c() - for (i in dds_cols){ - if (i %in% sf_df$replicate){ - sfs=c(sfs,sf_df[sf_df$replicate==i,"scalingfactor"]) +dds <- DESeqDataSetFromMatrix( + countData = as.matrix(rawcounts), + colData = sampleinfo[, c("samplename", "group")], + design = ~group +) +if (params$spiked == "SPIKEIN" & !is.null(params$contrast_data)) { + bbpaths_df <- read.csv(params$contrast_data, + header = FALSE, sep = "\t", + comment.char = "#", + strip.white = TRUE + ) + colnames(bbpaths_df) <- c( + "sample", + "replicate", + "bedfile", + "bedgraph", + "scalingfactor", + "bed" + ) + sf_df <- unique(bbpaths_df[, c("replicate", "scalingfactor")]) + dds_cols <- colnames(dds) + sfs <- c() + for (i in dds_cols) { + if (i %in% sf_df$replicate) { + sfs <- c(sfs, sf_df[sf_df$replicate == i, "scalingfactor"]) } } - if (length(sfs)==length(dds_cols)){ -# scaling factor magnitudes are variable and depend on the constant used while scaling using spiked-in reads -# DESeq2 size factors are generally hovering around 1 -# we try to rescale the scaling factors by dividing them by mean of all scaling factors ... this way they also -# start hovering around 1 ... based on suggestion from Sohyoung. + if (length(sfs) == length(dds_cols)) { + # scaling factor magnitudes are variable and depend on the constant used while scaling using spiked-in reads + # DESeq2 size factors are generally hovering around 1 + # we try to rescale the scaling factors by dividing them by mean of all scaling factors ... this way they also + # start hovering around 1 ... based on suggestion from Sohyoung. if (params$scalesfbymean == "Y") { - adjusted_sfs = sfs/mean(sfs) + adjusted_sfs <- sfs / mean(sfs) } -# AUC-based counts are prescaled, but fragmentbased counts are not prescaled -# DESeq2 should ingest rawcounts ... prescaled counts need to be divided by sfs -# to convert them back to rawcounts + # AUC-based counts are prescaled, but fragmentbased counts are not prescaled + # DESeq2 should ingest rawcounts ... 
prescaled counts need to be divided by sfs + # to convert them back to rawcounts if (params$rawcountsprescaled == "Y") { - rawrawcounts=round(t(t(rawcounts) / sfs)) - dds <- DESeqDataSetFromMatrix(countData = as.matrix(rawrawcounts), - colData = sampleinfo[,c("samplename","group")], - design = ~ group) + rawrawcounts <- round(t(t(rawcounts) / sfs)) + dds <- DESeqDataSetFromMatrix( + countData = as.matrix(rawrawcounts), + colData = sampleinfo[, c("samplename", "group")], + design = ~group + ) } - DESeq2::sizeFactors(dds)= 1/adjusted_sfs + DESeq2::sizeFactors(dds) <- 1 / adjusted_sfs } else { print("Samples are spiked, but DESeq2 scaling factors used!!") } -} - +} + dds <- DESeq(dds) -if ( params$htsfilter == "Y" ){ - dds <- HTSFilter::HTSFilter(dds,s.len=50, plot=TRUE)$filteredData +if (params$htsfilter == "Y") { + dds <- HTSFilter::HTSFilter(dds, s.len = 50, plot = TRUE)$filteredData } results <- results(dds) results_df <- as.data.frame(results) -results_df %>% rownames_to_column(var="peakID") -> results_df +results_df %>% rownames_to_column(var = "peakID") -> results_df ``` ### DESeq MAplot @@ -185,64 +193,66 @@ DESeq2::plotMA(results) # default for nsub is 1000, if there are less than 1000 rows with a mean > 5 this will error # If there are less than 1K check hether there are enough istances to use nsub value set to the n rows # if there are not then variancestablizing must be used -if (sum( rowMeans( counts(dds, normalized=TRUE)) > 5 ) > 1000){ +if (sum(rowMeans(counts(dds, normalized = TRUE)) > 5) > 1000) { print("VST: Using standard method") rld <- vst(dds) -} else{ +} else { # run test for dds - t <- try(vst(dds,nsub=nrow(dds)),silent = TRUE) - + t <- try(vst(dds, nsub = nrow(dds)), silent = TRUE) + # if this completes then run nsub equal to nrow # if it fails run separate function - if (class(t)[1] == "DESeqTransform"){ + if (class(t)[1] == "DESeqTransform") { print("VST: Using nrow as nsub") - rld <- vst(dds,nsub=nrow(dds)) - } else if(grepl( "Error", t, fixed = TRUE)){ - print ("VST: Using variance Stablizing transformation") + rld <- vst(dds, nsub = nrow(dds)) + } else if (grepl("Error", t, fixed = TRUE)) { + print("VST: Using variance Stablizing transformation") rld <- varianceStabilizingTransformation(dds) - } else{ + } else { print("VST: Using nrow as nsub") - rld <- vst(dds,nsub=nrow(dds)) + rld <- vst(dds, nsub = nrow(dds)) } -} - -assayrld = as.data.frame(assay(rld)) -assayrld$row_variance = rowVars(as.matrix(assayrld)) -assayrld = arrange(assayrld,desc(row_variance)) -zero_variance_rows=assayrld$row_variance<1e-5 -assayrld$row_variance = NULL -assayrld = assayrld[!zero_variance_rows,] -if (nrow(assayrld) > 500){ - assayrld=assayrld[1:500,] } -pca=prcomp(t(assayrld),scale. = T) -m.pc1 = round(pca$sdev[1]^2/sum(pca$sdev^2)*100,2) -m.pc2 = round(pca$sdev[2]^2/sum(pca$sdev^2)*100,2) -m.pc3 = round(pca$sdev[3]^2/sum(pca$sdev^2)*100,2) -xlab=paste0("PC1(",m.pc1,"%)") -ylab=paste0("PC2(",m.pc2,"%)") -ggplot(pca$x,aes(x=PC1,y=PC2,label=rownames(pca$x)))+geom_point(col=as.factor(as.numeric(as.factor(sampleinfo$group))+1))+ - xlab(xlab)+ylab(ylab)+ - geom_text_repel(max.overlaps = 10,size=2)+ + +assayrld <- as.data.frame(assay(rld)) +assayrld$row_variance <- rowVars(as.matrix(assayrld)) +assayrld <- arrange(assayrld, desc(row_variance)) +zero_variance_rows <- assayrld$row_variance < 1e-5 +assayrld$row_variance <- NULL +assayrld <- assayrld[!zero_variance_rows, ] +if (nrow(assayrld) > 500) { + assayrld <- assayrld[1:500, ] +} +pca <- prcomp(t(assayrld), scale. 
= T) +m.pc1 <- round(pca$sdev[1]^2 / sum(pca$sdev^2) * 100, 2) +m.pc2 <- round(pca$sdev[2]^2 / sum(pca$sdev^2) * 100, 2) +m.pc3 <- round(pca$sdev[3]^2 / sum(pca$sdev^2) * 100, 2) +xlab <- paste0("PC1(", m.pc1, "%)") +ylab <- paste0("PC2(", m.pc2, "%)") +ggplot(pca$x, aes(x = PC1, y = PC2, label = rownames(pca$x))) + + geom_point(col = as.factor(as.numeric(as.factor(sampleinfo$group)) + 1)) + + xlab(xlab) + + ylab(ylab) + + geom_text_repel(max.overlaps = 10, size = 2) + theme_light() ``` ### DESeq Elbow ```{r elbow,include=TRUE,echo=FALSE} -limits=ELBOW::do_elbow_rnaseq(results) +limits <- ELBOW::do_elbow_rnaseq(results) ELBOW::plot_dataset(results, "log2FoldChange", limits$up_limit, limits$low_limit) -write_yaml(limits,file=params$elbowlimits) +write_yaml(limits, file = params$elbowlimits) ``` ```{r elbow2,include=TRUE,echo=FALSE} -lim=c(limits$up_limit,limits$low_limit) -lim=as.data.frame(lim) -rownames(lim)=c("UpLimit","DownLimit") -colnames(lim)="log2FC" -lim$FC=2^lim$log2FC -lim["DownLimit","FC"]=-1/lim["DownLimit","FC"] -lim %>% rownames_to_column(var="Limit") -> lim +lim <- c(limits$up_limit, limits$low_limit) +lim <- as.data.frame(lim) +rownames(lim) <- c("UpLimit", "DownLimit") +colnames(lim) <- "log2FC" +lim$FC <- 2^lim$log2FC +lim["DownLimit", "FC"] <- -1 / lim["DownLimit", "FC"] +lim %>% rownames_to_column(var = "Limit") -> lim pander(lim) # DT::datatable(lim) %>% formatSignif(columns=colnames(lim),digits=4) ``` @@ -250,119 +260,122 @@ pander(lim) ### DESeq Annotation ```{r annotate,include=TRUE,echo=FALSE} -x = as.data.frame(rownames(results)) -colnames(x) = c("peakID") -x %>% separate(col = c("peakID"),into = c("chrom","coord"),sep = ":") %>% - separate(col = c("coord"),into = c("start","end"),sep = "-") -> x +x <- as.data.frame(rownames(results)) +colnames(x) <- c("peakID") +x %>% + separate(col = c("peakID"), into = c("chrom", "coord"), sep = ":") %>% + separate(col = c("coord"), into = c("start", "end"), sep = "-") -> x peaks <- GenomicRanges::makeGRangesFromDataFrame(x) -if (params$species=="mm10"){ +if (params$species == "mm10") { txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene - anno_db<-"org.Mm.eg.db" -} else if (params$species=="hg19"){ + anno_db <- "org.Mm.eg.db" +} else if (params$species == "hg19") { txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene - anno_db<-"org.Hs.eg.db" -} else if (params$species=="hg38"){ + anno_db <- "org.Hs.eg.db" +} else if (params$species == "hg38") { txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene - anno_db<-"org.Hs.eg.db" -} else if (params$species=="hs1"){ + anno_db <- "org.Hs.eg.db" +} else if (params$species == "hs1") { # make txdb for T2T HS1 ## modify seq info to match GTF - temp_seqnames = paste0("chr",seqnames(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)) - temp_seqnames = gsub("MT","M",temp_seqnames) + temp_seqnames <- paste0("chr", seqnames(BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0)) + temp_seqnames <- gsub("MT", "M", temp_seqnames) ## rename all other features - BSgenome.T2T_updated=BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0 - seqnames(BSgenome.T2T_updated)=temp_seqnames - names(seqlengths(BSgenome.T2T_updated))=temp_seqnames - names(isCircular(BSgenome.T2T_updated))=temp_seqnames - names(genome(BSgenome.T2T_updated))=temp_seqnames + BSgenome.T2T_updated <- BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0 + seqnames(BSgenome.T2T_updated) <- temp_seqnames + names(seqlengths(BSgenome.T2T_updated)) <- temp_seqnames + names(isCircular(BSgenome.T2T_updated)) <- temp_seqnames + names(genome(BSgenome.T2T_updated)) <- temp_seqnames ## 
https://rdrr.io/bioc/GenomicFeatures/man/makeTxDbFromGFF.html ## create TXDB object - txdb<-makeTxDbFromGFF(params$gtf, - format=c("auto"), - dataSource="/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf", - organism="human", - taxonomyId=9606, - chrominfo=seqinfo(BSgenome.T2T_updated)) - anno_db<-"org.Hs.eg.db" + txdb <- makeTxDbFromGFF(params$gtf, + format = c("auto"), + dataSource = "/data/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf", + organism = "human", + taxonomyId = 9606, + chrominfo = seqinfo(BSgenome.T2T_updated) + ) + anno_db <- "org.Hs.eg.db" } options(ChIPseeker.downstreamDistance = 0) peakAnno <- ChIPseeker::annotatePeak(peaks, - tssRegion = c(-2000,200), - TxDb = txdb, - level = "gene", - overlap = "all", - annoDb = anno_db) + tssRegion = c(-2000, 200), + TxDb = txdb, + level = "gene", + overlap = "all", + annoDb = anno_db +) pa <- as.data.frame(peakAnno) -pa$shortAnno=stringr::word(pa$annotation,1) -pa$shortAnno[pa$shortAnno=="5'"]="5'UTR" -pa$shortAnno[pa$shortAnno=="3'"]="3'UTR" -pa$peakID = paste0(pa$seqnames,":",pa$start,"-",pa$end) -results_df = merge(results_df,pa,by=c("peakID")) -write.table(results_df,file=params$results,quote=FALSE,sep="\t",row.names = FALSE,col.names = TRUE) - -up=as.data.frame(table(results_df[results_df$padj < fdr_cutoff & results_df$log2FoldChange > log2fc_cutoff,]$shortAnno)) -down=as.data.frame(table(results_df[results_df$padj < fdr_cutoff & results_df$log2FoldChange < -1*log2fc_cutoff,]$shortAnno)) -if(nrow(up)==0){ +pa$shortAnno <- stringr::word(pa$annotation, 1) +pa$shortAnno[pa$shortAnno == "5'"] <- "5'UTR" +pa$shortAnno[pa$shortAnno == "3'"] <- "3'UTR" +pa$peakID <- paste0(pa$seqnames, ":", pa$start, "-", pa$end) +results_df <- merge(results_df, pa, by = c("peakID")) +write.table(results_df, file = params$results, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) + +up <- as.data.frame(table(results_df[results_df$padj < fdr_cutoff & results_df$log2FoldChange > log2fc_cutoff, ]$shortAnno)) +down <- as.data.frame(table(results_df[results_df$padj < fdr_cutoff & results_df$log2FoldChange < -1 * log2fc_cutoff, ]$shortAnno)) +if (nrow(up) == 0) { up <- data.frame(matrix(ncol = 2, nrow = 0)) } -if(nrow(down)==0){ +if (nrow(down) == 0) { down <- data.frame(matrix(ncol = 2, nrow = 0)) } -colnames(up)=c("shortAnno","UP") -colnames(down)=c("shortAnno","DOWN") -deg=as.data.frame(merge(up,down,by=c("shortAnno"),all=TRUE)) +colnames(up) <- c("shortAnno", "UP") +colnames(down) <- c("shortAnno", "DOWN") +deg <- as.data.frame(merge(up, down, by = c("shortAnno"), all = TRUE)) deg[is.na(deg)] <- 0 -deg %>% column_to_rownames(var="shortAnno") -> deg -deg=cbind(deg,rowSums(deg)) -deg=rbind(deg,colSums(deg)) -colnames(deg)[length(colnames(deg))]="Total" -rownames(deg)[length(rownames(deg))]="Total" -deg %>% rownames_to_column(var="Annotation") -> deg +deg %>% column_to_rownames(var = "shortAnno") -> deg +deg <- cbind(deg, rowSums(deg)) +deg <- rbind(deg, colSums(deg)) +colnames(deg)[length(colnames(deg))] <- "Total" +rownames(deg)[length(rownames(deg))] <- "Total" +deg %>% rownames_to_column(var = "Annotation") -> deg pander(deg) # add catch for hs1 which names SYMBOLS col as geneId -if(params$species == "hs1"){ +if (params$species == "hs1") { lookup <- c(SYMBOL = "geneId") - results_df=results_df %>% - dplyr::rename(any_of(lookup)) + results_df <- results_df %>% + dplyr::rename(any_of(lookup)) } ``` ### DESeq Volcano ```{r volcano,fig.width=8, fig.height=10,include=TRUE,echo=FALSE} -colors=brewer.pal(7,"Set1") 
-anno_types=levels(as.factor(results_df$shortAnno)) -keyvals=rep("grey",times=nrow(results_df)) -names(keyvals)=rep("NS",times=length(keyvals)) -for ( i in seq(1,length(anno_types))) { - keyvals[ abs(results_df$log2FoldChange) > log2fc_cutoff & results_df$padj < fdr_cutoff & results_df$shortAnno == anno_types[i] ] = colors[i] +colors <- brewer.pal(7, "Set1") +anno_types <- levels(as.factor(results_df$shortAnno)) +keyvals <- rep("grey", times = nrow(results_df)) +names(keyvals) <- rep("NS", times = length(keyvals)) +for (i in seq(1, length(anno_types))) { + keyvals[abs(results_df$log2FoldChange) > log2fc_cutoff & results_df$padj < fdr_cutoff & results_df$shortAnno == anno_types[i]] <- colors[i] names(keyvals)[keyvals == colors[i]] <- anno_types[i] } # names(keyvals)[names(keyvals)=="NA"]="NS" EnhancedVolcano(results_df, - lab = results_df$SYMBOL, - x = 'log2FoldChange', - y = 'padj', - ylab = bquote(~-Log[10] ~ FDR), - pCutoff = fdr_cutoff, - FCcutoff = log2fc_cutoff, - labSize = 4, - title = "", - subtitle = "", - titleLabSize = 1, - subtitleLabSize = 1, - #captionLabSize = 10, - colCustom = keyvals, - colAlpha = 1, - # boxedLabels = TRUE, - # drawConnectors = TRUE, - legendLabels = c("NS", expression(Log[2] ~ FC), "FDR", expression(FDR ~ and ~ log[2] ~ FC)), - legendLabSize = 10 - ) + lab = results_df$SYMBOL, + x = "log2FoldChange", + y = "padj", + ylab = bquote(~ -Log[10] ~ FDR), + pCutoff = fdr_cutoff, + FCcutoff = log2fc_cutoff, + labSize = 4, + title = "", + subtitle = "", + titleLabSize = 1, + subtitleLabSize = 1, + # captionLabSize = 10, + colCustom = keyvals, + colAlpha = 1, + # boxedLabels = TRUE, + # drawConnectors = TRUE, + legendLabels = c("NS", expression(Log[2] ~ FC), "FDR", expression(FDR ~ and ~ log[2] ~ FC)), + legendLabSize = 10 +) ``` ### DESeq Detailed Results diff --git a/workflow/scripts/_diff_markdown_wrapper.R b/workflow/scripts/_diff_markdown_wrapper.R index e562feb..9aa629d 100644 --- a/workflow/scripts/_diff_markdown_wrapper.R +++ b/workflow/scripts/_diff_markdown_wrapper.R @@ -1,121 +1,175 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages(library("argparse")) -#debug="TRUE" -#path="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff" +# debug="TRUE" +# path="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff" -debug="FALSE" -path="/data/CCBR/projects/ccbr1155/CS030586_CARAP/diff" +debug <- "FALSE" +path <- "/data/CCBR/projects/ccbr1155/CS030586_CARAP/diff" # create parser object parser <- ArgumentParser() -parser$add_argument("--rmd", type="character", required=TRUE, - help="path to rmd") -parser$add_argument("--carlisle_functions", type="character", required=TRUE, - help="path to carlisle functions file") -parser$add_argument("--spiked", type="character", required=TRUE, - help="type of normalization used") -parser$add_argument("--rawcountsprescaled", action='store_true', - help="if counts are scaled by spike-in already ... Y (for AUC-based method) or N (for fragments-based method)") -parser$add_argument("--scalesfbymean", action='store_true', - help="DESeq2 scaling factors are around 1. 
To ensure that spike-in scaling factors are also around 1 divide each scaling factor by mean of all scaling factors.") -parser$add_argument("--htsfilter", action='store_true', - help="Use HTSFilter") -parser$add_argument("--contrast_data", type="character", required=FALSE, default=NULL, - help="contrast_data inputs file") -parser$add_argument("--countsmatrix", type="character", required=TRUE, - help="countsmatrix as TSV") -parser$add_argument("--sampleinfo", type="character", required=TRUE, - help="sample info as TSV") -parser$add_argument("--dupstatus", type="character", required=TRUE, - help="either dedup or no_dedup") -parser$add_argument("--fdr_cutoff", type="double", default=0.05, required=FALSE, - help="FDR cutoff [default %(default)s]") -parser$add_argument("--log2fc_cutoff", type="double", default=0.59, required=FALSE, - help="log2foldchange cutoff [default %(default)s]") -parser$add_argument("--condition1", type="character", required=TRUE, - help = "condition1") -parser$add_argument("--condition2", type="character", required=TRUE, - help = "condition2") -parser$add_argument("--results", type="character", required=TRUE, - help = "path to results TSV") -parser$add_argument("--report", type="character", required=TRUE, - help = "HTML report") -parser$add_argument("--elbowlimits", type="character", required=TRUE, - help = "YAML ELBOW limits") -parser$add_argument("--tmpdir", type="character", required=FALSE, default="/tmp", - help = "tmpdir") -parser$add_argument("--species", type="character", required=TRUE, - help = "species") -parser$add_argument("--gtf", type="character", required=FALSE, - help = "gtf path - needed for HS1") +parser$add_argument("--rmd", + type = "character", required = TRUE, + help = "path to rmd" +) +parser$add_argument("--carlisle_functions", + type = "character", required = TRUE, + help = "path to carlisle functions file" +) +parser$add_argument("--spiked", + type = "character", required = TRUE, + help = "type of normalization used" +) +parser$add_argument("--rawcountsprescaled", + action = "store_true", + help = "if counts are scaled by spike-in already ... Y (for AUC-based method) or N (for fragments-based method)" +) +parser$add_argument("--scalesfbymean", + action = "store_true", + help = "DESeq2 scaling factors are around 1. To ensure that spike-in scaling factors are also around 1 divide each scaling factor by mean of all scaling factors." 
+) +parser$add_argument("--htsfilter", + action = "store_true", + help = "Use HTSFilter" +) +parser$add_argument("--contrast_data", + type = "character", required = FALSE, default = NULL, + help = "contrast_data inputs file" +) +parser$add_argument("--countsmatrix", + type = "character", required = TRUE, + help = "countsmatrix as TSV" +) +parser$add_argument("--sampleinfo", + type = "character", required = TRUE, + help = "sample info as TSV" +) +parser$add_argument("--dupstatus", + type = "character", required = TRUE, + help = "either dedup or no_dedup" +) +parser$add_argument("--fdr_cutoff", + type = "double", default = 0.05, required = FALSE, + help = "FDR cutoff [default %(default)s]" +) +parser$add_argument("--log2fc_cutoff", + type = "double", default = 0.59, required = FALSE, + help = "log2foldchange cutoff [default %(default)s]" +) +parser$add_argument("--condition1", + type = "character", required = TRUE, + help = "condition1" +) +parser$add_argument("--condition2", + type = "character", required = TRUE, + help = "condition2" +) +parser$add_argument("--results", + type = "character", required = TRUE, + help = "path to results TSV" +) +parser$add_argument("--report", + type = "character", required = TRUE, + help = "HTML report" +) +parser$add_argument("--elbowlimits", + type = "character", required = TRUE, + help = "YAML ELBOW limits" +) +parser$add_argument("--tmpdir", + type = "character", required = FALSE, default = "/tmp", + help = "tmpdir" +) +parser$add_argument("--species", + type = "character", required = TRUE, + help = "species" +) +parser$add_argument("--gtf", + type = "character", required = FALSE, + help = "gtf path - needed for HS1" +) args <- parser$parse_args() gtf <- args$gtf -if (debug){ - carlisle_functions="/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" - rawcountsmatrix="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/counts_matrix.txt" - coldata="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/sample_info.txt" - dupstatus="dedup" - condition1="siSmyd3_2m_Smyd3_0.25HCHO_500K" - condition2="siNC_2m_Smyd3_0.25HCHO_500K" - indexcols="peakID" - fdr_cutoff=0.05 - log2fc_cutoff=0.59 - results="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/results.txt" - report="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/report.html" - spiked="LIBRARY" - rawcountsprescaled="N" - scalesfbymean="N" - htsfilter="Y" - elbowlimits="~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/elbow.yaml" - tmpdir="/dev/shm" - gtf="~/../../Volumes/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf" +if (debug) { + carlisle_functions <- "/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" + rawcountsmatrix <- "~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/counts_matrix.txt" + coldata <- "~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/sample_info.txt" + dupstatus <- "dedup" + condition1 <- "siSmyd3_2m_Smyd3_0.25HCHO_500K" + condition2 <- "siNC_2m_Smyd3_0.25HCHO_500K" + indexcols <- "peakID" + fdr_cutoff <- 0.05 + log2fc_cutoff <- 0.59 + results <- "~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/results.txt" + report <- "~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/report.html" + spiked <- "LIBRARY" + rawcountsprescaled <- "N" + scalesfbymean <- "N" + htsfilter <- "Y" + elbowlimits <- "~/CCBR/projects/ccbr1155/CS030586_CARAP/diff/elbow.yaml" + tmpdir <- "/dev/shm" + gtf <- "~/../../Volumes/CCBR_Pipeliner/db/PipeDB/Indices/hs1/genes.gtf" } else { - carlisle_functions=args$carlisle_functions - rawcountsmatrix=args$countsmatrix - coldata=args$sampleinfo - 
dupstatus=args$dupstatus - condition1=args$condition1 - condition2=args$condition2 - fdr_cutoff=args$fdr_cutoff - log2fc_cutoff=args$log2fc_cutoff - indexcols="peakID" - results=args$results - report=args$report - spiked=args$spiked - contrast_data=args$contrast_data - elbowlimits=args$elbowlimits - if (args$rawcountsprescaled) {rawcountsprescaled="Y"} else {rawcountsprescaled="N"} - if (args$scalesfbymean) {scalesfbymean="Y"} else {scalesfbymean="N"} - tmpdir=args$tmpdir - if (args$htsfilter) {htsfilter="Y"} else {htsfilter="N"} - species=args$species - gtf=args$gtf + carlisle_functions <- args$carlisle_functions + rawcountsmatrix <- args$countsmatrix + coldata <- args$sampleinfo + dupstatus <- args$dupstatus + condition1 <- args$condition1 + condition2 <- args$condition2 + fdr_cutoff <- args$fdr_cutoff + log2fc_cutoff <- args$log2fc_cutoff + indexcols <- "peakID" + results <- args$results + report <- args$report + spiked <- args$spiked + contrast_data <- args$contrast_data + elbowlimits <- args$elbowlimits + if (args$rawcountsprescaled) { + rawcountsprescaled <- "Y" + } else { + rawcountsprescaled <- "N" + } + if (args$scalesfbymean) { + scalesfbymean <- "Y" + } else { + scalesfbymean <- "N" + } + tmpdir <- args$tmpdir + if (args$htsfilter) { + htsfilter <- "Y" + } else { + htsfilter <- "N" + } + species <- args$species + gtf <- args$gtf } -parameters=list( - carlisle_functions=carlisle_functions, - rawcountsmatrix=rawcountsmatrix, - coldata=coldata, - spiked=spiked, - rawcountsprescaled=rawcountsprescaled, - scalesfbymean=scalesfbymean, - contrast_data=contrast_data, - dupstatus=dupstatus, - condition1=condition1, - condition2=condition2, - indexcols=indexcols, - htsfilter=htsfilter, - fdr_cutoff=fdr_cutoff, - log2fc_cutoff=log2fc_cutoff, - results=results, - elbowlimits=elbowlimits, - species=species, - gtf=gtf) +parameters <- list( + carlisle_functions = carlisle_functions, + rawcountsmatrix = rawcountsmatrix, + coldata = coldata, + spiked = spiked, + rawcountsprescaled = rawcountsprescaled, + scalesfbymean = scalesfbymean, + contrast_data = contrast_data, + dupstatus = dupstatus, + condition1 = condition1, + condition2 = condition2, + indexcols = indexcols, + htsfilter = htsfilter, + fdr_cutoff = fdr_cutoff, + log2fc_cutoff = log2fc_cutoff, + results = results, + elbowlimits = elbowlimits, + species = species, + gtf = gtf +) rmarkdown::render(args$rmd, - params=parameters, + params = parameters, output_file = report, - intermediates_dir = paste(tmpdir,"intermediates_dir",sep="/"), - knit_root_dir = paste(tmpdir,"knit_root_dir",sep="/")) + intermediates_dir = paste(tmpdir, "intermediates_dir", sep = "/"), + knit_root_dir = paste(tmpdir, "knit_root_dir", sep = "/") +) diff --git a/workflow/scripts/_filter_bam.py b/workflow/scripts/_filter_bam.py index dcf85a8..58fb979 100644 --- a/workflow/scripts/_filter_bam.py +++ b/workflow/scripts/_filter_bam.py @@ -11,25 +11,33 @@ import pysam import argparse -parser = argparse.ArgumentParser(description='Filter BAM by readids') -parser.add_argument('--inputBAM', type=str, required=True, - help='input BAM file') -parser.add_argument('--outputBAM', type=str, required=True, - help='filtered output BAM file') -parser.add_argument('--fragmentlength', type=int, required=False, default=1000, - help='discard flagment lengths larger than this integer') -parser.add_argument('--removemarkedduplicates', action='store_true', - help='removed marked and optical duplicates') +parser = argparse.ArgumentParser(description="Filter BAM by readids") 
+parser.add_argument("--inputBAM", type=str, required=True, help="input BAM file") +parser.add_argument( + "--outputBAM", type=str, required=True, help="filtered output BAM file" +) +parser.add_argument( + "--fragmentlength", + type=int, + required=False, + default=1000, + help="discard flagment lengths larger than this integer", +) +parser.add_argument( + "--removemarkedduplicates", + action="store_true", + help="removed marked and optical duplicates", +) args = parser.parse_args() inBAM = pysam.AlignmentFile(args.inputBAM, "rb") outBAM = pysam.AlignmentFile(args.outputBAM, "wb", template=inBAM) -count=0 +count = 0 for read in inBAM.fetch(): - count+=1 - if count%1000000 == 0: - print("%d reads read!"%(count)) + count += 1 + if count % 1000000 == 0: + print("%d reads read!" % (count)) if not read.is_proper_pair or read.is_unmapped: continue if read.template_length > args.fragmentlength: @@ -38,4 +46,4 @@ continue outBAM.write(read) inBAM.close() -outBAM.close() \ No newline at end of file +outBAM.close() diff --git a/workflow/scripts/_generate_spikein_plots.Rmd b/workflow/scripts/_generate_spikein_plots.Rmd index f9db808..2070a9b 100644 --- a/workflow/scripts/_generate_spikein_plots.Rmd +++ b/workflow/scripts/_generate_spikein_plots.Rmd @@ -8,7 +8,7 @@ params: carlisle_functions: "/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" bam_list: "~/../../Volumes/data/carlisle/v2.0/results/bam/raw/53_H3K4me3_1.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/53_H3K4me3_2.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/HN6_H3K4me3_1.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/HN6_H3K4me3_2.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/53_H4K20m3_1.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/53_H4K20m3_2.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/HN6_H4K20me3_1.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/HN6_H4K20me3_2.bam.idxstatsxxx~/../../Volumes/data/carlisle/v2.0/results/bam/raw/HN6_IgG_rabbit_negative_control_1.bam.idxstats" spikein_control: "NC_000913.3" -editor_options: +editor_options: chunk_output_type: console --- @@ -23,17 +23,17 @@ load_packages() ```{r params, include=FALSE, echo=FALSE} # set params -bam_list=params$bam_list -spikein_control=params$spikein_control -debug="N" +bam_list <- params$bam_list +spikein_control <- params$spikein_control +debug <- "N" -if (debug=="Y"){ - spikein_control="NC_000913.3" - bam_list="" +if (debug == "Y") { + spikein_control <- "NC_000913.3" + bam_list <- "" } # set up bam list -bam_list=as.list(strsplit(bam_list, 'xxx'))[[1]] +bam_list <- as.list(strsplit(bam_list, "xxx"))[[1]] ``` @@ -42,29 +42,30 @@ bam_list=as.list(strsplit(bam_list, 'xxx'))[[1]] # /data/sevillas2/carlisle/v2.0/results/bam/HN6_H4K20me3_2.dedup.bam.idxstats # create df -proj_df=data.frame() -for (bam in bam_list){ - total_str_length=length(strsplit(bam,"/")[[1]]) - +proj_df <- data.frame() +for (bam in bam_list) { + total_str_length <- length(strsplit(bam, "/")[[1]]) + # example: # /data/sevillas2/carlisle/v2.0/results/bam/HN6_H4K20me3_2.dedup.bam.idxstats - proj_df[nrow(proj_df)+1,"bam"]=bam - + proj_df[nrow(proj_df) + 1, "bam"] <- bam + # example: HN6_H4K20me3_2.dedup.bam.idxstats - ext=strsplit(bam,"/")[[1]][total_str_length] - + ext <- strsplit(bam, "/")[[1]][total_str_length] + # example: HN6_H4K20me3_2 - proj_df[nrow(proj_df),"repid"]=strsplit(ext,"[.]")[[1]][1] - + proj_df[nrow(proj_df), 
"repid"] <- strsplit(ext, "[.]")[[1]][1] + # example: HN6_H4K20me3 - proj_df[nrow(proj_df),"sampleid"]=gsub("_[1,2,3]","",proj_df[nrow(proj_df),"repid"]) + proj_df[nrow(proj_df), "sampleid"] <- gsub("_[1,2,3]", "", proj_df[nrow(proj_df), "repid"]) } DT::datatable(proj_df) ``` ```{r spikeplot, echo=FALSE} -spike_df=GENERATE_SPIKEIN_PLOT(input_df=proj_df, - spike_type=spikein_control) +spike_df <- GENERATE_SPIKEIN_PLOT( + input_df = proj_df, + spike_type = spikein_control +) DT::datatable(spike_df) ``` - diff --git a/workflow/scripts/_generate_spikein_wrapper.R b/workflow/scripts/_generate_spikein_wrapper.R index 468090b..9cd60ac 100644 --- a/workflow/scripts/_generate_spikein_wrapper.R +++ b/workflow/scripts/_generate_spikein_wrapper.R @@ -3,36 +3,48 @@ suppressPackageStartupMessages(library("argparse")) # create parser object parser <- ArgumentParser() -parser$add_argument("--rmd", type="character", required=TRUE, - help="path to rmd") -parser$add_argument("--carlisle_functions", type="character", required=TRUE, - help="path to carlisle functions file") -parser$add_argument("--report", type="character", required=TRUE, - help = "HTML report") -parser$add_argument("--bam_list", type="character", required=TRUE, - help = "list of .idxstats bam files") -parser$add_argument("--spikein_control", type="character", required=TRUE, - help = "spike in species type") +parser$add_argument("--rmd", + type = "character", required = TRUE, + help = "path to rmd" +) +parser$add_argument("--carlisle_functions", + type = "character", required = TRUE, + help = "path to carlisle functions file" +) +parser$add_argument("--report", + type = "character", required = TRUE, + help = "HTML report" +) +parser$add_argument("--bam_list", + type = "character", required = TRUE, + help = "list of .idxstats bam files" +) +parser$add_argument("--spikein_control", + type = "character", required = TRUE, + help = "spike in species type" +) args <- parser$parse_args() -debug="FALSE" -if (debug){ - carlisle_functions="/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" - report="~/../../Volumes/data/tmp/carlisle/spike_report.html" - bam_list="macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" - spikein_control="NC_000913.3" +debug <- "FALSE" +if (debug) { + carlisle_functions <- "/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" + report <- "~/../../Volumes/data/tmp/carlisle/spike_report.html" + bam_list <- "macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" + spikein_control <- "NC_000913.3" } else { - carlisle_functions=args$carlisle_functions - report=args$report - bam_list=args$bam_list - spikein_control=args$spikein_control + carlisle_functions <- args$carlisle_functions + report <- args$report + bam_list <- args$bam_list + spikein_control <- args$spikein_control } -parameters=list( - carlisle_functions=carlisle_functions, - bam_list=bam_list, - spikein_control=spikein_control) +parameters <- list( + carlisle_functions = carlisle_functions, + bam_list = bam_list, + spikein_control = spikein_control +) rmarkdown::render(args$rmd, - 
params=parameters, - output_file = report) \ No newline at end of file + params = parameters, + output_file = report +) diff --git a/workflow/scripts/_get_nreads_stats.py b/workflow/scripts/_get_nreads_stats.py index 938506e..543c49e 100644 --- a/workflow/scripts/_get_nreads_stats.py +++ b/workflow/scripts/_get_nreads_stats.py @@ -3,6 +3,7 @@ import sys import subprocess import yaml + # arguments # @Inputs # @param1 = R1 fastq file @@ -13,32 +14,44 @@ # @param6 = filtered idx stats file - dedup # @param7 = out yaml file -stats=dict() +stats = dict() r1fastqgz = sys.argv[1] -spikeinlen = pandas.read_csv(sys.argv[2],sep="\t",header=None) -genomelen = pandas.read_csv(sys.argv[3],sep="\t",header=None) -raw_idxstats = pandas.read_csv(sys.argv[4],sep="\t",header=None) -nodedup_idxstats = pandas.read_csv(sys.argv[5],sep="\t",header=None) -dedup_idxstats = pandas.read_csv(sys.argv[6],sep="\t",header=None) - -stats["raw_nreads_genome"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(genomelen[0]))][2])) -stats["raw_nreads_spikein"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(spikeinlen[0]))][2])) - -stats["no_dedup_nreads_genome"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2])) -stats["no_dedup_nreads_spikein"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2])) - -stats["dedup_nreads_genome"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(genomelen[0]))][2])) -stats["dedup_nreads_spikein"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(spikeinlen[0]))][2])) +spikeinlen = pandas.read_csv(sys.argv[2], sep="\t", header=None) +genomelen = pandas.read_csv(sys.argv[3], sep="\t", header=None) +raw_idxstats = pandas.read_csv(sys.argv[4], sep="\t", header=None) +nodedup_idxstats = pandas.read_csv(sys.argv[5], sep="\t", header=None) +dedup_idxstats = pandas.read_csv(sys.argv[6], sep="\t", header=None) + +stats["raw_nreads_genome"] = sum( + list(raw_idxstats[raw_idxstats[0].isin(list(genomelen[0]))][2]) +) +stats["raw_nreads_spikein"] = sum( + list(raw_idxstats[raw_idxstats[0].isin(list(spikeinlen[0]))][2]) +) + +stats["no_dedup_nreads_genome"] = sum( + list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2]) +) +stats["no_dedup_nreads_spikein"] = sum( + list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2]) +) + +stats["dedup_nreads_genome"] = sum( + list(dedup_idxstats[dedup_idxstats[0].isin(list(genomelen[0]))][2]) +) +stats["dedup_nreads_spikein"] = sum( + list(dedup_idxstats[dedup_idxstats[0].isin(list(spikeinlen[0]))][2]) +) # get number of reads cmd = "zcat " + r1fastqgz + " | wc -l " -ps = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) +ps = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) nlines = ps.communicate()[0] nlines = int(nlines) -npairs = nlines/4 -stats["nreads"] = int(npairs*2) +npairs = nlines / 4 +stats["nreads"] = int(npairs * 2) -with open(sys.argv[7], 'w') as file: +with open(sys.argv[7], "w") as file: dumped = yaml.dump(stats, file) diff --git a/workflow/scripts/_get_spikeinreads.py b/workflow/scripts/_get_spikeinreads.py index 9d6db9b..454e491 100644 --- a/workflow/scripts/_get_spikeinreads.py +++ b/workflow/scripts/_get_spikeinreads.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 import pandas import sys + # arguments # @Inputs # @param1 $1 = spike-in len file # @param2 $2 = idx stats file -spikeinlen = pandas.read_csv(sys.argv[1],sep="\t",header=None) -idxstats = 
pandas.read_csv(sys.argv[2],sep="\t",header=None) +spikeinlen = pandas.read_csv(sys.argv[1], sep="\t", header=None) +idxstats = pandas.read_csv(sys.argv[2], sep="\t", header=None) nreads_spikein = sum(list(idxstats[idxstats[0].isin(list(spikeinlen[0]))][2])) diff --git a/workflow/scripts/_go_enrichment.Rmd b/workflow/scripts/_go_enrichment.Rmd index 7c12efc..702a309 100644 --- a/workflow/scripts/_go_enrichment.Rmd +++ b/workflow/scripts/_go_enrichment.Rmd @@ -11,13 +11,13 @@ params: species: "hg38" geneset_id: "GOBP" dedup_status: "dedup" -editor_options: +editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} # reference -#https://bioconductor.org/packages/devel/bioc/vignettes/chipenrich/inst/doc/chipenrich-vignette.html +# https://bioconductor.org/packages/devel/bioc/vignettes/chipenrich/inst/doc/chipenrich-vignette.html knitr::opts_chunk$set(echo = TRUE) @@ -28,28 +28,28 @@ load_packages() ```{r params, include=FALSE} # set params -speciesID=params$species -output_dir=params$output_dir -peak_dir=params$peak_dir -peak_list=params$peak_list -debug="N" -geneset_id=params$geneset_id -dedup_status=params$dedup_status - -if (debug=="Y"){ - speciesID="hg19" - functionFile="~/../../Volumes/Pipelines-1/CARLISLE_dev/workflow/scripts/_carlisle_functions.R" - output_dir="~/../../Volumes/data/carlisle/v2.0/results/peaks/0.05/go_enrichment" - peak_dir="~/../../Volumes/data/carlisle/v2.0/results/peaks/0.05/" - peak_list="macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" - geneset_id="GOBP" # supported_genesets() c("GOBP", "GOCC", "GOMF") +speciesID <- params$species +output_dir <- params$output_dir +peak_dir <- params$peak_dir +peak_list <- params$peak_list +debug <- "N" +geneset_id <- params$geneset_id +dedup_status <- params$dedup_status + +if (debug == "Y") { + speciesID <- "hg19" + functionFile <- "~/../../Volumes/Pipelines-1/CARLISLE_dev/workflow/scripts/_carlisle_functions.R" + output_dir <- "~/../../Volumes/data/carlisle/v2.0/results/peaks/0.05/go_enrichment" + peak_dir <- "~/../../Volumes/data/carlisle/v2.0/results/peaks/0.05/" + peak_list <- "macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" + geneset_id <- "GOBP" # supported_genesets() c("GOBP", "GOCC", "GOMF") } # set up peak list -peak_list=as.list(strsplit(peak_list, 'xxx'))[[1]] +peak_list <- as.list(strsplit(peak_list, "xxx"))[[1]] -for (i in 1:length(peak_list[[1]])){ - peak_list[[1]][i]=paste0(peak_dir,peak_list[[1]][i]) +for (i in 1:length(peak_list[[1]])) { + peak_list[[1]][i] <- paste0(peak_dir, peak_list[[1]][i]) } ``` @@ -57,30 +57,40 @@ for (i in 1:length(peak_list[[1]])){ ```{r peakdf, echo=FALSE} # create df for peak info -peak_df=data.frame(peak_list) -colnames(peak_df)="peak_bed" +peak_df <- data.frame(peak_list) +colnames(peak_df) <- "peak_bed" # pull sampleID's -for (rowid in rownames(peak_df)){ - total_string_length=length(strsplit(peak_df[rowid,"peak_bed"],"/")[[1]]) - strsplit(peak_df[rowid,"peak_bed"],"/")[[1]][total_string_length] - - peak_df[rowid,"threshold"]=strsplit(peak_df[rowid,"peak_bed"] - ,"/")[[1]][total_string_length-3] - 
peak_df[rowid,"peak_caller"]=strsplit(peak_df[rowid,"peak_bed"], - "/")[[1]][total_string_length-2] - peak_df[rowid,"sampleid"]=strsplit(strsplit(peak_df[rowid,"peak_bed"], - "/")[[1]][total_string_length],"[.]")[[1]][1] - peak_df[rowid,"peak_type"]=strsplit(strsplit(peak_df[rowid,"peak_bed"], - "/")[[1]][total_string_length],"[.]")[[1]][3] - peak_df[rowid,"dedup_status"]=strsplit(strsplit(peak_df[rowid,"peak_bed"], - "/")[[1]][total_string_length],"[.]")[[1]][2] +for (rowid in rownames(peak_df)) { + total_string_length <- length(strsplit(peak_df[rowid, "peak_bed"], "/")[[1]]) + strsplit(peak_df[rowid, "peak_bed"], "/")[[1]][total_string_length] + + peak_df[rowid, "threshold"] <- strsplit( + peak_df[rowid, "peak_bed"], + "/" + )[[1]][total_string_length - 3] + peak_df[rowid, "peak_caller"] <- strsplit( + peak_df[rowid, "peak_bed"], + "/" + )[[1]][total_string_length - 2] + peak_df[rowid, "sampleid"] <- strsplit(strsplit( + peak_df[rowid, "peak_bed"], + "/" + )[[1]][total_string_length], "[.]")[[1]][1] + peak_df[rowid, "peak_type"] <- strsplit(strsplit( + peak_df[rowid, "peak_bed"], + "/" + )[[1]][total_string_length], "[.]")[[1]][3] + peak_df[rowid, "dedup_status"] <- strsplit(strsplit( + peak_df[rowid, "peak_bed"], + "/" + )[[1]][total_string_length], "[.]")[[1]][2] } -if (debug=="Y"){ - max_samples=2 -} else{ - max_samples=nrow(peak_df) +if (debug == "Y") { + max_samples <- 2 +} else { + max_samples <- nrow(peak_df) } print(peak_df) @@ -92,24 +102,28 @@ print(peak_df) This plot gives a distribution of the distance of the peak midpoints to the TSSs. It can help in selecting a locus definition. For example, if genes are primarily within 5kb of TSSs, then the 5kb locus definition may be a good choice. In contrast, if most genes fall far from TSSs, the nearest_tss locus definition may be a good choice. ```{r qc1, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - print(paste0("** ",peak_df[rowid,"sampleid"]," | ", - peak_df[rowid,"peak_caller"], " | ", - peak_df[rowid,"peak_type"]," **")) +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + print(paste0( + "** ", peak_df[rowid, "sampleid"], " | ", + peak_df[rowid, "peak_caller"], " | ", + peak_df[rowid, "peak_type"], " **" + )) print("--QC Step 1") - tss_obj=plot_dist_to_tss(peaks = READ_PEAK_FILE(peak_file_in=peak_df[rowid,"peak_bed"]), - genome = speciesID) + tss_obj <- plot_dist_to_tss( + peaks = READ_PEAK_FILE(peak_file_in = peak_df[rowid, "peak_bed"]), + genome = speciesID + ) print(tss_obj) - + # create df from values to determine locust - ts_df=data.frame(x=tss_obj$panel.args[[1]]$x,y=tss_obj$panel.args[[1]]$y) - ts_max=max(ts_df$y) - locus_loc=subset(ts_df,y==ts_max)$x - - peak_df[rowid,"locus_loc_def"]=SET_LOCUST_DEF(locus_loc) - peak_df[rowid,"locusdf_list"]=paste(SET_LOCUST_LIST(locus_loc),collapse=",") - peak_df[rowid,"locus_loc_short"]=SET_LOCUST_LIST(locus_loc)[[1]] + ts_df <- data.frame(x = tss_obj$panel.args[[1]]$x, y = tss_obj$panel.args[[1]]$y) + ts_max <- max(ts_df$y) + locus_loc <- subset(ts_df, y == ts_max)$x + + peak_df[rowid, "locus_loc_def"] <- SET_LOCUST_DEF(locus_loc) + peak_df[rowid, "locusdf_list"] <- paste(SET_LOCUST_LIST(locus_loc), collapse = ",") + peak_df[rowid, "locus_loc_short"] <- SET_LOCUST_LIST(locus_loc)[[1]] } ``` @@ -128,13 +142,15 @@ If the defined locus is within 1, 5, or 10kb, then the following three plots are - Nkb_outside:The locus is the region more than Nkb upstream or downstream from a TSS to the midpoint between the adjacent TSS. 
```{r qc2, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - print(paste0("** ",peak_df[rowid,"sampleid"]," | ", - peak_df[rowid,"peak_caller"], " | ", - peak_df[rowid,"peak_type"]," **")) +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + print(paste0( + "** ", peak_df[rowid, "sampleid"], " | ", + peak_df[rowid, "peak_caller"], " | ", + peak_df[rowid, "peak_type"], " **" + )) print("--QC Step 2") - PLOT_QC_MAIN(function_in="spline",rowid_in=rowid) + PLOT_QC_MAIN(function_in = "spline", rowid_in = rowid) } ``` @@ -144,13 +160,15 @@ This plot visualizes the relationship between the number of peaks assigned to a If many gene loci have multiple peaks assigned to them, polyenrich is likely an appropriate method. If there are a low number of peaks per gene, then chipenrich() may be the appropriate method. ```{r qc3, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - print(paste0("** ",peak_df[rowid,"sampleid"]," | ", - peak_df[rowid,"peak_caller"], " | ", - peak_df[rowid,"peak_type"]," **")) +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + print(paste0( + "** ", peak_df[rowid, "sampleid"], " | ", + peak_df[rowid, "peak_caller"], " | ", + peak_df[rowid, "peak_type"], " **" + )) print("--QC Step 3") - PLOT_QC_MAIN("polyenrich",rowid) + PLOT_QC_MAIN("polyenrich", rowid) } ``` @@ -158,13 +176,15 @@ for (rowid in rownames(peak_df[c(1:max_samples),])){ This plot visualizes the relationship between proportion of the gene locus covered by peaks and the locus length (on the log10 scale). For clarity of visualization, each point represents 25 gene loci binned after sorting by locus length. ```{r qc4, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - print(paste0("** ",peak_df[rowid,"sampleid"]," | ", - peak_df[rowid,"peak_caller"], " | ", - peak_df[rowid,"peak_type"]," **")) +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + print(paste0( + "** ", peak_df[rowid, "sampleid"], " | ", + peak_df[rowid, "peak_caller"], " | ", + peak_df[rowid, "peak_type"], " **" + )) print("--QC Step 4") - PLOT_QC_MAIN("cov",rowid) + PLOT_QC_MAIN("cov", rowid) } ``` @@ -174,10 +194,10 @@ The hybrid method is used when one is unsure of which method, between ChIP-Enric The hybrid p-value is given as 2*min(chipenrich_pvalue, polyenrich_pvalue). This test will retain the same Type 1 level and will be a consistent test if one of chipenrich or polyenrich is consistent. This can be extended to any number of tests, but currently we only allow a hybrid test for chipenrich and polyenrich. Currently this method is only employed for SEACR analysis. 
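In other words, the hybrid test simply doubles the smaller of the two method-specific p-values (clamped at 1 so the result remains a valid p-value). A minimal sketch of that combination, assuming per-gene-set p-values from ChIP-Enrich and Poly-Enrich are already in hand (the function name is illustrative; in this report the hybrid results come from `GO_ANALYSIS_MAIN(rowid, "hybrid")` below):

```r
# Illustrative combination of ChIP-Enrich and Poly-Enrich p-values per gene set.
hybrid_pvalue <- function(chipenrich_p, polyenrich_p) {
  pmin(2 * pmin(chipenrich_p, polyenrich_p), 1)
}
hybrid_pvalue(0.012, 0.30) # 0.024
```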
```{r peak_h, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - peak_caller=peak_df[rowid,"peak_caller"] - if (peak_caller=="seacr"){ - GO_ANALYSIS_MAIN(rowid,"hybrid") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + peak_caller <- peak_df[rowid, "peak_caller"] + if (peak_caller == "seacr") { + GO_ANALYSIS_MAIN(rowid, "hybrid") } } ``` @@ -189,10 +209,10 @@ The Broad-Enrich method uses the cumulative peak coverage of genes in its logist ```{r peak_b, echo=FALSE} # set flag to run broad # run BROAD -for (rowid in rownames(peak_df[c(1:max_samples),])){ - peaktype=peak_df[rowid,"peak_type"] - if(peaktype=="broad"){ - GO_ANALYSIS_MAIN(rowid,"broad") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + peaktype <- peak_df[rowid, "peak_type"] + if (peaktype == "broad") { + GO_ANALYSIS_MAIN(rowid, "broad") } } ``` @@ -203,24 +223,24 @@ ChIP-Enrich is designed for use with 1,000s or 10,000s of narrow genomic regions The ChIP-Enrich method uses the presence of a peak in its logistic regression model for enrichment: peak ~ GO + s(log10_length). Here, GO is a binary vector indicating whether a gene is in the gene set being tested, peak is a binary vector indicating the presence of a peak in a gene, and s(log10_length) is a binomial cubic smoothing spline which adjusts for the relationship between the presence of a peak and locus length. Currently, only MACS2 and GOPEAKS utilize this method. ```{r narrow1, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - peaktype=peak_df[rowid,"peak_type"] - if(peaktype=="narrow"){ - GO_ANALYSIS_MAIN(rowid,"enrich") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + peaktype <- peak_df[rowid, "peak_type"] + if (peaktype == "narrow") { + GO_ANALYSIS_MAIN(rowid, "enrich") } } ``` #### 2) Poly-Enrich ###### 2A): Without linking distal enhancers -Poly-Enrich is also designed for narrow peaks, for experiments with 100,000s of peaks, or in cases where the number of binding sites per gene affects its regulation. +Poly-Enrich is also designed for narrow peaks, for experiments with 100,000s of peaks, or in cases where the number of binding sites per gene affects its regulation. The Poly-Enrich method uses the number of peaks in genes in its negative binomial regression model for enrichment: num_peaks ~ GO + s(log10_length). Here, GO is a binary vector indicating whether a gene is in the gene set being tested, num_peaks is a numeric vector indicating the number of peaks in each gene, and s(log10_length) is a negative binomial cubic smoothing spline which adjusts for the relationship between the number of peaks in a gene and locus length. Currently, only MACS2 and GOPEAKS utilize this method. ```{r narrow2a, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - peaktype=peak_df[rowid,"peak_type"] - if(peaktype=="narrow"){ - GO_ANALYSIS_MAIN(rowid,"poly") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + peaktype <- peak_df[rowid, "peak_type"] + if (peaktype == "narrow") { + GO_ANALYSIS_MAIN(rowid, "poly") } } ``` @@ -230,11 +250,11 @@ Poly-Enrich can also be used for linking human distal enhancer regions to their Poly-Enrich also allows weighting of individual genomic regions based on a score, which can be useful for differential methylation enrichment analysis and ChIP-seq. Currently the options are: signalValue and logsignalValue. 
signalValue weighs each genomic region or peak based on the Signal Value given in the narrowPeak format or a user-supplied column (column name should be signalValue), while logsignalValue takes the log of these values. Currently, only MACS2 and GOPEAKS utilize this method. ```{r narrow2b, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - peaktype=peak_df[rowid,"peak_type"] - peakcaller=peak_df[rowid,"peak_caller"] - if(peaktype=="narrow" & peakcaller != "gopeaks"){ - GO_ANALYSIS_MAIN(rowid,"poly_weighted") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + peaktype <- peak_df[rowid, "peak_type"] + peakcaller <- peak_df[rowid, "peak_caller"] + if (peaktype == "narrow" & peakcaller != "gopeaks") { + GO_ANALYSIS_MAIN(rowid, "poly_weighted") } } ``` @@ -245,7 +265,7 @@ The proximity to regulatory regions (proxReg) test is a complementary test to an ProxReg first calculates the distance between the midpoints of peaks and the nearest transcription start site or the nearest enhancer region midpoint for each genomic region. Each genomic region is then assigned to the gene with the nearest transcription start site. The distances are then classified according to whether the gene is in the gene set or not, and a signed Wilcoxon rank-sum test is used to calculate if the regions are closer or farther in the gene set than average. ```{r reg, echo=FALSE} -for (rowid in rownames(peak_df[c(1:max_samples),])){ - GO_ANALYSIS_MAIN(rowid,"reglocation") +for (rowid in rownames(peak_df[c(1:max_samples), ])) { + GO_ANALYSIS_MAIN(rowid, "reglocation") } ``` diff --git a/workflow/scripts/_go_enrichment_wrapper.R b/workflow/scripts/_go_enrichment_wrapper.R index 1c3ee4c..f745105 100644 --- a/workflow/scripts/_go_enrichment_wrapper.R +++ b/workflow/scripts/_go_enrichment_wrapper.R @@ -3,51 +3,69 @@ suppressPackageStartupMessages(library("argparse")) # create parser object parser <- ArgumentParser() -parser$add_argument("--rmd", type="character", required=TRUE, - help="path to rmd") -parser$add_argument("--carlisle_functions", type="character", required=TRUE, - help="path to carlisle functions file") -parser$add_argument("--output_dir", type="character", required=FALSE, - help = "output_dir") -parser$add_argument("--report", type="character", required=TRUE, - help = "HTML report") -parser$add_argument("--peak_list", type="character", required=TRUE, - help = "peak_list") -parser$add_argument("--species", type="character", required=TRUE, - help = "species") -parser$add_argument("--geneset_id", type="character", required=FALSE, default="GOBP", - help = "geneset_id") -parser$add_argument("--dedup_status", type="character", required=FALSE, default="dedup", - help = "deduplication status") +parser$add_argument("--rmd", + type = "character", required = TRUE, + help = "path to rmd" +) +parser$add_argument("--carlisle_functions", + type = "character", required = TRUE, + help = "path to carlisle functions file" +) +parser$add_argument("--output_dir", + type = "character", required = FALSE, + help = "output_dir" +) +parser$add_argument("--report", + type = "character", required = TRUE, + help = "HTML report" +) +parser$add_argument("--peak_list", + type = "character", required = TRUE, + help = "peak_list" +) +parser$add_argument("--species", + type = "character", required = TRUE, + help = "species" +) +parser$add_argument("--geneset_id", + type = "character", required = FALSE, default = "GOBP", + help = "geneset_id" +) +parser$add_argument("--dedup_status", + type = "character", required = FALSE, default 
= "dedup", + help = "deduplication status" +) args <- parser$parse_args() -debug="FALSE" -if (debug){ - carlisle_functions="/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" - output_dir="~/../../Volumes/data/tmp" - report="~/../../Volumes/data/tmp/carlisle/report.html" - peak_list="macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" - species="hg38" - geneset_id="GOBP" - dedup_status="dedup" +debug <- "FALSE" +if (debug) { + carlisle_functions <- "/data/CCBR_Pipeliner/Pipelines/CARLISLE/latest/workflow/scripts/_carlisle_functions.R" + output_dir <- "~/../../Volumes/data/tmp" + report <- "~/../../Volumes/data/tmp/carlisle/report.html" + peak_list <- "macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_1_vs_nocontrol.dedup.narrow.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.broad.peaks.bed macs2/peak_output/53_H3K4me3_2_vs_nocontrol.dedup.narrow.peaks.bed" + species <- "hg38" + geneset_id <- "GOBP" + dedup_status <- "dedup" } else { - carlisle_functions=args$carlisle_functions - output_dir=args$output_dir - report=args$report - peak_list=args$peak_list - species=args$species - geneset_id=args$geneset_id - dedup_status=args$dedup_status + carlisle_functions <- args$carlisle_functions + output_dir <- args$output_dir + report <- args$report + peak_list <- args$peak_list + species <- args$species + geneset_id <- args$geneset_id + dedup_status <- args$dedup_status } -parameters=list( - carlisle_functions=carlisle_functions, - output_dir=output_dir, - peak_list=peak_list, - species=species, - geneset_id=geneset_id, - dedup_status=dedup_status) +parameters <- list( + carlisle_functions = carlisle_functions, + output_dir = output_dir, + peak_list = peak_list, + species = species, + geneset_id = geneset_id, + dedup_status = dedup_status +) rmarkdown::render(args$rmd, - params=parameters, - output_file = report) \ No newline at end of file + params = parameters, + output_file = report +) diff --git a/workflow/scripts/_jobby.py b/workflow/scripts/_jobby.py index cb5f584..747183e 100644 --- a/workflow/scripts/_jobby.py +++ b/workflow/scripts/_jobby.py @@ -4,14 +4,14 @@ # -*- coding: UTF-8 -*- """ -ABOUT: +ABOUT: `jobby` will take your past jobs and display their job information. - Why? We have pipelines running on several different clusters and + Why? We have pipelines running on several different clusters and job schedulers. `jobby` is an attempt to centralize and abstract the process of querying different job schedulers. On each supported - target system, `jobby` will attempt to determine the best method for - getting job information to return to the user in a standardized - format and unified cli. + target system, `jobby` will attempt to determine the best method for + getting job information to return to the user in a standardized + format and unified cli. REQUIRES: - python>=3.5 @@ -22,26 +22,26 @@ National Institute of Allergy and Infectious Diseases (NIAID) This software/database is a "United States Government Work" under - the terms of the United States Copyright Act. It was written as + the terms of the United States Copyright Act. It was written as part of the author's official duties as a United States Government employee and thus cannot be copyrighted. 
This software is freely available to the public for use. - + Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, NCBR do not and - cannot warrant the performance or results that may be obtained by + cannot warrant the performance or results that may be obtained by using this software or data. NCBR and NIH disclaim all warranties, - express or implied, including warranties of performance, + express or implied, including warranties of performance, merchantability or fitness for any particular purpose. - - Please cite the author and NIH resources like the "Biowulf Cluster" + + Please cite the author and NIH resources like the "Biowulf Cluster" in any work or product based on this material. USAGE: $ jobby [OPTIONS] JOB_ID [JOB_ID ...] EXAMPLE: - $ jobby 18627545 15627516 58627597 + $ jobby 18627545 15627516 58627597 """ # Python standard library @@ -50,52 +50,53 @@ from subprocess import PIPE import argparse # added in python/3.5 import textwrap # added in python/3.5 -import tempfile # added in python/3.5 +import tempfile # added in python/3.5 # Jobby metadata -__version__ = 'v0.2.0' -__authors__ = 'Skyler Kuhn' -__email__ = 'skyler.kuhn@nih.gov' -__home__ = os.path.dirname(os.path.abspath(__file__)) +__version__ = "v0.2.0" +__authors__ = "Skyler Kuhn" +__email__ = "skyler.kuhn@nih.gov" +__home__ = os.path.dirname(os.path.abspath(__file__)) _name = os.path.basename(sys.argv[0]) -_description = 'Will take your job(s)... and display their information!' +_description = "Will take your job(s)... and display their information!" # Classes -class Colors(): +class Colors: """Class encoding for ANSI escape sequeces for styling terminal text. Any string that is formatting with these styles must be terminated with the escape sequence, i.e. `Colors.end`. """ + # Escape sequence - end = '\33[0m' + end = "\33[0m" # Formatting options - bold = '\33[1m' - italic = '\33[3m' - url = '\33[4m' - blink = '\33[5m' - higlighted = '\33[7m' + bold = "\33[1m" + italic = "\33[3m" + url = "\33[4m" + blink = "\33[5m" + higlighted = "\33[7m" # Text Colors - black = '\33[30m' - red = '\33[31m' - green = '\33[32m' - yellow = '\33[33m' - blue = '\33[34m' - pink = '\33[35m' - cyan = '\33[96m' - white = '\33[37m' + black = "\33[30m" + red = "\33[31m" + green = "\33[32m" + yellow = "\33[33m" + blue = "\33[34m" + pink = "\33[35m" + cyan = "\33[96m" + white = "\33[37m" # Background fill colors - bg_black = '\33[40m' - bg_red = '\33[41m' - bg_green = '\33[42m' - bg_yellow = '\33[43m' - bg_blue = '\33[44m' - bg_pink = '\33[45m' - bg_cyan = '\33[46m' - bg_white = '\33[47m' + bg_black = "\33[40m" + bg_red = "\33[41m" + bg_green = "\33[42m" + bg_yellow = "\33[43m" + bg_blue = "\33[44m" + bg_pink = "\33[45m" + bg_cyan = "\33[46m" + bg_white = "\33[47m" -# Helper Functions +# Helper Functions def which(cmd, path=None): """Checks if an executable is in $PATH @param cmd : @@ -120,7 +121,7 @@ def which(cmd, path=None): def err(*message, **kwargs): """Prints any provided args to standard error. - kwargs can be provided to modify print functions + kwargs can be provided to modify print functions behavior. @param message : Values printed to standard error @@ -130,7 +131,6 @@ def err(*message, **kwargs): print(*message, file=sys.stderr, **kwargs) - def fatal(*message, **kwargs): """Prints any provided args to standard error and exits with an exit code of 1. 
@@ -146,42 +146,40 @@ def fatal(*message, **kwargs): def get_toolkit(tool_list): """Finds the best suited tool from a list of possible choices. Assumes tool list is already - ordered from the best to worst choice. The first + ordered from the best to worst choice. The first tool found in a user's $PATH is returned. @param tool_list list[]: List of ordered tools to find @returns best_choice : First tool found in tool_list """ - best_choice = None + best_choice = None for exe in tool_list: if which(exe): best_choice = exe break - + # Did not find any tools # to potentially use if not best_choice: - err( - 'Error: Did not find any tools to get job information!' - ) + err("Error: Did not find any tools to get job information!") fatal( - 'Expected one of the following tools to be in $PATH:' - '\t{0}'.format(tool_list) + "Expected one of the following tools to be in $PATH:" + "\t{0}".format(tool_list) ) - + return best_choice def add_missing(linelist, insertion_dict): - """Adds missing information to a list. This can be used - to add missing job information fields to the results of + """Adds missing information to a list. This can be used + to add missing job information fields to the results of job querying tool. @param linelist list[]: List containing job information for each field of interest @param insertion_dict dict[] = str Dictionary used to insert missing information to a given - index, where the keys are indices of the `linelist` and the + index, where the keys are indices of the `linelist` and the values are information to add. Please note that the indices should be zero based. Note that multiple consequetive values should be inserted at once as a list, see example below: @@ -192,16 +190,16 @@ def add_missing(linelist, insertion_dict): # Get the order of indices # add missing information # starting from largest to - # smallest, if we insert - # missing values in this + # smallest, if we insert + # missing values in this # order we do not need to - # calculate the offset of + # calculate the offset of # new indices tmp_list = linelist indices = sorted(list(insertion_dict.keys()), reverse=True) for i in indices: # Check if multiple values - # need to be inserted at a + # need to be inserted at a # given index if isinstance(insertion_dict[i], list): for v in reversed(insertion_dict[i]): @@ -212,17 +210,12 @@ def add_missing(linelist, insertion_dict): def convert_size(size_bytes): - """Converts bytes to a human readable format. - """ - # Sizes range from B to YiB, + """Converts bytes to a human readable format.""" + # Sizes range from B to YiB, # warning larger sizes storage - # may results in blackhole - size_name = ( - "B", "KiB", "MiB", - "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB" - ) - if size_bytes == 0: + # may results in blackhole + size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB") + if size_bytes == 0: return "0B" i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) @@ -234,36 +227,53 @@ def to_bytes(size): """Convert a human readable size unit into bytes. 
Returns None if cannot convert/parse provided size.""" size2bytes = { - "b":1, "bytes":1, "byte":1, - "k":1024, "kib":1024, "kb":1000, - "m": 1024**2, "mib": 1024**2, "mb": 1000**2, - "g": 1024**3, "gib": 1024**3, "gb": 1000**3, - "t": 1024**4, "tib": 1024**4, "tb": 1000**4, - "p": 1024**5, "pib": 1024**5, "pb": 1000**5, - "e": 1024**6, "eib": 1024**6, "eb": 1000**6, - "z": 1024**7, "zib": 1024**7, "zb": 1000**7, - "y": 1024**8, "yib": 1024**8, "yb": 1000**8 + "b": 1, + "bytes": 1, + "byte": 1, + "k": 1024, + "kib": 1024, + "kb": 1000, + "m": 1024**2, + "mib": 1024**2, + "mb": 1000**2, + "g": 1024**3, + "gib": 1024**3, + "gb": 1000**3, + "t": 1024**4, + "tib": 1024**4, + "tb": 1000**4, + "p": 1024**5, + "pib": 1024**5, + "pb": 1000**5, + "e": 1024**6, + "eib": 1024**6, + "eb": 1000**6, + "z": 1024**7, + "zib": 1024**7, + "zb": 1000**7, + "y": 1024**8, + "yib": 1024**8, + "yb": 1000**8, } - - size = size.replace(' ','') - match = re.search('(?P[0-9.]+)(?P[a-zA-Z]+)$', size) - + + size = size.replace(" ", "") + match = re.search("(?P[0-9.]+)(?P[a-zA-Z]+)$", size) + if match: - human_units = match.group('units').lower() + human_units = match.group("units").lower() human_units = human_units.lstrip().rstrip() scaling_factor = size2bytes[human_units] - bytes = int(math.ceil(scaling_factor * float(match.group('size')))) + bytes = int(math.ceil(scaling_factor * float(match.group("size")))) else: # Cannot parse units, # cannot convert value # into bytes return None - - return bytes + return bytes -# Core logic for getting +# Core logic for getting # job information def sge(jobs, threads, tmp_dir): """Displays SGE job information to standard output. @@ -281,19 +291,19 @@ def uge(jobs, threads, tmp_dir): Parsed command-line arguments @return None """ - # NOTE: add later for LOCUS cluster + # NOTE: add later for LOCUS cluster pass def dashboard_cli(jobs, threads=1, tmp_dir=None): """Biowulf-specific tool to get SLURM job information. - HPC staff recommend using this over the default slurm - `sacct` command for performance reasons. By default, + HPC staff recommend using this over the default slurm + `sacct` command for performance reasons. 
By default, the `dashboard_cli` returns information for the following fields: - jobid state submit_time partition nodes - cpus mem timelimit gres dependency - queued_time state_reason start_time elapsed_time end_time + jobid state submit_time partition nodes + cpus mem timelimit gres dependency + queued_time state_reason start_time elapsed_time end_time cpu_max mem_max eval Runs command: $ dashboard_cli jobs \\ @@ -302,41 +312,50 @@ def dashboard_cli(jobs, threads=1, tmp_dir=None): --tab --archive """ fields = [ - "jobid","jobname", - "state","partition", - "gres","cpus","mem", - "cpu_max","mem_max", - "timelimit","queued_time", - "start_time","end_time", - "elapsed_time","nodelist", - "user", "std_out", "std_err", - "work_dir" + "jobid", + "jobname", + "state", + "partition", + "gres", + "cpus", + "mem", + "cpu_max", + "mem_max", + "timelimit", + "queued_time", + "start_time", + "end_time", + "elapsed_time", + "nodelist", + "user", + "std_out", + "std_err", + "work_dir", ] - + # Display header information, # --tab option does not print # the header - print('\t'.join(fields)) + print("\t".join(fields)) # Display job information cmd = subprocess.run( - 'dashboard_cli jobs --archive --tab --joblist {0} --fields {1}'.format( - ','.join(jobs), - ','.join(fields) + "dashboard_cli jobs --archive --tab --joblist {0} --fields {1}".format( + ",".join(jobs), ",".join(fields) ), stdout=PIPE, stderr=PIPE, universal_newlines=True, - shell=True + shell=True, ) # Check for failure # of the last command if cmd.returncode != 0: err("\nError: Failed to get job information with 'dashboard_cli'!") - err('Please see error message below:') - fatal(' └── ', cmd.stderr) + err("Please see error message below:") + fatal(" └── ", cmd.stderr) - print(cmd.stdout.rstrip('\n')) + print(cmd.stdout.rstrip("\n")) def sacct(jobs, threads=1, tmp_dir=None): @@ -344,11 +363,11 @@ def sacct(jobs, threads=1, tmp_dir=None): `sacct` should be available on all SLURM clusters. The `dashboard_cli` is prioritized over using `sacct` due to perform reasons; however, this method will be - portable across different SLURM clusters. To get maximum - memory usage for a job, we will need to parse the MaxRSS + portable across different SLURM clusters. To get maximum + memory usage for a job, we will need to parse the MaxRSS field from the `$SLURM_JOBID.batch` lines. 
Returns job information for the following fields: - jobid jobname state partition reqtres + jobid jobname state partition reqtres alloccpus reqmem maxrss timelimit reserved start end elapsed nodelist user workdir @@ -357,49 +376,70 @@ def sacct(jobs, threads=1, tmp_dir=None): Runs command: $ sacct -j 12345679,12345680 \\ --fields FIELD,FIELD,FIELD \\ - -P --delimiter $'\t' + -P --delimiter $'\t' """ - header = [ - "jobid","jobname","state","partition", - "gres","cpus","mem","cpu_max","mem_max", - "timelimit","queued_time","start_time", - "end_time","elapsed_time","nodelist", - "user","std_out","std_err", "work_dir" + header = [ + "jobid", + "jobname", + "state", + "partition", + "gres", + "cpus", + "mem", + "cpu_max", + "mem_max", + "timelimit", + "queued_time", + "start_time", + "end_time", + "elapsed_time", + "nodelist", + "user", + "std_out", + "std_err", + "work_dir", ] fields = [ - "jobid", "jobname", - "state", "partition", - "reqtres", "alloccpus", - "reqmem", "maxrss", - "timelimit", "reserved", - "start", "end", - "elapsed", "nodelist", - "user", "workdir" + "jobid", + "jobname", + "state", + "partition", + "reqtres", + "alloccpus", + "reqmem", + "maxrss", + "timelimit", + "reserved", + "start", + "end", + "elapsed", + "nodelist", + "user", + "workdir", ] # Missing std_out and std_err - missing_fields = {15:['-','-']} + missing_fields = {15: ["-", "-"]} # Display header information, - print('\t'.join(header)) + print("\t".join(header)) # Display job information cmd = subprocess.run( "sacct -j {0} -P --delimiter $'\\t' --format={1}".format( - ','.join(jobs), - ','.join(fields) + ",".join(jobs), ",".join(fields) ), - stdout=PIPE, + stdout=PIPE, stderr=PIPE, universal_newlines=True, - shell=True + shell=True, ) # Check for failure # of the last command if cmd.returncode != 0: err("\nError: Failed to get job information with 'dashboard_cli'!") - err('Please see error message below:') - fatal(' └── ', cmd.stderr) - + err("Please see error message below:") + fatal(" └── ", cmd.stderr) + # Get max memory information, # Stored as $SLURM_JOBID.batch # in the MaxRSS field @@ -407,22 +447,22 @@ def sacct(jobs, threads=1, tmp_dir=None): # Remove trailing newline from # standard output and split lines # on remaining newline characters - job_information = cmd.stdout.rstrip('\n').split('\n') + job_information = cmd.stdout.rstrip("\n").split("\n") for i, line in enumerate(job_information): if i < 1: # skip over header continue - linelist = line.lstrip().rstrip().split('\t') - if linelist[0].endswith('.batch'): - jobid = linelist[0].strip().split('.')[0] - maxmem = linelist[7].replace(' ', '') + linelist = line.lstrip().rstrip().split("\t") + if linelist[0].endswith(".batch"): + jobid = linelist[0].strip().split(".")[0] + maxmem = linelist[7].replace(" ", "") mem_bytes = to_bytes(maxmem) if not mem_bytes: # Could not convert - # max_mem value into + # max_mem value into # bytes - j2m[jobid] = '-' - continue # goto next line + j2m[jobid] = "-" + continue # goto next line human_readable_mem = convert_size(mem_bytes) j2m[jobid] = human_readable_mem @@ -432,22 +472,22 @@ def sacct(jobs, threads=1, tmp_dir=None): if i < 1: # skip over header continue - linelist = line.lstrip().rstrip().split('\t') + linelist = line.lstrip().rstrip().split("\t") jobid = linelist[0].strip() - if '.' not in jobid: + if "." not in jobid: try: max_mem = j2m[jobid] except KeyError: - # Job maybe still be + # Job maybe still be # running or in a non- # completed state. 
- max_mem = '-' - status = linelist[2].split(' ')[0] + max_mem = "-" + status = linelist[2].split(" ")[0] linelist[2] = status missing_fields[8] = max_mem linelist = add_missing(linelist, missing_fields) - linelist = [info if info else '-' for info in linelist] - print('\t'.join(linelist)) + linelist = [info if info else "-" for info in linelist] + print("\t".join(linelist)) def slurm(jobs, threads, tmp_dir): @@ -456,11 +496,11 @@ def slurm(jobs, threads, tmp_dir): Parsed command-line arguments @return None """ - # Try to use the following tools in this + # Try to use the following tools in this # order to get job information! # [1] `dashboard_cli` is Biowulf-specific # [2] `sacct` should always be there - tool_priority = ['dashboard_cli', 'sacct'] + tool_priority = ["dashboard_cli", "sacct"] job_tool = get_toolkit(tool_priority) # Get information about each job # must use eval() to make string @@ -470,65 +510,57 @@ def slurm(jobs, threads, tmp_dir): def jobby(args): """ - Wrapper to each supported job scheduler: slurm, etc. + Wrapper to each supported job scheduler: slurm, etc. Each scheduler has a custom handler to most effectively - get and parse job information. + get and parse job information. @param sub_args : Parsed command-line arguments @return None """ # Get command line options abstract_handler = None - job_ids = args.JOB_ID + job_ids = args.JOB_ID scheduler = args.scheduler - threads = args.threads - tmp_dir = args.tmp_dir + threads = args.threads + tmp_dir = args.tmp_dir - # Set handler for each - # supported scheduler - if scheduler == 'slurm': + # Set handler for each + # supported scheduler + if scheduler == "slurm": abstract_handler = slurm else: # Unsupported job scheduler, # needs to be implemented - fatal( - 'Error: "{0}" is an unsupported job scheduler!'.format(scheduler) - ) - - # Display job(s) information + fatal('Error: "{0}" is an unsupported job scheduler!'.format(scheduler)) + + # Display job(s) information # to standard output - abstract_handler( - jobs=job_ids, - threads=threads, - tmp_dir=tmp_dir - ) + abstract_handler(jobs=job_ids, threads=threads, tmp_dir=tmp_dir) # Parse command-line arguments def parsed_arguments(name, description): - """Parses user-provided command-line arguments. This requires - argparse and textwrap packages. To create custom help formatting - a text wrapped docstring is used to create the help message for - required options. As so, the help message for require options - must be suppressed. If a new required argument is added to the + """Parses user-provided command-line arguments. This requires + argparse and textwrap packages. To create custom help formatting + a text wrapped docstring is used to create the help message for + required options. As so, the help message for require options + must be suppressed. If a new required argument is added to the cli, it must be updated in the usage statement docstring below. @param name : - Name of the pipeline or command-line tool + Name of the pipeline or command-line tool @param description : - Short description of pipeline or command-line tool + Short description of pipeline or command-line tool """ # Add styled name and description c = Colors - styled_name = "{0}{1}{2}{3}{4}".format( - c.bold, c.bg_black, - c.cyan, name, c.end - ) + styled_name = "{0}{1}{2}{3}{4}".format(c.bold, c.bg_black, c.cyan, name, c.end) description = "{0}{1}{2}".format(c.bold, description, c.end) temp = tempfile.gettempdir() # Please note: update the usage statement # below if a new option is added! 
- usage_statement = textwrap.dedent("""\ + usage_statement = textwrap.dedent( + """\ {0}: {1} {3}{4}Synopsis:{5} @@ -538,138 +570,136 @@ def parsed_arguments(name, description): {3}{4}Description:{5} - {2} will take your past jobs and display their job information + {2} will take your past jobs and display their job information in a standardized format. Why???! We have pipelines running on several - different clusters (using different job schedulers). {2} centralizes + different clusters (using different job schedulers). {2} centralizes and abstracts the process of querying different job schedulers within - a unified command-line interface. - + a unified command-line interface. + For each supported scheduler, jobby will determine the best method - on a given target system for getting job information to return to the + on a given target system for getting job information to return to the user in a common output format. {3}{4}Required Positional Arguments:{5} - Identiers of past jobs. One or more JOB_IDs - can be provided. Multiple JOB_IDs should be - seperated by a space. Information for each - of the JOB_IDs will be displayed to standard - output. Please see example section below for + Identiers of past jobs. One or more JOB_IDs + can be provided. Multiple JOB_IDs should be + seperated by a space. Information for each + of the JOB_IDs will be displayed to standard + output. Please see example section below for more information. {3}{4}Options:{5} - -s,--scheduler {{slurm | ...}} + -s,--scheduler {{slurm | ...}} @Default: slurm - Job scheduler. Defines the job scheduler + Job scheduler. Defines the job scheduler of the target system. Additional support - for more schedulers coming soon! + for more schedulers coming soon! @Example: --scheduler slurm - -n, --threads THREADS + -n, --threads THREADS @Default: 1 - Number of threads to query the scheduler + Number of threads to query the scheduler in parallel. @Example: --threads: 8 - -t, --tmp-dir TMP_DIR + -t, --tmp-dir TMP_DIR @Default: {7}/ - Temporary directory. Path on the filesystem - for writing temporary output files. Ideally, - this path should point to a dedicated space - on the filesystem for writing tmp files. If - you need to inject a variable into this path - that should NOT be expanded, please quote the - options value in single quotes. The default + Temporary directory. Path on the filesystem + for writing temporary output files. Ideally, + this path should point to a dedicated space + on the filesystem for writing tmp files. If + you need to inject a variable into this path + that should NOT be expanded, please quote the + options value in single quotes. The default location of this option is set to the system default via the $TMPDIR environment variable. @Example: --tmp-dir '/scratch/$USER/' - + -h, --help Shows help and usage information and exits. @Example: --help - + -v, --version Displays version information and exits. @Example: --version - """.format(styled_name, description, name, c.bold, c.url, c.end, c.italic, temp)) + """.format( + styled_name, description, name, c.bold, c.url, c.end, c.italic, temp + ) + ) # Display example usage in epilog - run_epilog = textwrap.dedent("""\ + run_epilog = textwrap.dedent( + """\ {2}{3}Example:{4} - # Please avoid running jobby + # Please avoid running jobby # on a cluster's head node! 
./jobby -s slurm -n 4 18627542 13627516 58627597 48627666 {2}{3}Version:{4} {1} - """.format(name, __version__, c.bold, c.url, c.end)) + """.format( + name, __version__, c.bold, c.url, c.end + ) + ) # Create a top-level parser parser = argparse.ArgumentParser( - usage = argparse.SUPPRESS, + usage=argparse.SUPPRESS, formatter_class=argparse.RawDescriptionHelpFormatter, - description = usage_statement, - epilog = run_epilog, - add_help=False + description=usage_statement, + epilog=run_epilog, + add_help=False, ) # Required Positional Arguments # List of JOB_IDs, 1 ... N_JOB_IDS - parser.add_argument( - 'JOB_ID', - nargs = '+', - help = argparse.SUPPRESS - ) + parser.add_argument("JOB_ID", nargs="+", help=argparse.SUPPRESS) # Options # Adding verison information parser.add_argument( - '-v', '--version', - action = 'version', - version = '%(prog)s {}'.format(__version__), - help = argparse.SUPPRESS + "-v", + "--version", + action="version", + version="%(prog)s {}".format(__version__), + help=argparse.SUPPRESS, ) # Add custom help message - parser.add_argument( - '-h', '--help', - action='help', - help=argparse.SUPPRESS - ) + parser.add_argument("-h", "--help", action="help", help=argparse.SUPPRESS) - # Base directory to write - # temporary/intermediate files + # Base directory to write + # temporary/intermediate files parser.add_argument( - '-t', '--tmp-dir', - type = str, - required = False, - default = temp, - help = argparse.SUPPRESS + "-t", + "--tmp-dir", + type=str, + required=False, + default=temp, + help=argparse.SUPPRESS, ) - # Number of threads for the + # Number of threads for the # pipeline's main proceess - # This is only applicable for - # local rules or when running + # This is only applicable for + # local rules or when running # in local mode. parser.add_argument( - '-n', '--threads', - type = int, - required = False, - default = 1, - help = argparse.SUPPRESS + "-n", "--threads", type=int, required=False, default=1, help=argparse.SUPPRESS ) # Job scheduler to query, # available: SLURM, ... # More coming soon! 
parser.add_argument( - '-s', '--scheduler', - type = lambda s: str(s).lower(), - required = False, - default = "slurm", - choices = ['slurm'], - help = argparse.SUPPRESS + "-s", + "--scheduler", + type=lambda s: str(s).lower(), + required=False, + default="slurm", + choices=["slurm"], + help=argparse.SUPPRESS, ) # Define handlers for each sub-parser - parser.set_defaults(func = jobby) + parser.set_defaults(func=jobby) # Parse command-line args args = parser.parse_args() @@ -680,20 +710,17 @@ def main(): # Sanity check for usage if len(sys.argv) == 1: # Nothing was provided - fatal('Invalid usage: {} [-h] [--version] ...'.format(_name)) - + fatal("Invalid usage: {} [-h] [--version] ...".format(_name)) + # Collect args for sub-command - args = parsed_arguments( - name = _name, - description = _description - ) - + args = parsed_arguments(name=_name, description=_description) + # Display version information - err('{} ({})'.format(_name, __version__)) - # Mediator method to call the + err("{} ({})".format(_name, __version__)) + # Mediator method to call the # default handler function args.func(args) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/workflow/scripts/_make_alignment_stats_table.R b/workflow/scripts/_make_alignment_stats_table.R index b17f9c8..1812efd 100644 --- a/workflow/scripts/_make_alignment_stats_table.R +++ b/workflow/scripts/_make_alignment_stats_table.R @@ -9,60 +9,73 @@ suppressPackageStartupMessages(library("tidyverse")) parser <- ArgumentParser() -parser$add_argument("--yamlDir", type="character", required=TRUE, - help="absolute path to location of yamls") -parser$add_argument("--excludeFromName", type="character", required=TRUE, - help="sample info as TSV") -parser$add_argument("--scaleConstant", type="double", default = 1e6, required=FALSE, - help="scaling constant, typically 1e6.[default %(default)s]") -parser$add_argument("--outTable", type="character", required=TRUE, - help="absolute path to output table TSV file.") +parser$add_argument("--yamlDir", + type = "character", required = TRUE, + help = "absolute path to location of yamls" +) +parser$add_argument("--excludeFromName", + type = "character", required = TRUE, + help = "sample info as TSV" +) +parser$add_argument("--scaleConstant", + type = "double", default = 1e6, required = FALSE, + help = "scaling constant, typically 1e6.[default %(default)s]" +) +parser$add_argument("--outTable", + type = "character", required = TRUE, + help = "absolute path to output table TSV file." 
+) args <- parser$parse_args() -debug=0 -if (debug==1){ - yaml_dir="/home/kopardevn/CCBR/projects/ccbr1155/CS030586_CARAP/results/alignment_stats" - exclude_from_name = ".alignment_stats.yaml" - scale_constant = 1e6 - out_table = "test.tsv" +debug <- 0 +if (debug == 1) { + yaml_dir <- "/home/kopardevn/CCBR/projects/ccbr1155/CS030586_CARAP/results/alignment_stats" + exclude_from_name <- ".alignment_stats.yaml" + scale_constant <- 1e6 + out_table <- "test.tsv" } else { - yaml_dir=args$yamlDir - exclude_from_name=args$excludeFromName - scale_constant=args$scaleConstant - out_table=args$outTable + yaml_dir <- args$yamlDir + exclude_from_name <- args$excludeFromName + scale_constant <- args$scaleConstant + out_table <- args$outTable } setwd(yaml_dir) -files=list.files(path = yaml_dir, - pattern = "*.yaml") -df = data.frame() -for ( f in files ){ - sampleName <- gsub(pattern = exclude_from_name, replacement = "",f) +files <- list.files( + path = yaml_dir, + pattern = "*.yaml" +) +df <- data.frame() +for (f in files) { + sampleName <- gsub(pattern = exclude_from_name, replacement = "", f) readinyaml <- as.data.frame(read_yaml(f, fileEncoding = "UTF-8")) - rownames(readinyaml)=c(sampleName) - df = rbind(df,readinyaml) + rownames(readinyaml) <- c(sampleName) + df <- rbind(df, readinyaml) } -df$scaling_factor = scale_constant/df$dedup_nreads_spikein -df$duplication_rate_genome = (df$raw_nreads_genome - df$dedup_nreads_genome) / df$raw_nreads_genome -df$duplication_rate_spikein = (df$raw_nreads_spikein - df$dedup_nreads_spikein) / df$raw_nreads_spikein -column_order = c("sample_name", - "nreads", - "raw_nreads_genome", - "raw_nreads_spikein", - "scaling_factor", - "duplication_rate_genome", - "duplication_rate_spikein", - "no_dedup_nreads_genome", - "no_dedup_nreads_spikein", - "dedup_nreads_genome", - "dedup_nreads_spikein") +df$scaling_factor <- scale_constant / df$dedup_nreads_spikein +df$duplication_rate_genome <- (df$raw_nreads_genome - df$dedup_nreads_genome) / df$raw_nreads_genome +df$duplication_rate_spikein <- (df$raw_nreads_spikein - df$dedup_nreads_spikein) / df$raw_nreads_spikein +column_order <- c( + "sample_name", + "nreads", + "raw_nreads_genome", + "raw_nreads_spikein", + "scaling_factor", + "duplication_rate_genome", + "duplication_rate_spikein", + "no_dedup_nreads_genome", + "no_dedup_nreads_spikein", + "dedup_nreads_genome", + "dedup_nreads_spikein" +) -df %>% rownames_to_column(var="sample_name") -> df -df = df[,column_order] +df %>% rownames_to_column(var = "sample_name") -> df +df <- df[, column_order] write.table(df, - file=out_table, - quote = FALSE, - sep = "\t", - row.names = FALSE, - col.names = TRUE) + file = out_table, + quote = FALSE, + sep = "\t", + row.names = FALSE, + col.names = TRUE +) diff --git a/workflow/scripts/_make_counts_matrix.py b/workflow/scripts/_make_counts_matrix.py index d5f5567..d7c13a6 100644 --- a/workflow/scripts/_make_counts_matrix.py +++ b/workflow/scripts/_make_counts_matrix.py @@ -6,79 +6,121 @@ # sums to countsmatrix.tsv # sampleinfo.tsv is also created to be used as colData for downstream DESeq2 analysis -import subprocess,argparse,sys,pandas,os,functools,uuid -parser = argparse.ArgumentParser(description='create counts matrix') -parser.add_argument('--bedbedgraph', required=True, type=str, help="list of peak calls and scaled bedgraph file, tab delimited, group|sampleName|bedfile|bedgraph|scalingFactor|fragmentsBed per line") -parser.add_argument('--tmpdir', required=True, type=str, help="TMP dir") -parser.add_argument('--countsmatrix', 
required=True, type=str, help="bedgraph-AUC-based output counts matrix TSV") -parser.add_argument('--fragmentscountsmatrix', required=True, type=str, help="fragmentsBed-based output counts matrix TSV") -parser.add_argument('--sampleinfo', required=True, type=str, help="output sample info TSV") +import subprocess, argparse, sys, pandas, os, functools, uuid + +parser = argparse.ArgumentParser(description="create counts matrix") +parser.add_argument( + "--bedbedgraph", + required=True, + type=str, + help="list of peak calls and scaled bedgraph file, tab delimited, group|sampleName|bedfile|bedgraph|scalingFactor|fragmentsBed per line", +) +parser.add_argument("--tmpdir", required=True, type=str, help="TMP dir") +parser.add_argument( + "--countsmatrix", + required=True, + type=str, + help="bedgraph-AUC-based output counts matrix TSV", +) +parser.add_argument( + "--fragmentscountsmatrix", + required=True, + type=str, + help="fragmentsBed-based output counts matrix TSV", +) +parser.add_argument( + "--sampleinfo", required=True, type=str, help="output sample info TSV" +) args = parser.parse_args() randstr = str(uuid.uuid4()) if not os.path.exists(args.tmpdir): - try: - os.mkdir(args.tmpdir) - except: - exit(args.tmpdir + ": Folder doesnt exist and cannot be created!") + try: + os.mkdir(args.tmpdir) + except: + exit(args.tmpdir + ": Folder doesnt exist and cannot be created!") -if not os.access(args.tmpdir,os.W_OK): - exit(args.tmpdir + ": Folder is not writeable!") +if not os.access(args.tmpdir, os.W_OK): + exit(args.tmpdir + ": Folder is not writeable!") -bedbedgraph = pandas.read_csv(args.bedbedgraph,header=None,sep="\t") -bedbedgraph.columns=["group","sampleName","bed","bedgraph","scalingFactor","fragmentsBed"] +bedbedgraph = pandas.read_csv(args.bedbedgraph, header=None, sep="\t") +bedbedgraph.columns = [ + "group", + "sampleName", + "bed", + "bedgraph", + "scalingFactor", + "fragmentsBed", +] -concatfile = os.path.join(args.tmpdir,randstr+".concat.bed") -cmd = "cat " + " ".join(list(bedbedgraph["bed"])) + " | cut -f1-3 | sort -k1,1 -k2,2n > " + concatfile +concatfile = os.path.join(args.tmpdir, randstr + ".concat.bed") +cmd = ( + "cat " + + " ".join(list(bedbedgraph["bed"])) + + " | cut -f1-3 | sort -k1,1 -k2,2n > " + + concatfile +) concat = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) -mergedfile = os.path.join(args.tmpdir,randstr+".merged.bed") +mergedfile = os.path.join(args.tmpdir, randstr + ".merged.bed") cmd = "bedtools merge -i " + concatfile + " > " + mergedfile merge = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) -sampleinfofile = open(args.sampleinfo,'w') +sampleinfofile = open(args.sampleinfo, "w") sampleinfofile.write("samplename\tgroup\n") -counts=dict() # bedgraph AUC based counts -fcounts=dict() # fragment based counts -for i,row in bedbedgraph.iterrows(): - group = row['group'] - bg = row['bedgraph'] - samplename = row['sampleName'] - sf = row['scalingFactor'] - fbed = row['fragmentsBed'] - filepath,filename = os.path.split(bg) - basename,ext = os.path.splitext(filename) - sampleinfofile.write("%s\t%s\n"%(samplename,group)) - cmd = "bedtools intersect -wa -wb -a "+bg+" -b "+mergedfile - print("Now running: %s"%(cmd)) - bt = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) - peakCounts=dict() - for l in bt.stdout.split("\n"): - l=l.strip().split("\t") - if len(l) != 7: - continue - peakID = l[4] + ":" + l[5] + "-" + l[6] - score = (int(l[2]) - int(l[1])) * float(l[3]) - if not peakID in 
peakCounts: - peakCounts[peakID]=0.0 - peakCounts[peakID]+=score - counts[samplename] = pandas.DataFrame.from_dict(peakCounts,orient='index',columns=[samplename]) - cmd = "bedmap --echo-ref-name --count "+mergedfile+" "+fbed - print("Now running %s"%(cmd)) - bo = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) - fPeakCounts=dict() - for l in bo.stdout.split('\n'): - l = l.strip().split('|') - if len(l)!=2: - continue - fPeakCounts[l[0]]=l[1] - fcounts[samplename] = pandas.DataFrame.from_dict(fPeakCounts,orient='index',columns=[samplename]) +counts = dict() # bedgraph AUC based counts +fcounts = dict() # fragment based counts +for i, row in bedbedgraph.iterrows(): + group = row["group"] + bg = row["bedgraph"] + samplename = row["sampleName"] + sf = row["scalingFactor"] + fbed = row["fragmentsBed"] + filepath, filename = os.path.split(bg) + basename, ext = os.path.splitext(filename) + sampleinfofile.write("%s\t%s\n" % (samplename, group)) + cmd = "bedtools intersect -wa -wb -a " + bg + " -b " + mergedfile + print("Now running: %s" % (cmd)) + bt = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + peakCounts = dict() + for l in bt.stdout.split("\n"): + l = l.strip().split("\t") + if len(l) != 7: + continue + peakID = l[4] + ":" + l[5] + "-" + l[6] + score = (int(l[2]) - int(l[1])) * float(l[3]) + if not peakID in peakCounts: + peakCounts[peakID] = 0.0 + peakCounts[peakID] += score + counts[samplename] = pandas.DataFrame.from_dict( + peakCounts, orient="index", columns=[samplename] + ) + cmd = "bedmap --echo-ref-name --count " + mergedfile + " " + fbed + print("Now running %s" % (cmd)) + bo = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) + fPeakCounts = dict() + for l in bo.stdout.split("\n"): + l = l.strip().split("|") + if len(l) != 2: + continue + fPeakCounts[l[0]] = l[1] + fcounts[samplename] = pandas.DataFrame.from_dict( + fPeakCounts, orient="index", columns=[samplename] + ) sampleinfofile.close() -counts_matrix_df = functools.reduce(lambda x,y:x.merge(y,left_index=True,right_index=True),counts.values()) -counts_matrix_df.to_csv(args.countsmatrix,sep="\t",header=True,index=True,index_label="peakID") -fcounts_matrix_df = functools.reduce(lambda x,y:x.merge(y,left_index=True,right_index=True),fcounts.values()) -fcounts_matrix_df.to_csv(args.fragmentscountsmatrix,sep="\t",header=True,index=True,index_label="peakID") +counts_matrix_df = functools.reduce( + lambda x, y: x.merge(y, left_index=True, right_index=True), counts.values() +) +counts_matrix_df.to_csv( + args.countsmatrix, sep="\t", header=True, index=True, index_label="peakID" +) +fcounts_matrix_df = functools.reduce( + lambda x, y: x.merge(y, left_index=True, right_index=True), fcounts.values() +) +fcounts_matrix_df.to_csv( + args.fragmentscountsmatrix, sep="\t", header=True, index=True, index_label="peakID" +) -#os.rmdir(args.tmpdir) \ No newline at end of file +# os.rmdir(args.tmpdir) diff --git a/workflow/scripts/_make_library_norm_table.R b/workflow/scripts/_make_library_norm_table.R index 7279e87..6156915 100644 --- a/workflow/scripts/_make_library_norm_table.R +++ b/workflow/scripts/_make_library_norm_table.R @@ -9,72 +9,80 @@ suppressPackageStartupMessages(library("tidyverse")) parser <- ArgumentParser() -parser$add_argument("--yamlDir", type="character", required=TRUE, - help="absolute path to location of yamls") -parser$add_argument("--excludeFromName", type="character", required=TRUE, - help="sample info as TSV") 
-parser$add_argument("--outTable", type="character", required=TRUE, - help="absolute path to output table TSV file.") +parser$add_argument("--yamlDir", + type = "character", required = TRUE, + help = "absolute path to location of yamls" +) +parser$add_argument("--excludeFromName", + type = "character", required = TRUE, + help = "sample info as TSV" +) +parser$add_argument("--outTable", + type = "character", required = TRUE, + help = "absolute path to output table TSV file." +) args <- parser$parse_args() # assign params variables -debug=0 -if (debug==1){ - yaml_dir="~/../../Volumes/CUTRUN/analysis/CS030666/carlisle_230118/results/alignment_stats" - exclude_from_name = ".alignment_stats.yaml" - scale_constant = 1e6 - out_table = "test.tsv" +debug <- 0 +if (debug == 1) { + yaml_dir <- "~/../../Volumes/CUTRUN/analysis/CS030666/carlisle_230118/results/alignment_stats" + exclude_from_name <- ".alignment_stats.yaml" + scale_constant <- 1e6 + out_table <- "test.tsv" } else { - yaml_dir=args$yamlDir - exclude_from_name=args$excludeFromName - scale_constant=args$scaleConstant - out_table=args$outTable + yaml_dir <- args$yamlDir + exclude_from_name <- args$excludeFromName + scale_constant <- args$scaleConstant + out_table <- args$outTable } # grab all files setwd(yaml_dir) -files=list.files(path = yaml_dir, - pattern = "*.yaml") +files <- list.files( + path = yaml_dir, + pattern = "*.yaml" +) # read in df and prep -df = data.frame() -for ( f in files ){ - sampleName <- gsub(pattern = exclude_from_name, replacement = "",f) +df <- data.frame() +for (f in files) { + sampleName <- gsub(pattern = exclude_from_name, replacement = "", f) readinyaml <- as.data.frame(read_yaml(f, fileEncoding = "UTF-8")) - rownames(readinyaml)=c(sampleName) - df = rbind(df,readinyaml) + rownames(readinyaml) <- c(sampleName) + df <- rbind(df, readinyaml) } # run once for dedup, once for nondedup -final_df=data.frame() -for (dedup_type in c("dedup_nreads_genome","no_dedup_nreads_genome")){ +final_df <- data.frame() +for (dedup_type in c("dedup_nreads_genome", "no_dedup_nreads_genome")) { # determine scaling factor dependent on library size - col_median=median(df[,dedup_type]) - if (col_median>1000000000){ - lib_factor=1e8 - }else if (col_median>100000000){ - lib_factor=1e7 - }else if (col_median>10000000){ - lib_factor=1e6 - } else if (col_median>1000000){ - lib_factor=1e5 - } else if (col_median>100000){ - lib_factor=1e4 - } else if (col_median>10000){ - lib_factor=1e3 - } else if (col_median>1000){ - lib_factor=1e2 + col_median <- median(df[, dedup_type]) + if (col_median > 1000000000) { + lib_factor <- 1e8 + } else if (col_median > 100000000) { + lib_factor <- 1e7 + } else if (col_median > 10000000) { + lib_factor <- 1e6 + } else if (col_median > 1000000) { + lib_factor <- 1e5 + } else if (col_median > 100000) { + lib_factor <- 1e4 + } else if (col_median > 10000) { + lib_factor <- 1e3 + } else if (col_median > 1000) { + lib_factor <- 1e2 } else { - lib_factor=1e1 + lib_factor <- 1e1 } - df$library_size=df[,dedup_type]/lib_factor - df$sampleid=rownames(df) - df$dedup_type=strsplit(dedup_type,"_")[[1]][1] - df$dedup_type=gsub("no","no_dedup",df$dedup_type) + df$library_size <- df[, dedup_type] / lib_factor + df$sampleid <- rownames(df) + df$dedup_type <- strsplit(dedup_type, "_")[[1]][1] + df$dedup_type <- gsub("no", "no_dedup", df$dedup_type) # create final df - select_cols=c("sampleid","library_size","dedup_type") - final_df=rbind(final_df,df[,select_cols]) + select_cols <- c("sampleid", "library_size", "dedup_type") + 
final_df <- rbind(final_df, df[, select_cols]) } -write.table(final_df,out_table,row.names = FALSE) +write.table(final_df, out_table, row.names = FALSE) diff --git a/workflow/scripts/_make_results_bed.py b/workflow/scripts/_make_results_bed.py index 2cbe8bb..d58c78f 100644 --- a/workflow/scripts/_make_results_bed.py +++ b/workflow/scripts/_make_results_bed.py @@ -1,43 +1,70 @@ #!/usr/bin/env python3 -import argparse,pandas,os,yaml,subprocess,uuid +import argparse, pandas, os, yaml, subprocess, uuid from operator import index from email import header -randbed=str(uuid.uuid4())+".bed" +randbed = str(uuid.uuid4()) + ".bed" -parser = argparse.ArgumentParser(description='create bed') -parser.add_argument('--results', required=True, type=str, help="results bed file") -parser.add_argument('--fdr_cutoff', required=True, type=float, help="FDR cutoff") -parser.add_argument('--log2FC_cutoff', required=True, type=float, help="log2FC cutoff") -parser.add_argument('--elbowyaml', required=True, type=str, help="elbow limits") -parser.add_argument('--bed', required=True, type=str, help="output bed with colors") +parser = argparse.ArgumentParser(description="create bed") +parser.add_argument("--results", required=True, type=str, help="results bed file") +parser.add_argument("--fdr_cutoff", required=True, type=float, help="FDR cutoff") +parser.add_argument("--log2FC_cutoff", required=True, type=float, help="log2FC cutoff") +parser.add_argument("--elbowyaml", required=True, type=str, help="elbow limits") +parser.add_argument("--bed", required=True, type=str, help="output bed with colors") args = parser.parse_args() -df = pandas.read_csv(args.results,sep="\t",header=0,usecols=['seqnames','start','end','log2FoldChange','padj']) -df['color'] = ["160,160,160"]*len(df.index) -df['name'] = ["."]*len(df.index) -df['strand'] = ["+"]*len(df.index) -df['score'] = [0]*len(df.index) -df['seven'] = [0]*len(df.index) -df['eight'] = [0]*len(df.index) +df = pandas.read_csv( + args.results, + sep="\t", + header=0, + usecols=["seqnames", "start", "end", "log2FoldChange", "padj"], +) +df["color"] = ["160,160,160"] * len(df.index) +df["name"] = ["."] * len(df.index) +df["strand"] = ["+"] * len(df.index) +df["score"] = [0] * len(df.index) +df["seven"] = [0] * len(df.index) +df["eight"] = [0] * len(df.index) -df.loc[df['log2FoldChange'] < 0,'strand'] = "-" +df.loc[df["log2FoldChange"] < 0, "strand"] = "-" with open(args.elbowyaml) as f: elbowdf = yaml.safe_load(f) -elbowdown = elbowdf['low_limit'] -elbowup = elbowdf['up_limit'] +elbowdown = elbowdf["low_limit"] +elbowup = elbowdf["up_limit"] if abs(elbowdown) < args.log2FC_cutoff: - df.loc[(df['padj'] < args.fdr_cutoff) & (df['log2FoldChange'] < elbowdown),'color']="229,255,204" -df.loc[(df['padj'] < args.fdr_cutoff) & (df['log2FoldChange'] < args.log2FC_cutoff),'color']="128,255,0" + df.loc[ + (df["padj"] < args.fdr_cutoff) & (df["log2FoldChange"] < elbowdown), "color" + ] = "229,255,204" +df.loc[ + (df["padj"] < args.fdr_cutoff) & (df["log2FoldChange"] < args.log2FC_cutoff), + "color", +] = "128,255,0" if elbowup < args.log2FC_cutoff: - df.loc[(df['padj'] < args.fdr_cutoff) & (df['log2FoldChange'] > elbowup),'color']="255,153,51" -df.loc[(df['padj'] < args.fdr_cutoff) & (df['log2FoldChange'] > args.log2FC_cutoff),'color']="255,51,51" -df = df.reindex(columns=['seqnames','start','end','name','score','strand','seven','eight','color']) + df.loc[ + (df["padj"] < args.fdr_cutoff) & (df["log2FoldChange"] > elbowup), "color" + ] = "255,153,51" +df.loc[ + (df["padj"] < args.fdr_cutoff) 
& (df["log2FoldChange"] > args.log2FC_cutoff), + "color", +] = "255,51,51" +df = df.reindex( + columns=[ + "seqnames", + "start", + "end", + "name", + "score", + "strand", + "seven", + "eight", + "color", + ] +) -df.to_csv(randbed,sep="\t",header=False,index=False) +df.to_csv(randbed, sep="\t", header=False, index=False) -cmd = "sort -S 40G -T /dev/shm -k1,1 -k2,2n "+randbed+" > "+args.bed -s = subprocess.run(cmd,shell=True,check=True,text=True,capture_output=True) +cmd = "sort -S 40G -T /dev/shm -k1,1 -k2,2n " + randbed + " > " + args.bed +s = subprocess.run(cmd, shell=True, check=True, text=True, capture_output=True) os.remove(randbed) diff --git a/workflow/scripts/_plot_correlation.R b/workflow/scripts/_plot_correlation.R index 3727320..60bf691 100644 --- a/workflow/scripts/_plot_correlation.R +++ b/workflow/scripts/_plot_correlation.R @@ -1,92 +1,102 @@ -# Plot correlation heatmap and PCA - -library(ggplot2) -library(reshape2) -library(ComplexHeatmap) -library(RColorBrewer) -library(circlize) - -args = commandArgs(trailingOnly=TRUE) - -in.corr = args[1] -in.pca = args[2] -in.read_depth = args[3] -dupstatus = args[4] -out.corr = args[5] -out.pca = args[6] - -# Read deeptools output -print("Load heatmap") -heatmap = read.table(in.corr,check.names = FALSE) -colnames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",colnames(heatmap))) -rownames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",rownames(heatmap))) -print(heatmap) - -print("Load PCA") -pca = read.table(in.pca,header=TRUE,check.names = FALSE) -print(pca) - -# Create metadata table -print("Create metadata table") -metadata = data.frame(Replicate=colnames(heatmap)) -metadata$Sample = gsub("_[1-9]$","",metadata$Replicate) - -# Add read depth -read_depth = read.table(in.read_depth,sep='\t',header=TRUE) -metadata = merge(metadata,read_depth[,c("sample_name","nreads","no_dedup_nreads_genome","dedup_nreads_genome")], - by.x="Replicate",by.y="sample_name") -if(dupstatus == "dedup"){ - metadata$Depth = metadata$dedup_nreads_genome/1000000 -}else{ - metadata$Depth = metadata$no_dedup_nreads_genome/1000000 -} -metadata = metadata[match(rownames(heatmap),metadata$Replicate),] -print(metadata) - -# Create colors -sample_colors = setNames(colorRampPalette(brewer.pal(11, "Spectral"))(length(unique(metadata$Sample))), - unique(metadata$Sample)) -depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black")) - -colors = list(Sample=sample_colors, - Depth=depth_colors) - -# Heatmap - -## Plot heatmap -print("Print heatmap") -png(out.corr,width=600,height=600) -print(Heatmap(heatmap, - top_annotation = HeatmapAnnotation(df=metadata[,c("Sample","Depth")], - col=colors[c("Sample","Depth")]), - show_row_dend = FALSE, - show_row_names = FALSE, - col = colorRamp2(c(0,1), c("white", "blue")), - heatmap_legend_param = list( - title = "Pearson\ncorrelation" - ), - column_dend_height = unit(0.8,"inches"), - name="Pearson correlation, genome-wide coverage (10kb bins)")) -dev.off() - -# PCA -print("Print PCA") - -## Calculate variance from eigenvalue -eigenvalue = pca[,c("Component","Eigenvalue")] -eigenvalue$Variance = eigenvalue$Eigenvalue/sum(eigenvalue$Eigenvalue)*100 - -pca = melt(pca[,1:dim(pca)[2]-1],id.var="Component",variable.name="Replicate",value.name="Loading") -pca = dcast(pca,Replicate~Component,value.var="Loading") -pca$Replicate = gsub("[.]no_dedup","",gsub("[.]dedup","",pca$Replicate)) -pca = merge(pca,metadata,by="Replicate") - -## Plot PCA -png(out.pca) -print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + 
geom_point(size=3) + - xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) + - ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) + - scale_color_manual(values=sample_colors) + - ggtitle("PCA, genome-wide coverage (10kb bins)") + - theme_classic() + theme(legend.position="bottom")) -dev.off() +# Plot correlation heatmap and PCA + +library(ggplot2) +library(reshape2) +library(ComplexHeatmap) +library(RColorBrewer) +library(circlize) + +args <- commandArgs(trailingOnly = TRUE) + +in.corr <- args[1] +in.pca <- args[2] +in.read_depth <- args[3] +dupstatus <- args[4] +out.corr <- args[5] +out.pca <- args[6] + +# Read deeptools output +print("Load heatmap") +heatmap <- read.table(in.corr, check.names = FALSE) +colnames(heatmap) <- gsub("[.]no_dedup", "", gsub("[.]dedup", "", colnames(heatmap))) +rownames(heatmap) <- gsub("[.]no_dedup", "", gsub("[.]dedup", "", rownames(heatmap))) +print(heatmap) + +print("Load PCA") +pca <- read.table(in.pca, header = TRUE, check.names = FALSE) +print(pca) + +# Create metadata table +print("Create metadata table") +metadata <- data.frame(Replicate = colnames(heatmap)) +metadata$Sample <- gsub("_[1-9]$", "", metadata$Replicate) + +# Add read depth +read_depth <- read.table(in.read_depth, sep = "\t", header = TRUE) +metadata <- merge(metadata, read_depth[, c("sample_name", "nreads", "no_dedup_nreads_genome", "dedup_nreads_genome")], + by.x = "Replicate", by.y = "sample_name" +) +if (dupstatus == "dedup") { + metadata$Depth <- metadata$dedup_nreads_genome / 1000000 +} else { + metadata$Depth <- metadata$no_dedup_nreads_genome / 1000000 +} +metadata <- metadata[match(rownames(heatmap), metadata$Replicate), ] +print(metadata) + +# Create colors +sample_colors <- setNames( + colorRampPalette(brewer.pal(11, "Spectral"))(length(unique(metadata$Sample))), + unique(metadata$Sample) +) +depth_colors <- colorRamp2(breaks = c(0, 30, 60), colors = c("white", "grey50", "black")) + +colors <- list( + Sample = sample_colors, + Depth = depth_colors +) + +# Heatmap + +## Plot heatmap +print("Print heatmap") +png(out.corr, width = 600, height = 600) +print(Heatmap(heatmap, + top_annotation = HeatmapAnnotation( + df = metadata[, c("Sample", "Depth")], + col = colors[c("Sample", "Depth")] + ), + show_row_dend = FALSE, + show_row_names = FALSE, + col = colorRamp2(c(0, 1), c("white", "blue")), + heatmap_legend_param = list( + title = "Pearson\ncorrelation" + ), + column_dend_height = unit(0.8, "inches"), + name = "Pearson correlation, genome-wide coverage (10kb bins)" +)) +dev.off() + +# PCA +print("Print PCA") + +## Calculate variance from eigenvalue +eigenvalue <- pca[, c("Component", "Eigenvalue")] +eigenvalue$Variance <- eigenvalue$Eigenvalue / sum(eigenvalue$Eigenvalue) * 100 + +pca <- melt(pca[, 1:dim(pca)[2] - 1], id.var = "Component", variable.name = "Replicate", value.name = "Loading") +pca <- dcast(pca, Replicate ~ Component, value.var = "Loading") +pca$Replicate <- gsub("[.]no_dedup", "", gsub("[.]dedup", "", pca$Replicate)) +pca <- merge(pca, metadata, by = "Replicate") + +## Plot PCA +png(out.pca) +print(ggplot(pca, aes(x = `1`, y = `2`, color = Sample)) + + geom_point(size = 3) + + xlab(paste("PC1 (", round(eigenvalue$Variance[1], 1), "%)", sep = "")) + + ylab(paste("PC2 (", round(eigenvalue$Variance[2], 1), "%)", sep = "")) + + scale_color_manual(values = sample_colors) + + ggtitle("PCA, genome-wide coverage (10kb bins)") + + theme_classic() + + theme(legend.position = "bottom")) +dev.off() diff --git 
a/workflow/scripts/_plot_feature_enrichment.R b/workflow/scripts/_plot_feature_enrichment.R index 343976f..67ef5b5 100644 --- a/workflow/scripts/_plot_feature_enrichment.R +++ b/workflow/scripts/_plot_feature_enrichment.R @@ -1,47 +1,58 @@ -# Plot HOMER enrichment - -library(reshape2) -library(ggplot2) -library(plyr) -library(ComplexHeatmap) -library(circlize) - -theme_set(theme_bw()) - -args = commandArgs(trailingOnly=TRUE) - -peaks.dir = args[1] -peak_mode = args[2] -dupstatus = args[3] -fig = args[4] - -# Load HOMER annotation summary -homer.files = list.files(path=peaks.dir,pattern="annotation.summary", - recursive=TRUE,full.names = TRUE) -homer = lapply(homer.files,function(x) read.table(x,sep='\t',header=TRUE,check.names = FALSE,nrows=14)) -names(homer) = gsub("gopeaks/|macs2/|seacr/","", - gsub(".annotation.summary","", - gsub("/annotation/homer","", - gsub("^.*results/peaks/","",homer.files)))) -homer = lapply(homer,function(x) x[1:rownames(x[which(x$Annotation == "Annotation"),])-1,]) -homer = ldply(homer,.id="File") - -homer = cbind(homer,colsplit(homer$File,"/",c("Threshold","Sample"))) -homer = cbind(homer,colsplit(homer$Sample,"[.]",c("Comparison","Duplication","Caller"))) -homer = cbind(homer,colsplit(homer$Comparison,"_vs_",c("Replicate","Control"))) -homer = homer[which(homer$Duplication == dupstatus & homer$Caller == peak_mode),] -homer$`LogP enrichment (+values depleted)` = as.numeric(homer$`LogP enrichment (+values depleted)`) - -# Plot enrichment heatmap -homer = dcast(homer,Replicate~Annotation,value.var="LogP enrichment (+values depleted)") -rownames(homer) = homer$Replicate -homer = homer[,2:dim(homer)[2]] - -png(fig,width = 700, height = 400) -hm = Heatmap(homer, - name="LogP enrichment\n(+values depleted)", - col=colorRamp2(breaks=c(-70000,-50,0,50,70000), - colors=c("red","yellow","white","green","blue")), - width=unit(100, "mm")) -draw(hm,heatmap_legend_side = "left") -dev.off() +# Plot HOMER enrichment + +library(reshape2) +library(ggplot2) +library(plyr) +library(ComplexHeatmap) +library(circlize) + +theme_set(theme_bw()) + +args <- commandArgs(trailingOnly = TRUE) + +peaks.dir <- args[1] +peak_mode <- args[2] +dupstatus <- args[3] +fig <- args[4] + +# Load HOMER annotation summary +homer.files <- list.files( + path = peaks.dir, pattern = "annotation.summary", + recursive = TRUE, full.names = TRUE +) +homer <- lapply(homer.files, function(x) read.table(x, sep = "\t", header = TRUE, check.names = FALSE, nrows = 14)) +names(homer) <- gsub( + "gopeaks/|macs2/|seacr/", "", + gsub( + ".annotation.summary", "", + gsub( + "/annotation/homer", "", + gsub("^.*results/peaks/", "", homer.files) + ) + ) +) +homer <- lapply(homer, function(x) x[1:rownames(x[which(x$Annotation == "Annotation"), ]) - 1, ]) +homer <- ldply(homer, .id = "File") + +homer <- cbind(homer, colsplit(homer$File, "/", c("Threshold", "Sample"))) +homer <- cbind(homer, colsplit(homer$Sample, "[.]", c("Comparison", "Duplication", "Caller"))) +homer <- cbind(homer, colsplit(homer$Comparison, "_vs_", c("Replicate", "Control"))) +homer <- homer[which(homer$Duplication == dupstatus & homer$Caller == peak_mode), ] +homer$`LogP enrichment (+values depleted)` <- as.numeric(homer$`LogP enrichment (+values depleted)`) + +# Plot enrichment heatmap +homer <- dcast(homer, Replicate ~ Annotation, value.var = "LogP enrichment (+values depleted)") +rownames(homer) <- homer$Replicate +homer <- homer[, 2:dim(homer)[2]] + +png(fig, width = 700, height = 400) +hm <- Heatmap(homer, + name = "LogP enrichment\n(+values 
depleted)", + col = colorRamp2( + breaks = c(-70000, -50, 0, 50, 70000), + colors = c("red", "yellow", "white", "green", "blue") + ), + width = unit(100, "mm") +) +draw(hm, heatmap_legend_side = "left") +dev.off() diff --git a/workflow/scripts/_plot_peak_counts.R b/workflow/scripts/_plot_peak_counts.R index eee5774..263441b 100644 --- a/workflow/scripts/_plot_peak_counts.R +++ b/workflow/scripts/_plot_peak_counts.R @@ -1,64 +1,68 @@ -# Plot number of peaks called per sample by caller, duplication status, and threshold - -library(reshape2) -library(ggplot2) -library(plyr) -library(openxlsx) - -theme_set(theme_bw()) - -args = commandArgs(trailingOnly=TRUE) - -input = args[1] - -# Load number of peaks -print("Load peaks") -peaks = read.table(input,sep="") -colnames(peaks) = c("Peaks","File") -peaks = peaks[which(peaks$File != "total"),] -peaks$File = gsub(".peaks.bed$","",gsub("/peak_output","",gsub("^.*results/peaks/","",peaks$File))) -peaks = cbind(peaks,colsplit(peaks$File,"/",c("Threshold","Caller","Sample"))) -peaks = cbind(peaks,colsplit(peaks$Sample,"[.]",c("Comparison","Duplication","Mode"))) -peaks$Caller = paste(peaks$Caller,peaks$Mode,sep="_") -peaks = peaks[,c("Comparison","Caller","Duplication","Threshold","Peaks")] -peaks = cbind(peaks,colsplit(peaks$Comparison,"_vs_",c("Replicate","Control"))) -peaks$Sample = gsub("_[1-9]$","",peaks$Replicate) - -setwd(args[2]) - -# Write out table -print("Write table") -peak_output = dcast(peaks,Threshold+Duplication+Comparison~Caller,value.var="Peaks") -peak_output = peak_output[order(peak_output$Threshold,peak_output$Duplication,peak_output$Comparison),] -write.xlsx(peak_output,file="Peak counts.xlsx") - -# For each threshold, plot number of peaks -for (threshold in unique(peaks$Threshold)){ - print(threshold) -peaks_threshold = peaks[which(peaks$Threshold == threshold),] - -# Compare peaks with and without duplication -peaks_threshold_dup = dcast(peaks_threshold,Replicate+Caller~Duplication,value.var="Peaks") -cor=round(as.numeric(unlist(cor.test(peaks_threshold_dup$dedup,peaks_threshold_dup$no_dedup))["estimate.cor"]),2) -peaks.max=max(max(peaks_threshold_dup$dedup),max(peaks_threshold_dup$no_dedup)) - -png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300) - print(ggplot(peaks_threshold_dup,aes(x=log10(no_dedup),y=log10(dedup))) + geom_point(aes(color=Caller)) + - geom_abline(intercept=0,slope=1,linetype="dashed") + - xlab("log10(Peaks), no deduplication") + ylab("log10(Peaks), deduplication") + - xlim(0,log10(peaks.max)+0.5) + ylim(0,log10(peaks.max)+0.5) + - ggtitle(paste("Peaks by read duplication, q-value threshold ",threshold,sep=""), - subtitle=paste("Pearson correlation: ",cor,sep=""))) -dev.off() - -# Plot summary across callers -width=(length(unique(peaks_threshold$Sample))*50)+100 -height=(length(unique(peaks_threshold$Caller))*100)+100 -png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height) - print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) + - geom_boxplot() + - facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") + - theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + - ggtitle(paste("Peaks,\nq-value threshold ",threshold,",\ndeduplicated reads",sep=""))) -dev.off() -} +# Plot number of peaks called per sample by caller, duplication status, and threshold + +library(reshape2) +library(ggplot2) +library(plyr) +library(openxlsx) + +theme_set(theme_bw()) + +args <- 
commandArgs(trailingOnly = TRUE) + +input <- args[1] + +# Load number of peaks +print("Load peaks") +peaks <- read.table(input, sep = "") +colnames(peaks) <- c("Peaks", "File") +peaks <- peaks[which(peaks$File != "total"), ] +peaks$File <- gsub(".peaks.bed$", "", gsub("/peak_output", "", gsub("^.*results/peaks/", "", peaks$File))) +peaks <- cbind(peaks, colsplit(peaks$File, "/", c("Threshold", "Caller", "Sample"))) +peaks <- cbind(peaks, colsplit(peaks$Sample, "[.]", c("Comparison", "Duplication", "Mode"))) +peaks$Caller <- paste(peaks$Caller, peaks$Mode, sep = "_") +peaks <- peaks[, c("Comparison", "Caller", "Duplication", "Threshold", "Peaks")] +peaks <- cbind(peaks, colsplit(peaks$Comparison, "_vs_", c("Replicate", "Control"))) +peaks$Sample <- gsub("_[1-9]$", "", peaks$Replicate) + +setwd(args[2]) + +# Write out table +print("Write table") +peak_output <- dcast(peaks, Threshold + Duplication + Comparison ~ Caller, value.var = "Peaks") +peak_output <- peak_output[order(peak_output$Threshold, peak_output$Duplication, peak_output$Comparison), ] +write.xlsx(peak_output, file = "Peak counts.xlsx") + +# For each threshold, plot number of peaks +for (threshold in unique(peaks$Threshold)) { + print(threshold) + peaks_threshold <- peaks[which(peaks$Threshold == threshold), ] + + # Compare peaks with and without duplication + peaks_threshold_dup <- dcast(peaks_threshold, Replicate + Caller ~ Duplication, value.var = "Peaks") + cor <- round(as.numeric(unlist(cor.test(peaks_threshold_dup$dedup, peaks_threshold_dup$no_dedup))["estimate.cor"]), 2) + peaks.max <- max(max(peaks_threshold_dup$dedup), max(peaks_threshold_dup$no_dedup)) + + png(paste("duplication_corr.", threshold, ".png", sep = ""), width = 350, height = 300) + print(ggplot(peaks_threshold_dup, aes(x = log10(no_dedup), y = log10(dedup))) + + geom_point(aes(color = Caller)) + + geom_abline(intercept = 0, slope = 1, linetype = "dashed") + + xlab("log10(Peaks), no deduplication") + + ylab("log10(Peaks), deduplication") + + xlim(0, log10(peaks.max) + 0.5) + + ylim(0, log10(peaks.max) + 0.5) + + ggtitle(paste("Peaks by read duplication, q-value threshold ", threshold, sep = ""), + subtitle = paste("Pearson correlation: ", cor, sep = "") + )) + dev.off() + + # Plot summary across callers + width <- (length(unique(peaks_threshold$Sample)) * 50) + 100 + height <- (length(unique(peaks_threshold$Caller)) * 100) + 100 + png(paste("peaks_by_caller.", threshold, ".png", sep = ""), width = width, height = height) + print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"), ], aes(x = Sample, y = Peaks)) + + geom_boxplot() + + facet_wrap(~Caller, nrow = length(unique(peaks_threshold$Caller)), scales = "free_y") + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + ggtitle(paste("Peaks,\nq-value threshold ", threshold, ",\ndeduplicated reads", sep = ""))) + dev.off() +} diff --git a/workflow/scripts/_plot_results_venn.R b/workflow/scripts/_plot_results_venn.R index be23302..615d011 100644 --- a/workflow/scripts/_plot_results_venn.R +++ b/workflow/scripts/_plot_results_venn.R @@ -1,76 +1,94 @@ #!/usr/bin/env Rscript -rm(list=ls()) +rm(list = ls()) # package list -list.of.packages=c("argparse","ggvenn") +list.of.packages <- c("argparse", "ggvenn") # load packages invisible(lapply(list.of.packages, library, character.only = TRUE)) # handle debugging -debug="FALSE" +debug <- "FALSE" # files=list.files(path=path) # debug="TRUE" # 
path="~/Documents/Projects/ccbr1155/CS030586/contrasts/siSmyd3_2m_Smyd3_0.25HCHO_500K_vs_siNC_2m_Smyd3_0.25HCHO_500K__dedup__narrowPeak" # path="/data/CCBR/projects/ccbr1155/CS030586_CARAP/diff" -if (debug){ - auc = grep(pattern = "_AUCbased_diffresults.txt",files)[1] - aucresults = paste(path,files[auc],sep="/") - frag = grep(pattern = "_fragmentsbased_diffresults.txt",files)[1] - fragmentsresults = paste(path,files[frag],sep="/") +if (debug) { + auc <- grep(pattern = "_AUCbased_diffresults.txt", files)[1] + aucresults <- paste(path, files[auc], sep = "/") + frag <- grep(pattern = "_fragmentsbased_diffresults.txt", files)[1] + fragmentsresults <- paste(path, files[frag], sep = "/") } else { # create parser object parser <- ArgumentParser() - - parser$add_argument("--aucresults", type="character", required=TRUE, - help="path to aucresults") - parser$add_argument("--fragmentsresults", type="character", required=TRUE, - help="path to fragmentsresults") - parser$add_argument("--pdf", type="character", required=TRUE, - help="output pdf file") - parser$add_argument("--title", type="character", required=TRUE, - help="plot title in pdf") - parser$add_argument("--fdr_cutoff", type="double", default=0.05, required=FALSE, - help="FDR cutoff [default %(default)s]") - parser$add_argument("--log2fc_cutoff", type="double", default=0.59, required=FALSE, - help="log2foldchange cutoff [default %(default)s]") - + + parser$add_argument("--aucresults", + type = "character", required = TRUE, + help = "path to aucresults" + ) + parser$add_argument("--fragmentsresults", + type = "character", required = TRUE, + help = "path to fragmentsresults" + ) + parser$add_argument("--pdf", + type = "character", required = TRUE, + help = "output pdf file" + ) + parser$add_argument("--title", + type = "character", required = TRUE, + help = "plot title in pdf" + ) + parser$add_argument("--fdr_cutoff", + type = "double", default = 0.05, required = FALSE, + help = "FDR cutoff [default %(default)s]" + ) + parser$add_argument("--log2fc_cutoff", + type = "double", default = 0.59, required = FALSE, + help = "log2foldchange cutoff [default %(default)s]" + ) + args <- parser$parse_args() - aucresults=args$aucresults - fragmentsresults=args$fragmentsresults + aucresults <- args$aucresults + fragmentsresults <- args$fragmentsresults } -readandfilter <- function(resultsfile){ - df = read.csv(resultsfile, - header = TRUE, - row.names = NULL, - sep = "\t", - check.names = FALSE, - comment.char = "#", - strip.white = TRUE) +readandfilter <- function(resultsfile) { + df <- read.csv(resultsfile, + header = TRUE, + row.names = NULL, + sep = "\t", + check.names = FALSE, + comment.char = "#", + strip.white = TRUE + ) df[is.na(df)] <- 1 - df$significance = "NS" - k_fdr = df$padj < args$fdr_cutoff - k_up = df$log2FoldChange > args$log2fc_cutoff - k_down = df$log2FoldChange < (-1 * args$log2fc_cutoff) - if (nrow(df[k_fdr & k_up,]) != 0) {df[k_fdr & k_up,]$significance = "UP"} - if (nrow(df[k_fdr & k_down,]) != 0) {df[k_fdr & k_down,]$significance = "DOWN"} + df$significance <- "NS" + k_fdr <- df$padj < args$fdr_cutoff + k_up <- df$log2FoldChange > args$log2fc_cutoff + k_down <- df$log2FoldChange < (-1 * args$log2fc_cutoff) + if (nrow(df[k_fdr & k_up, ]) != 0) { + df[k_fdr & k_up, ]$significance <- "UP" + } + if (nrow(df[k_fdr & k_down, ]) != 0) { + df[k_fdr & k_down, ]$significance <- "DOWN" + } return(df) } -auc_df = readandfilter(aucresults) -frag_df = readandfilter(fragmentsresults) +auc_df <- readandfilter(aucresults) +frag_df <- 
readandfilter(fragmentsresults) -AUC_UP=auc_df[auc_df$significance=="UP",]$peakID -AUC_DOWN=auc_df[auc_df$significance=="DOWN",]$peakID -FRAG_UP=frag_df[frag_df$significance=="UP",]$peakID -FRAG_DOWN=frag_df[frag_df$significance=="DOWN",]$peakID -x<-list(AUC_UP=AUC_UP,AUC_DOWN=AUC_DOWN,FRAG_UP=FRAG_UP,FRAG_DOWN=FRAG_DOWN) +AUC_UP <- auc_df[auc_df$significance == "UP", ]$peakID +AUC_DOWN <- auc_df[auc_df$significance == "DOWN", ]$peakID +FRAG_UP <- frag_df[frag_df$significance == "UP", ]$peakID +FRAG_DOWN <- frag_df[frag_df$significance == "DOWN", ]$peakID +x <- list(AUC_UP = AUC_UP, AUC_DOWN = AUC_DOWN, FRAG_UP = FRAG_UP, FRAG_DOWN = FRAG_DOWN) pdf(args$pdf) ggvenn(x, - stroke_size = 0.4, - set_name_size = 3, - fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF")) + + stroke_size = 0.4, + set_name_size = 3, + fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF") +) + ggtitle(args$title) + theme(plot.title = element_text(size = 8)) dev.off() diff --git a/workflow/scripts/_run_jobby_on_snakemake_log b/workflow/scripts/_run_jobby_on_snakemake_log index a6a41e7..3ca3356 100644 --- a/workflow/scripts/_run_jobby_on_snakemake_log +++ b/workflow/scripts/_run_jobby_on_snakemake_log @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# input: +# input: ## intake snakemake log ## full path to jobby.py script @@ -10,4 +10,4 @@ jobidswc=$(echo $jobids | wc -c) # run jobby script module load python -if [ "$jobidswc" != "1" ];then python $2 $jobids; fi \ No newline at end of file +if [ "$jobidswc" != "1" ];then python $2 $jobids; fi diff --git a/workflow/scripts/_spooker b/workflow/scripts/_spooker index 9d2fd8a..3a9a0d1 100644 --- a/workflow/scripts/_spooker +++ b/workflow/scripts/_spooker @@ -95,4 +95,4 @@ if [[ $ISBIOWULF == true || $ISFRCE == true ]];then else # not biowulf or frce ... so exit exit 0 -fi \ No newline at end of file +fi
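
For context on what the reformatted jobby script is doing, its `to_bytes` helper pairs a unit-to-scaling-factor lookup with a regex whose named groups (`(?P<size>...)`, `(?P<units>...)`) are read back via `match.group("size")` and `match.group("units")`. A minimal standalone sketch of that approach, with illustrative names and only a subset of the unit suffixes:

import math
import re

# Scaling factors for a subset of the binary (KiB) and decimal (KB)
# unit suffixes handled by jobby's size2bytes table.
SIZE2BYTES = {
    "b": 1, "bytes": 1, "byte": 1,
    "k": 1024, "kib": 1024, "kb": 1000,
    "m": 1024**2, "mib": 1024**2, "mb": 1000**2,
    "g": 1024**3, "gib": 1024**3, "gb": 1000**3,
}

def to_bytes_sketch(size):
    """Convert a human-readable size such as '1.5GiB' or '512 mb' to bytes.
    Returns None when the trailing units cannot be parsed."""
    size = size.replace(" ", "")
    match = re.search(r"(?P<size>[0-9.]+)(?P<units>[a-zA-Z]+)$", size)
    if not match:
        return None
    units = match.group("units").lower()
    if units not in SIZE2BYTES:
        return None
    return int(math.ceil(SIZE2BYTES[units] * float(match.group("size"))))

print(to_bytes_sketch("1.5GiB"))  # 1610612736
print(to_bytes_sketch("512 mb"))  # 512000000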
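
The `sacct` handler in the same script has to reconcile per-job rows with the `$SLURM_JOBID.batch` step rows that carry MaxRSS. A rough, self-contained sketch of that join, run here on an illustrative two-line tab-delimited string rather than live scheduler output:

# Illustrative (made-up) sacct-style output: a job row plus its ".batch"
# step row, which is where MaxRSS is reported.
sample_output = (
    "12345679\tbuild_index\tCOMPLETED\tnorm\t8\t16Gn\n"
    "12345679.batch\tbatch\tCOMPLETED\t\t8\t3.2G\n"
)

# First pass: map each job ID to the MaxRSS value on its ".batch" row.
jobid_to_maxrss = {}
for line in sample_output.rstrip("\n").split("\n"):
    fields = line.split("\t")
    if fields[0].endswith(".batch"):
        # "12345679.batch" -> "12345679"; last column holds MaxRSS here.
        jobid_to_maxrss[fields[0].split(".")[0]] = fields[-1]

# Second pass: report only the top-level job rows, falling back to "-"
# when no .batch row exists yet (e.g. the job is still running).
for line in sample_output.rstrip("\n").split("\n"):
    fields = line.split("\t")
    if "." not in fields[0]:
        print(fields[0], jobid_to_maxrss.get(fields[0], "-"))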
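
_make_counts_matrix.py builds one single-column DataFrame per sample and then joins them on peak ID with `functools.reduce`. The same pattern in isolation, using small made-up per-sample count dictionaries:

import functools
import pandas

# Made-up per-sample peak counts keyed by peakID (chrom:start-end).
counts = {
    "sample_A": pandas.DataFrame.from_dict(
        {"chr1:100-200": 12.0, "chr1:500-650": 3.0},
        orient="index", columns=["sample_A"],
    ),
    "sample_B": pandas.DataFrame.from_dict(
        {"chr1:100-200": 7.0, "chr1:500-650": 9.0},
        orient="index", columns=["sample_B"],
    ),
}

# Successively merge every per-sample frame on its index to obtain a
# peaks-by-samples counts matrix, as the workflow script does.
counts_matrix = functools.reduce(
    lambda x, y: x.merge(y, left_index=True, right_index=True), counts.values()
)
print(counts_matrix.to_csv(sep="\t", index_label="peakID"))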
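
_make_results_bed.py assigns BED item colours by combining FDR and log2 fold-change masks with pandas `.loc`. A simplified illustration of that masking pattern on a tiny made-up results frame (the cutoffs and colours are illustrative, and the script's own handling, including the elbow limits, is more involved):

import pandas

# Made-up differential results: padj and log2FoldChange per peak.
df = pandas.DataFrame(
    {"padj": [0.001, 0.2, 0.004], "log2FoldChange": [1.2, -0.8, -2.1]}
)
fdr_cutoff, log2fc_cutoff = 0.05, 0.59

df["color"] = "160,160,160"  # default grey for non-significant peaks
# Significant and up: red; significant and down: green.
df.loc[(df["padj"] < fdr_cutoff) & (df["log2FoldChange"] > log2fc_cutoff), "color"] = "255,51,51"
df.loc[(df["padj"] < fdr_cutoff) & (df["log2FoldChange"] < -log2fc_cutoff), "color"] = "128,255,0"
print(df)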