diff --git a/bin/get_chromhmm_results.py b/bin/get_chromhmm_results.py new file mode 100755 index 00000000..9744dd83 --- /dev/null +++ b/bin/get_chromhmm_results.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# coding: utf-8 + +import argparse +import pandas as pd +import numpy as np + + +parser = argparse.ArgumentParser(description="Process ChromHMM output into bed file of predicted enhancers") + +parser.add_argument("-e", "--emissions", type=str, required=True, help="Path to emission file") +parser.add_argument("-b", "--bed", type=str, required=True, help="Path to bed file") +parser.add_argument("-t", "--threshold", type=float, required=False, default=0.9, help="Threshold for state emissions") +parser.add_argument("-m", "--markers", nargs='+', required=False, default=["H3K27ac", "H3K4me3"], help="ChIP-Seq markers that indicate an enhancer") +parser.add_argument("-o", "--output", type=str, required=True, help="Path to output bed with enhancer positions") + +args = parser.parse_args() + +path_emissions = args.emissions +path_bed = args.bed +threshold = args.threshold +markers = args.markers +output = args.output + + +# Read emissions file for the provided markers +emissions = pd.read_csv(path_emissions, sep = "\t")[["State (Emission order)"] + markers].rename(columns={"State (Emission order)": "State"}) + + +# Read input bed file and remove unecessary columns +bed = pd.read_csv(path_bed, + sep="\t", + skiprows=1, + names=["chr", "start", "end", "state", "score", "strand", "start_1", "end_1", "rgb"] + ).drop(columns=["strand", "score", "start_1", "end_1", "rgb"]) + + +# Keep state if any of the markers is enriched > threshold for this state +states = emissions[np.any([emissions[marker] >= threshold for marker in markers], axis=0)]["State"].tolist() + + +# Subset bed file for selected states +out_bed = bed[np.isin(bed["state"], states)].drop(columns=["state"]) + +# Write output +out_bed.to_csv(output, index=False, sep="\t", header=False) + diff --git a/modules/local/chromhmm/binarize_bams.nf b/modules/local/chromhmm/binarize_bams.nf index c8bac588..83e54169 100644 --- a/modules/local/chromhmm/binarize_bams.nf +++ b/modules/local/chromhmm/binarize_bams.nf @@ -5,19 +5,25 @@ process BINARIZE_BAMS { input: path cellmarkfiletable path bams, stageAs: "reformatted_bams/*" + val organism output: path "binarized_bams" script: """ - //TODO: Get parameters for organism - java -jar ${projectDir}/assets/ChromHMM.jar BinarizeBam \ - ${projectDir}/assets/CHROMSIZES/mm10.txt \ + ${projectDir}/assets/CHROMSIZES/"${organism}".txt \ reformatted_bams \ $cellmarkfiletable \ binarized_bams """ + + stub: + """ + mkdir binarized_bams + touch binarized_bams/chr1.txt + touch binarized_bams/chr2.txt + """ } diff --git a/modules/local/chromhmm/get_output.nf b/modules/local/chromhmm/get_output.nf new file mode 100644 index 00000000..e489e26b --- /dev/null +++ b/modules/local/chromhmm/get_output.nf @@ -0,0 +1,23 @@ +process GET_OUTPUT { + + container "quay.io/biocontainers/pandas:1.4.3" + + input: + tuple path(emissions), path(bed) + + output: + path "enhancers_${bed.baseName.split('_')[0]}.bed" + + script: + """ + get_chromhmm_results.py \ + --emissions $emissions \ + --bed $bed \ + --output enhancers_${bed.baseName.split('_')[0]}.bed + """ + + stub: + """ + touch enhancers_${bed.baseName.split('_')[0]}.bed + """ +} \ No newline at end of file diff --git a/modules/local/chromhmm/learn_model.nf b/modules/local/chromhmm/learn_model.nf index 6c113abb..34ac7076 100644 --- a/modules/local/chromhmm/learn_model.nf +++ b/modules/local/chromhmm/learn_model.nf @@ -4,19 +4,33 @@ process LEARN_MODEL { input: path binarized_bams + val states + val organism output: - path "ChromHMM_output" + path "ChromHMM_output/emissions_${states}.txt", emit: emissions + path "ChromHMM_output/*_${states}_dense.bed", emit: beds script: """ //TODO: Check for parameters for the number of states (default 10) and organism java -jar ${projectDir}/assets/ChromHMM.jar LearnModel \ + -p $task.cpus \ $binarized_bams \ ChromHMM_output \ - 10 \ - mm10 + $states \ + $organism """ + + stub: + """ + mkdir ChromHMM_output + touch ChromHMM_output/emissions_${states}.txt + touch ChromHMM_output/L1_${states}_dense.bed + touch ChromHMM_output/L10_${states}_dense.bed + touch ChromHMM_output/P6_${states}_dense.bed + touch ChromHMM_output/P13_${states}_dense.bed + """ } diff --git a/modules/local/chromhmm/make_cellmarkfiletable.nf b/modules/local/chromhmm/make_cellmarkfiletable.nf index 9fe08318..0f2d865f 100644 --- a/modules/local/chromhmm/make_cellmarkfiletable.nf +++ b/modules/local/chromhmm/make_cellmarkfiletable.nf @@ -12,4 +12,9 @@ process MAKE_CELLMARKFILETABLE { """ make_cellmarkfiletable.py --input_dir $bamDirectory --output cellmarkfiletable.txt """ + + stub: + """ + touch cellmarkfiletable.txt + """ } diff --git a/modules/local/chromhmm/reformat_bam.nf b/modules/local/chromhmm/reformat_bam.nf index 567f1abe..a97bcb48 100644 --- a/modules/local/chromhmm/reformat_bam.nf +++ b/modules/local/chromhmm/reformat_bam.nf @@ -12,5 +12,10 @@ process REFORMAT_BAM { """ reformat_bam.sh $bamFileIn ${bamFileIn.baseName}_reformatted.bam """ + + stub: + """ + touch ${bamFileIn.baseName}_reformatted.bam + """ } diff --git a/subworkflows/local/chromhmm.nf b/subworkflows/local/chromhmm.nf index 1cac2b4d..f9db26e3 100644 --- a/subworkflows/local/chromhmm.nf +++ b/subworkflows/local/chromhmm.nf @@ -3,12 +3,16 @@ include { REFORMAT_BAM } from "../../modules/local/chromhmm/reformat_bam" include { SAMTOOLS_INDEX as INDEX_BAM } from "../../modules/nf-core/samtools/index/main" include { BINARIZE_BAMS } from "../../modules/local/chromhmm/binarize_bams" include { LEARN_MODEL } from "../../modules/local/chromhmm/learn_model" +include { GET_OUTPUT } from "../../modules/local/chromhmm/get_output" workflow CHROMHMM { take: raw_bams + chromhmm_states // default to 10 + organism // mm10 + main: ch_bams_raw = Channel.fromPath("${raw_bams}/*/*.bam") @@ -19,7 +23,11 @@ workflow CHROMHMM { INDEX_BAM(REFORMAT_BAM.out) - BINARIZE_BAMS(MAKE_CELLMARKFILETABLE.out, INDEX_BAM.out.bai.collect()) + BINARIZE_BAMS(MAKE_CELLMARKFILETABLE.out, INDEX_BAM.out.bai.collect(), organism) + + LEARN_MODEL(BINARIZE_BAMS.out, chromhmm_states, organism) + + ch_emission_bed = LEARN_MODEL.out.emissions.combine(LEARN_MODEL.out.beds.flatten()) - LEARN_MODEL(BINARIZE_BAMS.out) + GET_OUTPUT(ch_emission_bed) }