diff --git a/CHANGELOG.md b/CHANGELOG.md index 847022d..6ab8e35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [v0.5.4] + +## [v0.6.0] +## Changed +- Sites with no coverage now report "nan" methylation frequency and score. ## Added +- Option `--aggregate` to pair information from two strands and output additional files. - Support for ChEBI codes in C and Python pileup API. ## [v0.5.3] diff --git a/Makefile b/Makefile index ce9f5f4..0daf293 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,18 @@ OS := $(shell uname) ARCH := $(shell arch) +OS := $(shell uname) ifeq ($(OS), Darwin) -# mainly for dev builds using homebrew things -ARGP ?= /opt/homebrew/opt/argp-standalone/lib/libargp.a -ARGP_INCLUDE ?= -I/opt/homebrew/opt/argp-standalone/include -EXTRA_LDFLAGS ?= -L/opt/homebrew/opt/openssl@3/lib -CFLAGS ?= -fpic -O3 -std=c99 + # mainly for dev builds using homebrew things + EXTRA_LDFLAGS ?= -L$(shell brew --prefix openssl@1.1)/lib + ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a + ARGP_INCLUDE ?= -I$(shell brew --prefix argp-standalone)/include +else + ARGP ?= + ARGP_INCLUDE ?= endif + CC ?= gcc CFLAGS ?= -fpic -msse3 -O3 -std=c99 DEFLATE ?= $(PWD)/libdeflate diff --git a/README.md b/README.md index 547dfcd..1a54fb8 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Alternatively to install from the source code, clone the repository and then use ./modbam2bed See the Makefile for more information. The code has been tested on MacOS (with -dependencies from brew) and on Ubuntu 18.04. +dependencies from brew) and on Ubuntu 18.04 and 20.04. ### Usage @@ -43,30 +43,37 @@ modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl. order). -m, --mod_base=BASE Modified base of interest, one of: 5mC, 5hmC, 5fC, 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. + -p, --prefix=PREFIX Output file prefix. Only used when multiple output + filters are given. -r, --region=chr:start-end Genomic region to process. -t, --threads=THREADS Number of threads for BAM processing. Base filtering options: -a, --canon_threshold=THRESHOLD Bases with mod. probability < THRESHOLD are - counted as canonical. + counted as canonical (default 0.33). + --aggregated Output additional aggregated (across strand) + counts, requires --cpg or --chg. -b, --mod_threshold=THRESHOLD Bases with mod. probability > THRESHOLD are - counted as modified. + counted as modified (default 0.66). -c, --cpg Output records filtered to CpG sites. + --chg Output records filtered to CHG sites. + --chh Output records filtered to CHH sites. + -k, --mask Respect soft-masking in reference file. Read filtering options: + -d, --max_depth=DEPTH Max. per-file depth; avoids excessive memory + usage. -g, --read_group=RG Only process reads from given read group. --haplotype=VAL Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL. --tag_name=TN Only process reads with a given tag (see --tag_value). - --tag_value=VAL Only process reads with a given (integer) tag - value. + --tag_value=VAL Only process reads with a given tag value. -?, --help Give this help list --usage Give a short usage message -V, --version Print program version - ``` ### Method and output format @@ -120,12 +127,10 @@ with verbatim base counts. The code has not been developed extensively and currently has some limitations: - * Support for motif filtering is limit to CpG sites. Without this filtering - enabled all reference positions that are the canonical base (on forward or - reverse strand) equivalent to the modified base under consideration are - reported. - * No option to combine per-strand counts into a total count (how to do this - generically depends on motif). + * Support for motif filtering is limit to CpG, CHG, and CHH, sites. Without + this filtering enabled all reference positions that are the canonical base + (on forward or reverse strand) equivalent to the modified base under + consideration are reported. * Insertion columns are completely ignored for simplicitly (and avoid any heuristics). @@ -207,7 +212,7 @@ for testing and comparison to his independently developed code. **Licence and Copyright** -© 2021 Oxford Nanopore Technologies Ltd. +© 2021- Oxford Nanopore Technologies Ltd. `modbam2bed` is distributed under the terms of the Mozilla Public License 2.0. diff --git a/src/args.c b/src/args.c index f3f5759..1681040 100644 --- a/src/args.c +++ b/src/args.c @@ -53,6 +53,8 @@ static struct argp_option options[] = { "Output records filtered to CHH sites.", 2}, {"chg", 0x500, 0, 0, "Output records filtered to CHG sites.", 2}, + {"aggregate", 0x600, 0, 0, + "Output additional aggregated (across strand) counts, requires --cpg or --chg.", 2}, {"mask", 'k', 0, 0, "Respect soft-masking in reference file.", 2}, {0, 0, 0, 0, @@ -123,6 +125,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) { case 0x500: arguments->chg = true; break; + case 0x600: + arguments->accumulated = true; + break; case 'k': arguments->mask = true; break; @@ -208,6 +213,7 @@ arguments_t parse_arguments(int argc, char** argv) { args.chh = false; args.chg = false; args.mask = false; + args.accumulated = false; args.extended = false; args.threads = 1; args.prefix = "mod-counts"; diff --git a/src/args.h b/src/args.h index f6af27c..ef43cd2 100644 --- a/src/args.h +++ b/src/args.h @@ -18,6 +18,7 @@ typedef struct arguments { bool chh; bool chg; bool extended; + bool accumulated; int threads; int lowthreshold; int highthreshold; diff --git a/src/counts.c b/src/counts.c index 8b037fd..04c5c38 100644 --- a/src/counts.c +++ b/src/counts.c @@ -82,14 +82,18 @@ void print_pileup_data(plp_data pileup){ } -output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) { +output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated) { output_files files = xalloc(1, sizeof(_output_files), "output_files"); // default to stdout for zero or one filters files->multi = (int)cpg + chh + chg > 1; files->take_all = (int)cpg + chh + chg == 0; + files->accumulated = accumulated; files->fcpg = stdout; files->fchh = stdout; files->fchg = stdout; + files->fcpg_acc = NULL; + files->fchh_acc = NULL; + files->fchg_acc = NULL; files->cpg = cpg; files->chh = chh; files->chg = chg; @@ -110,6 +114,28 @@ output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) { } free(fname); } + + if (files->accumulated) { + char* fname_acc = xalloc(strlen(prefix) + 13, sizeof(char), "fname"); + if (cpg) { + strcpy(fname_acc, prefix); strcat(fname_acc, ".cpg.acc.bed"); + files->fcpg_acc = fopen(fname_acc, "w"); + } + if (chg) { + strcpy(fname_acc, prefix); strcat(fname_acc, ".chg.acc.bed"); + files->fchg_acc = fopen(fname_acc, "w"); + } + free(fname_acc); + } + + // store these in an array for later ease + // [CpG, CHG] + init_output_buffers(files); + files->buf_size = _buf_size; + files->motif_offsets[0] = 1; + files->motif_offsets[1] = 2; + files->motif_acc_files[0] = files->fcpg_acc; + files->motif_acc_files[1] = files->fchg_acc; return files; } @@ -117,6 +143,9 @@ void close_bed_files(output_files files) { if (files->fcpg != stdout) { fclose(files->fcpg); } if (files->fchh != stdout) { fclose(files->fchh); } if (files->fchg != stdout) { fclose(files->fchg); } + if (files->fcpg_acc != NULL) { fclose(files->fcpg_acc); } + if (files->fchh_acc != NULL) { fclose(files->fchh_acc); } + if (files->fchg_acc != NULL) { fclose(files->fchg_acc); } free(files); } @@ -184,6 +213,59 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) { return is_chg; } + +void inline print_record( + FILE* fout, const char* rname, size_t start, size_t end, + char* feature, char orient, size_t depth, + bool extended, size_t cd, size_t md, size_t fd) { + // https://www.encodeproject.org/data-standards/wgbs/ + // column 11: "Percentage of reads that show methylation at this position in the genome" + // - Seems to disregard possibility of non-C canonical calls + // lets calculate this as proportion of meth:non-meth C + size_t tot = cd + md; + float meth = tot == 0 ? nanf("") : (100.0f * md) / tot; + // column 5: "Score from 0-1000. Capped number of reads" + // lets go with proportion of (mod or canon):(mod or canon or filtered) + size_t score = depth == 0 ? nanf("") : (1000 * tot) / depth; + + // TODO: don't print when nan? + fprintf(fout, + "%s\t%zu\t%zu\t" + "%s\t%zu\t%c\t" + "%zu\t%zu\t0,0,0\t%zu\t%.2f", + rname, start, end, + feature, score, orient, + start, end + 1, depth, meth); + if (extended) { + fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd); + } else { + fprintf(fout, "\n"); + } +} + + +void init_output_buffers(output_files bed_files) { + // information regarding motif offset pairing + for (size_t i=0; i < bed_files->buf_size; ++i) { + bed_files->out_buffer[i] = (bed_buffer){-1, false, 0, 0, 0, 0}; + } +} + +void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature) { + // flush accumulation buffers + if (bed_files->accumulated) { + for(size_t ibuf=0; ibuf < bed_files->buf_size; ++ibuf) { + bed_buffer buf = bed_files->out_buffer[ibuf]; + FILE* fout = bed_files->motif_acc_files[ibuf]; + if (buf.pos != -1 && fout != NULL) { + print_record( + fout, chr, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], + buf.depth, extended, buf.cd, buf.md, buf.fd); + } + } + } +} + /** Prints a pileup data structure as bedmethyl file * * @param pileup a pileup counts structure. @@ -192,15 +274,14 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) { * @param extended whether to include counts of canonical, modified and filtered bases. * @param feature name to use for feature column of BED (e.g. 5mC). * @param canon_base canonical base to match. - * @param cpg filter output to CpG sites. - * @param chh filter output to CHH sites. - * @param chg filter output to CHG sites. + * @param output_files file handles and output options. + * @param out_buffer state for strand accumulation (modified on output). * @returns void * */ void print_bedmethyl( plp_data pileup, char *ref, int rstart, bool extended, - char* feature, char canon_base, output_files bed_files){ + char* feature, char canon_base, output_files bed_files) { // ecoli1 100718 100719 . 4 + 100718 100719 0,0,0 3 0 // this is a bit naff, we should introspect these indices, or have them @@ -219,10 +300,6 @@ void print_bedmethyl( else if (canon_base == 'G') {cif=6; cir=1; rc_canon_base = 'C';} else if (canon_base == 'T') {cif=7; cir=0; rc_canon_base = 'A';} else {fprintf(stderr, "ERROR: Unrecognised canonical base: '%c'\n", canon_base); exit(1);} - //if (canon_base != 'C' && (cpg || chh || chg)) { - // fprintf(stderr, "ERROR: CpG filtering cannot be used when canonical base is not 'C'.\n"); - // exit(1); - //} int rlen = strlen(ref); @@ -262,43 +339,59 @@ void print_bedmethyl( for (size_t j = 0; j < numbases; ++j) { depth += pileup->matrix[i * featlen + bases[j]]; } - if (depth == 0) continue; - // https://www.encodeproject.org/data-standards/wgbs/ - // column 11: "Percentage of reads that show methylation at this position in the genome" - // - Seems to disregard possibility of non-C canonical calls - // lets calculate this as proportion of meth:non-meth C size_t cd = pileup->matrix[i * featlen + ci]; size_t md = pileup->matrix[i * featlen + mi]; size_t fd = pileup->matrix[i * featlen + fi]; - size_t tot = cd + md; - if (tot == 0) continue; - float meth = tot == 0 ? 0 : (100.0f * md) / tot; - // column 5: "Score from 0-1000. Capped number of reads" - // lets go with proportion of (mod or canon):(mod or canon or filtered) - size_t score = depth == 0 ? 0 : (1000 * tot) / depth; - - // default to stdout + + // choose output for this locus, the motifs are mutually exclusive so + // no need to loop FILE* fout = stdout; if (bed_files->multi) { if (is_cpg) { fout = bed_files->fcpg; } else if (is_chh) { fout = bed_files->fchh; } else if (is_chg) { fout = bed_files->fchg; } } - - fprintf(fout, - "%s\t%zu\t%zu\t" - "%s\t%zu\t%c\t" - "%zu\t%zu\t0,0,0\t%zu\t%.2f", - pileup->rname, pos, pos + 1, - feature, score, "+-"[isrev], - pos, pos + 1, depth, meth - ); - if (extended) { - fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd); - } else { - fprintf(fout, "\n"); + print_record( + fout, pileup->rname, pos, pos + 1, feature, "+-"[isrev], + depth, extended, cd, md, fd); + + // strand accumulated + if (bed_files->accumulated && (is_cpg || is_chg)) { + size_t ibuf, motif_offset; + bool do_output; + if (is_cpg) { + ibuf = 0; do_output = bed_files->cpg; + } else { // chg + ibuf = 1; do_output = bed_files->chh; + } + motif_offset = bed_files->motif_offsets[ibuf]; + fout = bed_files->motif_acc_files[ibuf]; + if (do_output) { + assert(fout != NULL); + bed_buffer buf = bed_files->out_buffer[ibuf]; + if (buf.pos == -1) { + bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd}; + } else if (pos - buf.pos == motif_offset ) { // paired + assert(buf.isrev != isrev); // shouldn't happen, they can't be same + buf.depth += depth; + buf.cd += cd; + buf.md += md; + buf.fd += fd; + print_record( + fout, pileup->rname, buf.pos, buf.pos + motif_offset + 1, feature, '.', + buf.depth, extended, buf.cd, buf.md, buf.fd); + bed_files->out_buffer[ibuf] = (bed_buffer){-1, false, 0, 0, 0, 0}; + } else { // unrelated + print_record( + fout, pileup->rname, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], + buf.depth, extended, buf.cd, buf.md, buf.fd); + bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd}; + } + } } - } + + } // position loop + } diff --git a/src/counts.h b/src/counts.h index 5a65481..06a5c5d 100644 --- a/src/counts.h +++ b/src/counts.h @@ -4,7 +4,6 @@ #include #include - static const int _INT_MAX = INT_MAX; // medaka-style feature data @@ -17,21 +16,42 @@ typedef struct _plp_data { } _plp_data; typedef _plp_data *plp_data; +typedef struct bed_buffer { + int pos; + bool isrev; + size_t depth, cd, md, fd; +} bed_buffer; + // files open for writing outputs +// this buf_size is silly, but its to work around CFFI sillyness +static const size_t _buf_size = 2; typedef struct _output_files { bool multi; bool take_all; + bool accumulated; bool cpg; bool chh; bool chg; FILE *fcpg; FILE *fchh; FILE *fchg; + FILE *fcpg_acc; + FILE *fchh_acc; + FILE *fchg_acc; + size_t buf_size; + bed_buffer out_buffer[2]; + size_t motif_offsets[2]; + FILE* motif_acc_files[2]; } _output_files; typedef _output_files *output_files; -output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg); + +output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated); void close_bed_files(output_files); +// reset state of buffers (to handle loci split by thread blocks) +void init_output_buffers(output_files bed_files); +void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature); + // Check sequences for motifs // CpG diff --git a/src/modbam2bed.c b/src/modbam2bed.c index 8d07d72..84a4de8 100644 --- a/src/modbam2bed.c +++ b/src/modbam2bed.c @@ -60,7 +60,10 @@ void process_region(arguments_t args, const char *chr, int start, int end, char args.lowthreshold, args.highthreshold, args.mod_base.code, args.hts_maxcnt); if (pileup == NULL) return; + + init_output_buffers(bed_files); print_bedmethyl(pileup, ref, 0, args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); + flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); destroy_plp_data(pileup); } #else @@ -72,6 +75,7 @@ void process_region(arguments_t args, const char *chr, int start, int end, char hts_tpool_result *r; const int width = 1000000; + init_output_buffers(bed_files); int nregs = 1 + (end - start) / width; float done = 0; for (int rstart = start; rstart < end; rstart += width) { twarg *tw_args = xalloc(1, sizeof(*tw_args), "thread worker args"); // freed in worker @@ -109,6 +113,10 @@ void process_region(arguments_t args, const char *chr, int start, int end, char } hts_tpool_delete_result(r, 0); } + + // finalise any remaining singleton strands + flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); + fprintf(stderr, "\r100 %% "); fprintf(stderr, "\n"); // clean up pool @@ -149,7 +157,7 @@ int main(int argc, char *argv[]) { // open output files, sort out filter options output_files bed_files = open_bed_files( - args.prefix, args.cpg, args.chh, args.chg); + args.prefix, args.cpg, args.chh, args.chg, args.accumulated); // load ref sequence faidx_t *fai = fai_load(args.ref); diff --git a/src/version.h b/src/version.h index 894612f..a58e55e 100644 --- a/src/version.h +++ b/src/version.h @@ -1,2 +1,2 @@ // remember to bump version in modbampy/__init__.py too -const char *argp_program_version = "0.5.4"; +const char *argp_program_version = "0.6.0";