Aggregating by strand

epi2me-labs · Jun 27, 2022 · 2260f61 · 2260f61
1 parent cb83af7
commit 2260f61
Show file tree

Hide file tree

Showing 9 changed files with 200 additions and 59 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,8 +4,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [v0.5.4]
+
+## [v0.6.0]
+## Changed
+- Sites with no coverage now report "nan" methylation frequency and score.
 ## Added
+- Option `--aggregate` to pair information from two strands and output additional files.
 - Support for ChEBI codes in C and Python pileup API.
 
 ## [v0.5.3]

diff --git a/Makefile b/Makefile
@@ -1,14 +1,18 @@
 OS := $(shell uname)
 ARCH := $(shell arch)
 
+OS := $(shell uname)
 ifeq ($(OS), Darwin)
-# mainly for dev builds using homebrew things
-ARGP ?= /opt/homebrew/opt/argp-standalone/lib/libargp.a
-ARGP_INCLUDE ?= -I/opt/homebrew/opt/argp-standalone/include
-EXTRA_LDFLAGS ?= -L/opt/homebrew/opt/openssl@3/lib
-CFLAGS ?= -fpic -O3 -std=c99
+    # mainly for dev builds using homebrew things
+    EXTRA_LDFLAGS ?= -L$(shell brew --prefix [email protected])/lib
+    ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a
+    ARGP_INCLUDE ?= -I$(shell brew --prefix argp-standalone)/include
+else
+    ARGP ?=
+    ARGP_INCLUDE ?=
 endif
 
+
 CC ?= gcc
 CFLAGS ?= -fpic -msse3 -O3 -std=c99
 DEFLATE ?= $(PWD)/libdeflate

diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ Alternatively to install from the source code, clone the repository and then use
     ./modbam2bed
 
 See the Makefile for more information. The code has been tested on MacOS (with
-dependencies from brew) and on Ubuntu 18.04.
+dependencies from brew) and on Ubuntu 18.04 and 20.04.
 
 ### Usage
 
@@ -43,30 +43,37 @@ modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl.
                              order).
   -m, --mod_base=BASE        Modified base of interest, one of: 5mC, 5hmC, 5fC,
                              5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao.
+  -p, --prefix=PREFIX        Output file prefix. Only used when multiple output
+                             filters are given.
   -r, --region=chr:start-end Genomic region to process.
   -t, --threads=THREADS      Number of threads for BAM processing.
 
  Base filtering options:
   -a, --canon_threshold=THRESHOLD
                              Bases with mod. probability < THRESHOLD are
-                             counted as canonical.
+                             counted as canonical (default 0.33).
+      --aggregated           Output additional aggregated (across strand)
+                             counts, requires --cpg or --chg.
   -b, --mod_threshold=THRESHOLD   Bases with mod. probability > THRESHOLD are
-                             counted as modified.
+                             counted as modified (default 0.66).
   -c, --cpg                  Output records filtered to CpG sites.
+      --chg                  Output records filtered to CHG sites.
+      --chh                  Output records filtered to CHH sites.
+  -k, --mask                 Respect soft-masking in reference file.
 
  Read filtering options:
+  -d, --max_depth=DEPTH      Max. per-file depth; avoids excessive memory
+                             usage.
   -g, --read_group=RG        Only process reads from given read group.
       --haplotype=VAL        Only process reads from a given haplotype.
                              Equivalent to --tag_name HP --tag_value VAL.
       --tag_name=TN          Only process reads with a given tag (see
                              --tag_value).
-      --tag_value=VAL        Only process reads with a given (integer) tag
-                             value.
+      --tag_value=VAL        Only process reads with a given tag value.
 
   -?, --help                 Give this help list
       --usage                Give a short usage message
   -V, --version              Print program version
-
 ```
 
 ### Method and output format
@@ -120,12 +127,10 @@ with verbatim base counts.
 
 The code has not been developed extensively and currently has some limitations:
 
- * Support for motif filtering is limit to CpG sites. Without this filtering
-   enabled all reference positions that are the canonical base (on forward or
-   reverse strand) equivalent to the modified base under consideration are
-   reported.
- * No option to combine per-strand counts into a total count (how to do this
-   generically depends on motif).
+ * Support for motif filtering is limit to CpG, CHG, and CHH, sites. Without
+   this filtering enabled all reference positions that are the canonical base
+   (on forward or reverse strand) equivalent to the modified base under
+   consideration are reported.
  * Insertion columns are completely ignored for simplicitly (and avoid
    any heuristics).
 
@@ -207,7 +212,7 @@ for testing and comparison to his independently developed code.
 
 **Licence and Copyright**
 
-© 2021 Oxford Nanopore Technologies Ltd.
+© 2021- Oxford Nanopore Technologies Ltd.
 
 `modbam2bed` is distributed under the terms of the Mozilla Public License 2.0.
 

diff --git a/src/args.c b/src/args.c
@@ -53,6 +53,8 @@ static struct argp_option options[] = {
         "Output records filtered to CHH sites.", 2},
     {"chg", 0x500, 0, 0,
         "Output records filtered to CHG sites.", 2},
+    {"aggregate", 0x600, 0, 0,
+        "Output additional aggregated (across strand) counts, requires --cpg or --chg.", 2},
     {"mask", 'k', 0, 0,
         "Respect soft-masking in reference file.", 2},
     {0, 0, 0, 0,
@@ -123,6 +125,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
         case 0x500:
             arguments->chg = true;
             break;
+        case 0x600:
+            arguments->accumulated = true;
+            break;
         case 'k':
             arguments->mask = true;
             break;
@@ -208,6 +213,7 @@ arguments_t parse_arguments(int argc, char** argv) {
     args.chh = false;
     args.chg = false;
     args.mask = false;
+    args.accumulated = false;
     args.extended = false;
     args.threads = 1;
     args.prefix = "mod-counts";

diff --git a/src/args.h b/src/args.h
@@ -18,6 +18,7 @@ typedef struct arguments {
     bool chh;
     bool chg;
     bool extended;
+    bool accumulated;
     int threads;
     int lowthreshold;
     int highthreshold;

diff --git a/src/counts.c b/src/counts.c
@@ -82,14 +82,18 @@ void print_pileup_data(plp_data pileup){
 }
 
 
-output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) {
+output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated) {
     output_files files = xalloc(1, sizeof(_output_files), "output_files");
     // default to stdout for zero or one filters
     files->multi = (int)cpg + chh + chg > 1;
     files->take_all = (int)cpg + chh + chg == 0;
+    files->accumulated = accumulated;
     files->fcpg = stdout;
     files->fchh = stdout;
     files->fchg = stdout;
+    files->fcpg_acc = NULL;
+    files->fchh_acc = NULL;
+    files->fchg_acc = NULL;
     files->cpg = cpg;
     files->chh = chh;
     files->chg = chg;
@@ -110,13 +114,38 @@ output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) {
         }
         free(fname);
     }
+
+    if (files->accumulated) {
+        char* fname_acc = xalloc(strlen(prefix) + 13, sizeof(char), "fname");
+        if (cpg) {
+            strcpy(fname_acc, prefix); strcat(fname_acc, ".cpg.acc.bed");
+            files->fcpg_acc = fopen(fname_acc, "w");
+        }
+        if (chg) {
+            strcpy(fname_acc, prefix); strcat(fname_acc, ".chg.acc.bed");
+            files->fchg_acc = fopen(fname_acc, "w");
+        }
+        free(fname_acc);
+    }
+
+    // store these in an array for later ease
+    // [CpG, CHG]
+    init_output_buffers(files);
+    files->buf_size = _buf_size;
+    files->motif_offsets[0] = 1;
+    files->motif_offsets[1] = 2;
+    files->motif_acc_files[0] = files->fcpg_acc;
+    files->motif_acc_files[1] = files->fchg_acc;
     return files;
 }
 
 void close_bed_files(output_files files) {
    if (files->fcpg != stdout) { fclose(files->fcpg); }
    if (files->fchh != stdout) { fclose(files->fchh); }
    if (files->fchg != stdout) { fclose(files->fchg); }
+   if (files->fcpg_acc != NULL) { fclose(files->fcpg_acc); }
+   if (files->fchh_acc != NULL) { fclose(files->fchh_acc); }
+   if (files->fchg_acc != NULL) { fclose(files->fchg_acc); }
    free(files);
 }
 
@@ -184,6 +213,59 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) {
     return is_chg;
 }
 
+
+void inline print_record(
+        FILE* fout, const char* rname, size_t start, size_t end,
+        char* feature, char orient, size_t depth,
+        bool extended, size_t cd, size_t md, size_t fd) {
+    // https://www.encodeproject.org/data-standards/wgbs/
+    // column 11: "Percentage of reads that show methylation at this position in the genome"
+    //  - Seems to disregard possibility of non-C canonical calls
+    // lets calculate this as proportion of meth:non-meth C
+    size_t tot = cd + md;
+    float meth = tot == 0 ? nanf("") : (100.0f * md) / tot;
+    // column 5: "Score from 0-1000. Capped number of reads"
+    // lets go with proportion of (mod or canon):(mod or canon or filtered)
+    size_t score = depth == 0 ? nanf("") : (1000 * tot) / depth;
+
+    // TODO: don't print when nan?
+    fprintf(fout,
+        "%s\t%zu\t%zu\t"
+        "%s\t%zu\t%c\t"
+        "%zu\t%zu\t0,0,0\t%zu\t%.2f",
+        rname, start, end,
+        feature, score, orient,
+        start, end + 1, depth, meth);
+    if (extended) {
+        fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd);
+    } else {
+        fprintf(fout, "\n");
+    }
+}
+
+
+void init_output_buffers(output_files bed_files) {
+    // information regarding motif offset pairing
+    for (size_t i=0; i < bed_files->buf_size; ++i) {
+        bed_files->out_buffer[i] = (bed_buffer){-1, false, 0, 0, 0, 0};
+    }
+}
+
+void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature) {
+    // flush accumulation buffers
+    if (bed_files->accumulated) {
+        for(size_t ibuf=0; ibuf < bed_files->buf_size; ++ibuf) {
+            bed_buffer buf = bed_files->out_buffer[ibuf];
+            FILE* fout = bed_files->motif_acc_files[ibuf];
+            if (buf.pos != -1 && fout != NULL) {
+                print_record(
+                    fout, chr, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev],
+                    buf.depth, extended, buf.cd, buf.md, buf.fd);
+            }
+        }
+    }
+}
+
 /** Prints a pileup data structure as bedmethyl file
  *
  *  @param pileup a pileup counts structure.
@@ -192,15 +274,14 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) {
  *  @param extended whether to include counts of canonical, modified and filtered bases.
  *  @param feature name to use for feature column of BED (e.g. 5mC).
  *  @param canon_base canonical base to match.
- *  @param cpg filter output to CpG sites.
- *  @param chh filter output to CHH sites.
- *  @param chg filter output to CHG sites.
+ *  @param output_files file handles and output options.
+ *  @param out_buffer state for strand accumulation (modified on output).
  *  @returns void
  *
  */
 void print_bedmethyl(
         plp_data pileup, char *ref, int rstart, bool extended,
-        char* feature, char canon_base, output_files bed_files){
+        char* feature, char canon_base, output_files bed_files) {
     // ecoli1  100718  100719  .       4       +       100718  100719  0,0,0   3       0
 
     // this is a bit naff, we should introspect these indices, or have them
@@ -219,10 +300,6 @@ void print_bedmethyl(
     else if (canon_base == 'G') {cif=6; cir=1; rc_canon_base = 'C';}
     else if (canon_base == 'T') {cif=7; cir=0; rc_canon_base = 'A';}
     else {fprintf(stderr, "ERROR: Unrecognised canonical base: '%c'\n", canon_base); exit(1);}
-    //if (canon_base != 'C' && (cpg || chh || chg)) {
-    //    fprintf(stderr, "ERROR: CpG filtering cannot be used when canonical base is not 'C'.\n");
-    //    exit(1);
-    //}
 
     int rlen = strlen(ref);
 
@@ -262,43 +339,59 @@ void print_bedmethyl(
         for (size_t j = 0; j < numbases; ++j) {
             depth += pileup->matrix[i * featlen + bases[j]];
         }
-        if (depth == 0) continue;
-        // https://www.encodeproject.org/data-standards/wgbs/
-        // column 11: "Percentage of reads that show methylation at this position in the genome"
-        //  - Seems to disregard possibility of non-C canonical calls
-        // lets calculate this as proportion of meth:non-meth C
         size_t cd = pileup->matrix[i * featlen + ci];
         size_t md = pileup->matrix[i * featlen + mi];
         size_t fd = pileup->matrix[i * featlen + fi];
-        size_t tot = cd + md;
-        if (tot == 0) continue;
-        float meth = tot == 0 ? 0 : (100.0f * md) / tot;
-        // column 5: "Score from 0-1000. Capped number of reads"
-        // lets go with proportion of (mod or canon):(mod or canon or filtered)
-        size_t score = depth == 0 ? 0 : (1000 * tot) / depth;
-
-        // default to stdout 
+
+        // choose output for this locus, the motifs are mutually exclusive so
+        // no need to loop
         FILE* fout = stdout;
         if (bed_files->multi) {
             if (is_cpg) { fout = bed_files->fcpg; }
             else if (is_chh) { fout = bed_files->fchh; }
             else if (is_chg) { fout = bed_files->fchg; }
         }
-
-        fprintf(fout,
-            "%s\t%zu\t%zu\t"
-            "%s\t%zu\t%c\t"
-            "%zu\t%zu\t0,0,0\t%zu\t%.2f",
-            pileup->rname, pos, pos + 1,
-            feature, score, "+-"[isrev],
-            pos, pos + 1, depth, meth
-        );
-        if (extended) {
-            fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd);
-        } else {
-            fprintf(fout, "\n");
+        print_record(
+            fout, pileup->rname, pos, pos + 1, feature, "+-"[isrev],
+            depth, extended, cd, md, fd);
+
+        // strand accumulated
+        if (bed_files->accumulated && (is_cpg || is_chg)) {
+            size_t ibuf, motif_offset;
+            bool do_output;
+            if (is_cpg) {
+                ibuf = 0; do_output = bed_files->cpg;
+            } else { // chg
+                ibuf = 1; do_output = bed_files->chh;
+            }
+            motif_offset = bed_files->motif_offsets[ibuf];
+            fout = bed_files->motif_acc_files[ibuf];
+            if (do_output) {
+                assert(fout != NULL);
+                bed_buffer buf = bed_files->out_buffer[ibuf];
+                if (buf.pos == -1) {
+                    bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd};
+                } else if (pos - buf.pos == motif_offset ) { // paired
+                    assert(buf.isrev != isrev); // shouldn't happen, they can't be same
+                    buf.depth += depth;
+                    buf.cd += cd;
+                    buf.md += md;
+                    buf.fd += fd;
+                    print_record(
+                        fout, pileup->rname, buf.pos, buf.pos + motif_offset + 1, feature, '.',
+                        buf.depth, extended, buf.cd, buf.md, buf.fd);
+                    bed_files->out_buffer[ibuf] = (bed_buffer){-1, false, 0, 0, 0, 0};
+                } else { // unrelated
+                    print_record(
+                        fout, pileup->rname, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev],
+                        buf.depth, extended, buf.cd, buf.md, buf.fd);
+                    bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd};
+                }
+            }
         }
-    }
+
+    } // position loop
+
 }