Skip to content

Commit

Permalink
Aggregating by strand
Browse files Browse the repository at this point in the history
  • Loading branch information
cjw85 committed Jun 27, 2022
1 parent cb83af7 commit 2260f61
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 59 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.5.4]

## [v0.6.0]
## Changed
- Sites with no coverage now report "nan" methylation frequency and score.
## Added
- Option `--aggregate` to pair information from two strands and output additional files.
- Support for ChEBI codes in C and Python pileup API.

## [v0.5.3]
Expand Down
14 changes: 9 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
OS := $(shell uname)
ARCH := $(shell arch)

OS := $(shell uname)
ifeq ($(OS), Darwin)
# mainly for dev builds using homebrew things
ARGP ?= /opt/homebrew/opt/argp-standalone/lib/libargp.a
ARGP_INCLUDE ?= -I/opt/homebrew/opt/argp-standalone/include
EXTRA_LDFLAGS ?= -L/opt/homebrew/opt/openssl@3/lib
CFLAGS ?= -fpic -O3 -std=c99
# mainly for dev builds using homebrew things
EXTRA_LDFLAGS ?= -L$(shell brew --prefix [email protected])/lib
ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a
ARGP_INCLUDE ?= -I$(shell brew --prefix argp-standalone)/include
else
ARGP ?=
ARGP_INCLUDE ?=
endif


CC ?= gcc
CFLAGS ?= -fpic -msse3 -O3 -std=c99
DEFLATE ?= $(PWD)/libdeflate
Expand Down
31 changes: 18 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Alternatively to install from the source code, clone the repository and then use
./modbam2bed

See the Makefile for more information. The code has been tested on MacOS (with
dependencies from brew) and on Ubuntu 18.04.
dependencies from brew) and on Ubuntu 18.04 and 20.04.

### Usage

Expand All @@ -43,30 +43,37 @@ modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl.
order).
-m, --mod_base=BASE Modified base of interest, one of: 5mC, 5hmC, 5fC,
5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao.
-p, --prefix=PREFIX Output file prefix. Only used when multiple output
filters are given.
-r, --region=chr:start-end Genomic region to process.
-t, --threads=THREADS Number of threads for BAM processing.
Base filtering options:
-a, --canon_threshold=THRESHOLD
Bases with mod. probability < THRESHOLD are
counted as canonical.
counted as canonical (default 0.33).
--aggregated Output additional aggregated (across strand)
counts, requires --cpg or --chg.
-b, --mod_threshold=THRESHOLD Bases with mod. probability > THRESHOLD are
counted as modified.
counted as modified (default 0.66).
-c, --cpg Output records filtered to CpG sites.
--chg Output records filtered to CHG sites.
--chh Output records filtered to CHH sites.
-k, --mask Respect soft-masking in reference file.
Read filtering options:
-d, --max_depth=DEPTH Max. per-file depth; avoids excessive memory
usage.
-g, --read_group=RG Only process reads from given read group.
--haplotype=VAL Only process reads from a given haplotype.
Equivalent to --tag_name HP --tag_value VAL.
--tag_name=TN Only process reads with a given tag (see
--tag_value).
--tag_value=VAL Only process reads with a given (integer) tag
value.
--tag_value=VAL Only process reads with a given tag value.
-?, --help Give this help list
--usage Give a short usage message
-V, --version Print program version
```

### Method and output format
Expand Down Expand Up @@ -120,12 +127,10 @@ with verbatim base counts.

The code has not been developed extensively and currently has some limitations:

* Support for motif filtering is limit to CpG sites. Without this filtering
enabled all reference positions that are the canonical base (on forward or
reverse strand) equivalent to the modified base under consideration are
reported.
* No option to combine per-strand counts into a total count (how to do this
generically depends on motif).
* Support for motif filtering is limit to CpG, CHG, and CHH, sites. Without
this filtering enabled all reference positions that are the canonical base
(on forward or reverse strand) equivalent to the modified base under
consideration are reported.
* Insertion columns are completely ignored for simplicitly (and avoid
any heuristics).

Expand Down Expand Up @@ -207,7 +212,7 @@ for testing and comparison to his independently developed code.

**Licence and Copyright**

© 2021 Oxford Nanopore Technologies Ltd.
© 2021- Oxford Nanopore Technologies Ltd.

`modbam2bed` is distributed under the terms of the Mozilla Public License 2.0.

Expand Down
6 changes: 6 additions & 0 deletions src/args.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ static struct argp_option options[] = {
"Output records filtered to CHH sites.", 2},
{"chg", 0x500, 0, 0,
"Output records filtered to CHG sites.", 2},
{"aggregate", 0x600, 0, 0,
"Output additional aggregated (across strand) counts, requires --cpg or --chg.", 2},
{"mask", 'k', 0, 0,
"Respect soft-masking in reference file.", 2},
{0, 0, 0, 0,
Expand Down Expand Up @@ -123,6 +125,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
case 0x500:
arguments->chg = true;
break;
case 0x600:
arguments->accumulated = true;
break;
case 'k':
arguments->mask = true;
break;
Expand Down Expand Up @@ -208,6 +213,7 @@ arguments_t parse_arguments(int argc, char** argv) {
args.chh = false;
args.chg = false;
args.mask = false;
args.accumulated = false;
args.extended = false;
args.threads = 1;
args.prefix = "mod-counts";
Expand Down
1 change: 1 addition & 0 deletions src/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ typedef struct arguments {
bool chh;
bool chg;
bool extended;
bool accumulated;
int threads;
int lowthreshold;
int highthreshold;
Expand Down
165 changes: 129 additions & 36 deletions src/counts.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,18 @@ void print_pileup_data(plp_data pileup){
}


output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) {
output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated) {
output_files files = xalloc(1, sizeof(_output_files), "output_files");
// default to stdout for zero or one filters
files->multi = (int)cpg + chh + chg > 1;
files->take_all = (int)cpg + chh + chg == 0;
files->accumulated = accumulated;
files->fcpg = stdout;
files->fchh = stdout;
files->fchg = stdout;
files->fcpg_acc = NULL;
files->fchh_acc = NULL;
files->fchg_acc = NULL;
files->cpg = cpg;
files->chh = chh;
files->chg = chg;
Expand All @@ -110,13 +114,38 @@ output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg) {
}
free(fname);
}

if (files->accumulated) {
char* fname_acc = xalloc(strlen(prefix) + 13, sizeof(char), "fname");
if (cpg) {
strcpy(fname_acc, prefix); strcat(fname_acc, ".cpg.acc.bed");
files->fcpg_acc = fopen(fname_acc, "w");
}
if (chg) {
strcpy(fname_acc, prefix); strcat(fname_acc, ".chg.acc.bed");
files->fchg_acc = fopen(fname_acc, "w");
}
free(fname_acc);
}

// store these in an array for later ease
// [CpG, CHG]
init_output_buffers(files);
files->buf_size = _buf_size;
files->motif_offsets[0] = 1;
files->motif_offsets[1] = 2;
files->motif_acc_files[0] = files->fcpg_acc;
files->motif_acc_files[1] = files->fchg_acc;
return files;
}

void close_bed_files(output_files files) {
if (files->fcpg != stdout) { fclose(files->fcpg); }
if (files->fchh != stdout) { fclose(files->fchh); }
if (files->fchg != stdout) { fclose(files->fchg); }
if (files->fcpg_acc != NULL) { fclose(files->fcpg_acc); }
if (files->fchh_acc != NULL) { fclose(files->fchh_acc); }
if (files->fchg_acc != NULL) { fclose(files->fchg_acc); }
free(files);
}

Expand Down Expand Up @@ -184,6 +213,59 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) {
return is_chg;
}


void inline print_record(
FILE* fout, const char* rname, size_t start, size_t end,
char* feature, char orient, size_t depth,
bool extended, size_t cd, size_t md, size_t fd) {
// https://www.encodeproject.org/data-standards/wgbs/
// column 11: "Percentage of reads that show methylation at this position in the genome"
// - Seems to disregard possibility of non-C canonical calls
// lets calculate this as proportion of meth:non-meth C
size_t tot = cd + md;
float meth = tot == 0 ? nanf("") : (100.0f * md) / tot;
// column 5: "Score from 0-1000. Capped number of reads"
// lets go with proportion of (mod or canon):(mod or canon or filtered)
size_t score = depth == 0 ? nanf("") : (1000 * tot) / depth;

// TODO: don't print when nan?
fprintf(fout,
"%s\t%zu\t%zu\t"
"%s\t%zu\t%c\t"
"%zu\t%zu\t0,0,0\t%zu\t%.2f",
rname, start, end,
feature, score, orient,
start, end + 1, depth, meth);
if (extended) {
fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd);
} else {
fprintf(fout, "\n");
}
}


void init_output_buffers(output_files bed_files) {
// information regarding motif offset pairing
for (size_t i=0; i < bed_files->buf_size; ++i) {
bed_files->out_buffer[i] = (bed_buffer){-1, false, 0, 0, 0, 0};
}
}

void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature) {
// flush accumulation buffers
if (bed_files->accumulated) {
for(size_t ibuf=0; ibuf < bed_files->buf_size; ++ibuf) {
bed_buffer buf = bed_files->out_buffer[ibuf];
FILE* fout = bed_files->motif_acc_files[ibuf];
if (buf.pos != -1 && fout != NULL) {
print_record(
fout, chr, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev],
buf.depth, extended, buf.cd, buf.md, buf.fd);
}
}
}
}

/** Prints a pileup data structure as bedmethyl file
*
* @param pileup a pileup counts structure.
Expand All @@ -192,15 +274,14 @@ bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) {
* @param extended whether to include counts of canonical, modified and filtered bases.
* @param feature name to use for feature column of BED (e.g. 5mC).
* @param canon_base canonical base to match.
* @param cpg filter output to CpG sites.
* @param chh filter output to CHH sites.
* @param chg filter output to CHG sites.
* @param output_files file handles and output options.
* @param out_buffer state for strand accumulation (modified on output).
* @returns void
*
*/
void print_bedmethyl(
plp_data pileup, char *ref, int rstart, bool extended,
char* feature, char canon_base, output_files bed_files){
char* feature, char canon_base, output_files bed_files) {
// ecoli1 100718 100719 . 4 + 100718 100719 0,0,0 3 0

// this is a bit naff, we should introspect these indices, or have them
Expand All @@ -219,10 +300,6 @@ void print_bedmethyl(
else if (canon_base == 'G') {cif=6; cir=1; rc_canon_base = 'C';}
else if (canon_base == 'T') {cif=7; cir=0; rc_canon_base = 'A';}
else {fprintf(stderr, "ERROR: Unrecognised canonical base: '%c'\n", canon_base); exit(1);}
//if (canon_base != 'C' && (cpg || chh || chg)) {
// fprintf(stderr, "ERROR: CpG filtering cannot be used when canonical base is not 'C'.\n");
// exit(1);
//}

int rlen = strlen(ref);

Expand Down Expand Up @@ -262,43 +339,59 @@ void print_bedmethyl(
for (size_t j = 0; j < numbases; ++j) {
depth += pileup->matrix[i * featlen + bases[j]];
}
if (depth == 0) continue;
// https://www.encodeproject.org/data-standards/wgbs/
// column 11: "Percentage of reads that show methylation at this position in the genome"
// - Seems to disregard possibility of non-C canonical calls
// lets calculate this as proportion of meth:non-meth C
size_t cd = pileup->matrix[i * featlen + ci];
size_t md = pileup->matrix[i * featlen + mi];
size_t fd = pileup->matrix[i * featlen + fi];
size_t tot = cd + md;
if (tot == 0) continue;
float meth = tot == 0 ? 0 : (100.0f * md) / tot;
// column 5: "Score from 0-1000. Capped number of reads"
// lets go with proportion of (mod or canon):(mod or canon or filtered)
size_t score = depth == 0 ? 0 : (1000 * tot) / depth;

// default to stdout

// choose output for this locus, the motifs are mutually exclusive so
// no need to loop
FILE* fout = stdout;
if (bed_files->multi) {
if (is_cpg) { fout = bed_files->fcpg; }
else if (is_chh) { fout = bed_files->fchh; }
else if (is_chg) { fout = bed_files->fchg; }
}

fprintf(fout,
"%s\t%zu\t%zu\t"
"%s\t%zu\t%c\t"
"%zu\t%zu\t0,0,0\t%zu\t%.2f",
pileup->rname, pos, pos + 1,
feature, score, "+-"[isrev],
pos, pos + 1, depth, meth
);
if (extended) {
fprintf(fout, "\t%zu\t%zu\t%zu\n", cd, md, fd);
} else {
fprintf(fout, "\n");
print_record(
fout, pileup->rname, pos, pos + 1, feature, "+-"[isrev],
depth, extended, cd, md, fd);

// strand accumulated
if (bed_files->accumulated && (is_cpg || is_chg)) {
size_t ibuf, motif_offset;
bool do_output;
if (is_cpg) {
ibuf = 0; do_output = bed_files->cpg;
} else { // chg
ibuf = 1; do_output = bed_files->chh;
}
motif_offset = bed_files->motif_offsets[ibuf];
fout = bed_files->motif_acc_files[ibuf];
if (do_output) {
assert(fout != NULL);
bed_buffer buf = bed_files->out_buffer[ibuf];
if (buf.pos == -1) {
bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd};
} else if (pos - buf.pos == motif_offset ) { // paired
assert(buf.isrev != isrev); // shouldn't happen, they can't be same
buf.depth += depth;
buf.cd += cd;
buf.md += md;
buf.fd += fd;
print_record(
fout, pileup->rname, buf.pos, buf.pos + motif_offset + 1, feature, '.',
buf.depth, extended, buf.cd, buf.md, buf.fd);
bed_files->out_buffer[ibuf] = (bed_buffer){-1, false, 0, 0, 0, 0};
} else { // unrelated
print_record(
fout, pileup->rname, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev],
buf.depth, extended, buf.cd, buf.md, buf.fd);
bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd};
}
}
}
}

} // position loop

}


Expand Down
Loading

0 comments on commit 2260f61

Please sign in to comment.