From 5ebddcf74922e353186319421ee2144c0e4e7fa0 Mon Sep 17 00:00:00 2001 From: canfirtina Date: Tue, 19 Mar 2024 19:26:15 +0100 Subject: [PATCH] New quantization, some fixes in evaluation scripts --- README.md | 2 + src/Makefile | 2 +- src/main.cpp | 186 ++++++------ src/revent.c | 259 +++++++++++----- src/revent.h | 13 +- src/rindex.c | 217 ++++++++------ src/rindex.h | 7 +- src/rmap.cpp | 90 +++--- src/rmap.h | 2 - src/roptions.c | 17 +- src/roptions.h | 9 +- src/rseed.c | 2 +- src/rsig.c | 23 +- src/rsig.h | 10 +- src/rsketch.c | 279 +++++++----------- src/rsketch.h | 21 +- src/rutils.c | 53 +++- src/rutils.h | 4 +- test/README.md | 16 +- test/evaluation/contamination/run_rawhash2.sh | 5 +- .../relative_abundance/run_rawhash.sh | 4 +- .../relative_abundance/run_rawhash2.sh | 4 +- .../relative_abundance/run_sigmap.sh | 4 +- .../relative_abundance/run_uncalled.sh | 4 +- test/scripts/run_rawhash.sh | 2 +- test/scripts/run_sigmap.sh | 2 +- test/scripts/run_uncalled.sh | 2 +- 27 files changed, 696 insertions(+), 543 deletions(-) diff --git a/README.md b/README.md index 1ffb2d6..53a4a87 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ RawHash performs real-time mapping of nanopore raw signals. When the prefix of r # Recent changes +* We came up with a better and more accurate quantization mechanism in RawHash2. The new quantization mechanism dynamically arranges the bucket sizes that each signal value is quantized depending on the normalized distribution of the signal values. **This provides significant improvements in both accuracy and performance.** + * We have integrated the signal alignment functionality with DTW as proposed in RawAlign (see the citation below). The parameters may still not be highly optimized as this is still in experimental stage. Use it with caution. * Offline overlapping functionality is integrated. diff --git a/src/Makefile b/src/Makefile index 57d7a8d..d87f862 100644 --- a/src/Makefile +++ b/src/Makefile @@ -252,4 +252,4 @@ hit.o: rmap.h kalloc.h khash.h rmap.o: rindex.h rsig.h kthread.h rh_kvec.h rutils.h rsketch.h revent.h sequence_until.h dtw.h revent.o: roptions.h kalloc.h rindex.o: roptions.h rutils.h rsketch.h rsig.h bseq.h khash.h rh_kvec.h kthread.h -main:o rawhash.h ketopt.h rutils.h +main:o rawhash.h ketopt.h rutils.h \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 138c008..bb4749e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,7 +6,7 @@ #include "rawhash.h" #include "ketopt.h" -#define RH_VERSION "2.1" +#define RH_VERSION "2.2" #ifdef __linux__ #include @@ -23,57 +23,57 @@ void liftrlimit() {} #endif static ko_longopt_t long_options[] = { - { (char*)"level_column", ko_required_argument, 300 }, - { (char*)"q-mid-occ", ko_required_argument, 301 }, - { (char*)"q-occ-frac", ko_required_argument, 302 }, - { (char*)"min-events", ko_required_argument, 303 }, - { (char*)"bw", ko_required_argument, 304 }, - { (char*)"max-target-gap", ko_required_argument, 305 }, - { (char*)"max-query-gap", ko_required_argument, 306 }, - { (char*)"min-anchors", ko_required_argument, 307 }, - { (char*)"min-score", ko_required_argument, 308 }, - { (char*)"chain-gap-scale", ko_required_argument, 309 }, - { (char*)"chain-skip-scale", ko_required_argument, 310 }, - { (char*)"best-chains", ko_required_argument, 311 }, - { (char*)"primary-ratio", ko_required_argument, 312 }, - { (char*)"primary-length", ko_required_argument, 313 }, - { (char*)"max-skips", ko_required_argument, 314 }, - { (char*)"max-iterations", ko_required_argument, 315 }, - { (char*)"rmq", ko_no_argument, 316 }, - { (char*)"rmq-inner-dist", ko_required_argument, 317 }, - { (char*)"rmq-size-cap", ko_required_argument, 318 }, - { (char*)"bw-long", ko_required_argument, 319 }, - { (char*)"max-chunks", ko_required_argument, 320 }, - { (char*)"min-mapq", ko_required_argument, 321 }, - { (char*)"alt-drop", ko_required_argument, 322 }, - { (char*)"w-besta", ko_required_argument, 323 }, - { (char*)"w-bestma", ko_required_argument, 324 }, - { (char*)"w-bestq", ko_required_argument, 325 }, - { (char*)"w-bestmq", ko_required_argument, 326 }, - { (char*)"w-bestmc", ko_required_argument, 327 }, - { (char*)"w-threshold", ko_required_argument, 328 }, - { (char*)"bp-per-sec", ko_required_argument, 329 }, - { (char*)"sample-rate", ko_required_argument, 330 }, - { (char*)"chunk-size", ko_required_argument, 331 }, - { (char*)"seg-window-length1", ko_required_argument, 332 }, - { (char*)"seg-window-length2", ko_required_argument, 333 }, - { (char*)"seg-threshold1", ko_required_argument, 334 }, - { (char*)"seg-threshold2", ko_required_argument, 335 }, - { (char*)"seg-peak_height", ko_required_argument, 336 }, - { (char*)"sequence-until", ko_no_argument, 337 }, - { (char*)"threshold", ko_required_argument, 338 }, - { (char*)"n-samples", ko_required_argument, 339 }, - { (char*)"test-frequency", ko_required_argument, 340 }, - { (char*)"min-reads", ko_required_argument, 341 }, - { (char*)"mid-occ-frac", ko_required_argument, 342 }, - { (char*)"depletion", ko_no_argument, 343 }, - { (char*)"store-sig", ko_no_argument, 344 }, - { (char*)"sig-target", ko_no_argument, 345 }, - { (char*)"disable-adaptive", ko_no_argument, 346 }, - { (char*)"sig-diff", ko_required_argument, 347 }, - { (char*)"align", ko_no_argument, 348 }, - { (char*)"dtw-evaluate-chains", ko_no_argument, 349 }, - { (char*)"dtw-output-cigar", ko_no_argument, 350 }, + { (char*)"level_column", ko_required_argument, 300 }, + { (char*)"q-mid-occ", ko_required_argument, 301 }, + { (char*)"mid_occ_frac", ko_required_argument, 302 }, + { (char*)"min-events", ko_required_argument, 303 }, + { (char*)"bw", ko_required_argument, 304 }, + { (char*)"max-target-gap", ko_required_argument, 305 }, + { (char*)"max-query-gap", ko_required_argument, 306 }, + { (char*)"min-anchors", ko_required_argument, 307 }, + { (char*)"min-score", ko_required_argument, 308 }, + { (char*)"chain-gap-scale", ko_required_argument, 309 }, + { (char*)"chain-skip-scale", ko_required_argument, 310 }, + { (char*)"best-chains", ko_required_argument, 311 }, + { (char*)"primary-ratio", ko_required_argument, 312 }, + { (char*)"primary-length", ko_required_argument, 313 }, + { (char*)"max-skips", ko_required_argument, 314 }, + { (char*)"max-iterations", ko_required_argument, 315 }, + { (char*)"rmq", ko_no_argument, 316 }, + { (char*)"rmq-inner-dist", ko_required_argument, 317 }, + { (char*)"rmq-size-cap", ko_required_argument, 318 }, + { (char*)"bw-long", ko_required_argument, 319 }, + { (char*)"max-chunks", ko_required_argument, 320 }, + { (char*)"min-mapq", ko_required_argument, 321 }, + { (char*)"alt-drop", ko_required_argument, 322 }, + { (char*)"w-besta", ko_required_argument, 323 }, + { (char*)"w-bestma", ko_required_argument, 324 }, + { (char*)"w-bestq", ko_required_argument, 325 }, + { (char*)"w-bestmq", ko_required_argument, 326 }, + { (char*)"w-bestmc", ko_required_argument, 327 }, + { (char*)"w-threshold", ko_required_argument, 328 }, + { (char*)"bp-per-sec", ko_required_argument, 329 }, + { (char*)"sample-rate", ko_required_argument, 330 }, + { (char*)"chunk-size", ko_required_argument, 331 }, + { (char*)"seg-window-length1", ko_required_argument, 332 }, + { (char*)"seg-window-length2", ko_required_argument, 333 }, + { (char*)"seg-threshold1", ko_required_argument, 334 }, + { (char*)"seg-threshold2", ko_required_argument, 335 }, + { (char*)"seg-peak_height", ko_required_argument, 336 }, + { (char*)"sequence-until", ko_no_argument, 337 }, + { (char*)"threshold", ko_required_argument, 338 }, + { (char*)"n-samples", ko_required_argument, 339 }, + { (char*)"test-frequency", ko_required_argument, 340 }, + { (char*)"min-reads", ko_required_argument, 341 }, + { (char*)"occ-frac", ko_required_argument, 342 }, + { (char*)"depletion", ko_no_argument, 343 }, + { (char*)"store-sig", ko_no_argument, 344 }, + { (char*)"sig-target", ko_no_argument, 345 }, + { (char*)"disable-adaptive", ko_no_argument, 346 }, + { (char*)"sig-diff", ko_required_argument, 347 }, + { (char*)"align", ko_no_argument, 348 }, + { (char*)"dtw-evaluate-chains", ko_no_argument, 349 }, + { (char*)"dtw-output-cigar", ko_no_argument, 350 }, { (char*)"dtw-border-constraint", ko_required_argument, 351 }, { (char*)"dtw-log-scores", ko_no_argument, 352 }, { (char*)"no-chainingscore-filtering", ko_no_argument, 353 }, @@ -83,8 +83,13 @@ static ko_longopt_t long_options[] = { { (char*)"dtw-min-score", ko_required_argument, 357 }, { (char*)"log-anchors", ko_no_argument, 358 }, { (char*)"log-num-anchors", ko_no_argument, 359 }, - { (char*)"ava", ko_no_argument, 360 }, - { (char*)"version", ko_no_argument, 361 }, + { (char*)"ava", ko_no_argument, 360 }, + // { (char*)"top-n-mean", ko_required_argument, 361 }, + { (char*)"rev-query", ko_no_argument, 362 }, + { (char*)"fine-min", ko_required_argument, 364 }, + { (char*)"fine-max", ko_required_argument, 365 }, + { (char*)"fine-range", ko_required_argument, 366 }, + { (char*)"version", ko_no_argument, 367 }, { 0, 0, 0 } }; @@ -118,9 +123,9 @@ int ri_set_opt(const char *preset, ri_idxopt_t *io, ri_mapopt_t *mo) ri_idxopt_init(io); ri_mapopt_init(mo); } else if (strcmp(preset, "sensitive") == 0) { - io->e = 6; io->q = 9; io->lq = 3; io->w = 0; io->n = 0; + io->e = 8; io->w = 0; io->n = 0; } else if (strcmp(preset, "r10sensitive") == 0) { - io->k = 9; io->e = 8; io->q = 8; io->lq = 2; io->w = 0; io->n = 0; + io->k = 9; io->e = 8; io->w = 0; io->n = 0; mo->window_length1 = 5; mo->window_length2 = 12; mo->threshold1 = 4.20265f; mo->threshold2 = 3.67058f; mo->peak_height = 0.2f; @@ -133,24 +138,21 @@ int ri_set_opt(const char *preset, ri_idxopt_t *io, ri_mapopt_t *mo) io->bp_per_sec = 400; io->sample_per_base = (float)io->sample_rate / io->bp_per_sec; - // mo->w_best2q = 0.0f; mo->w_best2c = 0.1f; mo->w_bestmq = 0.4f; mo->w_bestmc = 0.5f; mo->w_threshold = 0.7f; - - mo->max_num_chunk = 15, mo->min_mapq = 2, mo->min_num_anchors = 2, mo->min_chaining_score = 15, mo->chain_gap_scale = 1.2f, mo->chain_skip_scale = 0.0f; + mo->max_num_chunk = 10, mo->min_mapq = 2, mo->min_num_anchors = 2, mo->min_chaining_score = 15, mo->chain_gap_scale = 1.2f, mo->chain_skip_scale = 0.0f; } else if (strcmp(preset, "fast") == 0) { - io->e = 8; io->q = 9; io->lq = 3; io->w = 0; io->n = 0; mo->mini_batch_size = 750000000; - - mo->max_num_chunk = 15, mo->min_mapq = 5, mo->min_num_anchors = 2, mo->min_chaining_score = 10, mo->chain_gap_scale = 0.6f, mo->chain_skip_scale = 0.0f; + io->e = 11; io->w = 0; io->n = 0; + io->fine_range = 0.6; + mo->max_num_chunk = 10, mo->min_mapq = 5, mo->min_num_anchors = 2, mo->min_chaining_score = 10, mo->chain_gap_scale = 0.6f, mo->chain_skip_scale = 0.0f; } else if (strcmp(preset, "faster") == 0) { - io->e = 8; io->q = 9; io->lq = 3; io->w = 3; io->n = 0; mo->mini_batch_size = 1000000000; + io->e = 11; io->w = 3; io->n = 0; + io->fine_range = 0.6; + mo->max_num_chunk = 5; mo->min_mapq = 5, mo->min_num_anchors = 2, mo->min_chaining_score = 10, mo->chain_gap_scale = 0.6f, mo->chain_skip_scale = 0.0f; } else if (strcmp(preset, "viral") == 0) { - io->e = 5; io->q = 9; io->lq = 3; io->w = 0; io->n = 0; + io->e = 6; io->w = 0; io->n = 0; mo->bw = 100; mo->max_target_gap_length = 500; mo->max_query_gap_length = 500; - - // mo->w_best2q = 0.1f; mo->w_best2c = 0.7f; mo->w_bestmq = 0.0f; mo->w_bestmc = 0.2f; mo->w_threshold = 0.3f; - mo->max_num_chunk = 5, mo->min_mapq = 2, mo->min_num_anchors = 2, mo->min_chaining_score = 10; mo->chain_gap_scale = 1.2f; mo->chain_skip_scale = 0.3f; } else if (strcmp(preset, "sequence-until") == 0) { - io->e = 7; io->q = 9; io->lq = 3; io->w = 0; io->n = 0; mo->mini_batch_size = 750000000; + io->e = 8; io->w = 0; io->n = 0; } else return -1; return 0; } @@ -185,7 +187,7 @@ int ri_mapopt_parse_dtw_fill_method(ri_mapopt_t *opt, char* arg) { return 0; } -char* ri_maptopt_dtw_mode_to_string(uint32_t dtw_border_constraint){ +const char* ri_maptopt_dtw_mode_to_string(uint32_t dtw_border_constraint){ switch(dtw_border_constraint){ case RI_M_DTW_BORDER_CONSTRAINT_GLOBAL: return "full"; @@ -200,7 +202,7 @@ char* ri_maptopt_dtw_mode_to_string(uint32_t dtw_border_constraint){ int main(int argc, char *argv[]) { - const char *opt_str = "k:d:p:e:q:l:w:n:o:t:K:x:"; + const char *opt_str = "k:d:p:e:q:w:n:o:t:K:x:"; ketopt_t o = KETOPT_INIT; ri_mapopt_t opt; ri_idxopt_t ipt; @@ -239,7 +241,6 @@ int main(int argc, char *argv[]) else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'e') ipt.e = atoi(o.arg); else if (c == 'q') ipt.q = atoi(o.arg); - else if (c == 'l') ipt.lq = atoi(o.arg); else if (c == 'w') ipt.w = atoi(o.arg); else if (c == 'n') ipt.n = atoi(o.arg); else if (c == 't') n_threads = atoi(o.arg); @@ -260,7 +261,7 @@ int main(int argc, char *argv[]) if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); //max // opt.q_mid_occ = atoi(o.arg);// --q-mid-occ } - else if (c == 302) opt.q_occ_frac = atof(o.arg);// --q-occ-frac + else if (c == 302) opt.mid_occ_frac = atof(o.arg);// --occ-frac else if (c == 303) opt.min_events = atoi(o.arg); // --min-events else if (c == 304) opt.bw = atoi(o.arg);// --bw else if (c == 305) opt.max_target_gap_length = atoi(o.arg);// --max-target-gap @@ -306,11 +307,11 @@ int main(int argc, char *argv[]) else if (c == 339) opt.tn_samples = atoi(o.arg);// --n-samples else if (c == 340) opt.ttest_freq = atoi(o.arg);// --test-frequency else if (c == 341) opt.tmin_reads = atoi(o.arg);// --min-reads - else if (c == 342) opt.mid_occ_frac = atof(o.arg);// --mid-occ-frac + else if (c == 342) opt.mid_occ_frac = atof(o.arg);// --occ-frac else if (c == 343) {opt.best_n = 10;}// --depletion else if (c == 344) {ipt.flag |= RI_I_STORE_SIG;} // --store-sig else if (c == 345) {ipt.flag |= RI_I_SIG_TARGET;} // --sig-target - else if (c == 346) {ipt.flag |= RI_M_NO_ADAPTIVE;} // --disable-adaptive + else if (c == 346) {opt.flag |= RI_M_NO_ADAPTIVE;} // --disable-adaptive else if (c == 347) {ipt.diff = atof(o.arg);} // --sig-diff else if (c == 348) {opt.flag |= RI_M_ALIGN;} // --align else if (c == 349) opt.flag |= RI_M_DTW_EVALUATE_CHAINS; // --dtw-evaluate-chains @@ -334,13 +335,18 @@ int main(int argc, char *argv[]) else if (c == 357) opt.dtw_min_score = atof(o.arg); // --dtw-min-score else if (c == 358) opt.flag |= RI_M_LOG_ANCHORS; // --log-anchors else if (c == 359) opt.flag |= RI_M_LOG_NUM_ANCHORS; // --log-num-anchors - else if (c == 360) {ipt.flag |= RI_I_SIG_TARGET; opt.flag |= RI_M_ALL_CHAINS; opt.min_chaining_score = 15; opt.chunk_size = 10000000;}// --ava - else if (c == 361) {puts(RH_VERSION); return 0;}// --version + else if (c == 360) {ipt.flag |= RI_I_SIG_TARGET; opt.flag |= RI_M_ALL_CHAINS; opt.min_chaining_score = 15; opt.flag |= RI_M_NO_ADAPTIVE;;}// --ava + // else if (c == 361) {opt.top_n_mean = atoi(o.arg);}// --top-n-mean + else if (c == 362) {ipt.flag |= RI_I_REV_QUERY;}// --rev-query + else if (c == 364) {ipt.fine_min = atof(o.arg);}// --fine-min + else if (c == 365) {ipt.fine_max = atof(o.arg);}// --fine-max + else if (c == 366) {ipt.fine_range = atof(o.arg);}// --fine-range + else if (c == 367) {puts(RH_VERSION); return 0;}// --version else if (c == 'V') {puts(RH_VERSION); return 0;} } if (argc == o.ind || fp_help == stdout) { - fprintf(fp_help, "Usage: rawhash [options] | [query.fast5] [...]\n"); + fprintf(fp_help, "Usage: rawhash2 [options] | [query.fast5] [...]\n"); fprintf(fp_help, "Options:\n"); fprintf(fp_help, " K-mer (pore) Model:\n"); @@ -351,8 +357,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, "\n Indexing:\n"); fprintf(fp_help, " -d FILE [Strongly recommended to create before mapping] dump index to FILE [].\n"); fprintf(fp_help, " -e INT number of events concatanated in a single hash (usually no larger than 10). Also applies during mapping [%d].\n", ipt.e); - fprintf(fp_help, " -q INT most significant bits of signal values to process [%d]. Signal values are assumed to be in the IEEE standard for floating-point arithmetic.\n", ipt.q); - fprintf(fp_help, " -l INT least significant bits of the q bits to quantize along with the most signficant 2 bits of the q bits [%d].\n", ipt.lq); + fprintf(fp_help, " -q INT Number of bits to use for quantization [%d]. Number of quantized buckets are created accordingly (2^INT).\n", ipt.q); fprintf(fp_help, " -w INT minimizer window size [%d]. Enables minimizer-based seeding in indexing and mapping (may reduce accuracy but improves the performance and memory space efficiency).\n", ipt.w); fprintf(fp_help, " --store-sig Stores the target signal in the index file.\n"); fprintf(fp_help, " --sig-target The target sequence (reference) contains signals rather than base characters.\n"); @@ -361,8 +366,8 @@ int main(int argc, char *argv[]) // fprintf(fp_help, " -n NUM number of consecutive seeds to use for BLEND-based seeding [%d]. Enables the BLEND mechanism (may improve accuracy but reduces the performance at the moment)\n", ipt.n); fprintf(fp_help, "\n Seeding:\n"); - fprintf(fp_help, " --q-mid-occ INT1[,INT2] Lower and upper bounds of k-mer occurrences [%d, %d]. The final k-mer occurrence threshold is max{INT1, min{INT2, --q-occ-frac}}. This option prevents excessively small or large -f estimated from the input reference.\n", opt.min_mid_occ, opt.max_mid_occ); - fprintf(fp_help, " --q-occ-frac FLOAT Discard a query seed if its occurrence is higher than FLOAT fraction of all query seeds [%g]. Set 0 to disable. [Note: Both --q-mid-occ and --q-occ-frac should be met for a seed to be discarded].\n", opt.q_occ_frac); + fprintf(fp_help, " --q-mid-occ INT1[,INT2] Lower and upper bounds of k-mer occurrences [%d, %d]. The final k-mer occurrence threshold is max{INT1, min{INT2, --occ-frac}}. This option prevents excessively small or large -f estimated from the input reference.\n", opt.min_mid_occ, opt.max_mid_occ); + fprintf(fp_help, " --occ-frac FLOAT Discard a query seed if its occurrence is higher than FLOAT fraction of all query seeds [%g]. Set 0 to disable. [Note: Both --q-mid-occ and --occ-frac should be met for a seed to be discarded].\n", opt.q_occ_frac); fprintf(fp_help, "\n Chaining Parameters:\n"); fprintf(fp_help, " --min-events INT minimum number of INT events in a chunk to start chain elongation [%d].\n", opt.min_events); @@ -451,21 +456,26 @@ int main(int argc, char *argv[]) return 1; } - float* pore_vals = 0; + + ri_pore_t pore; + pore.pore_vals = NULL; + pore.pore_inds = NULL; + pore.max_val = -5000.0; + pore.min_val = 5000.0; if(!idx_rdr->is_idx && fpore == 0){ fprintf(stderr, "[ERROR] missing input: please specify a pore model file with -p when generating the index from a sequence file\n"); ri_idx_reader_close(idx_rdr); return 1; }else if(!idx_rdr->is_idx && fpore){ - load_pore(fpore, ipt.k, ipt.lev_col, &pore_vals); - if(!pore_vals){ + load_pore(fpore, ipt.k, ipt.lev_col, &pore); + if(!pore.pore_vals){ fprintf(stderr, "[ERROR] cannot parse the k-mer pore model file. Please see the example k-mer model files provided in the RawHash repository.\n"); ri_idx_reader_close(idx_rdr); return 1; } } - while ((ri = ri_idx_reader_read(idx_rdr, pore_vals, n_threads)) != 0) { + while ((ri = ri_idx_reader_read(idx_rdr, &pore, n_threads)) != 0) { int ret; if (ri_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", @@ -473,8 +483,9 @@ int main(int argc, char *argv[]) if (argc != o.ind + 1) ri_mapopt_update(&opt, ri); if (ri_verbose >= 3) ri_idx_stat(ri); if (argc - (o.ind + 1) == 0) { + fprintf(stderr, "[INFO] No files to query index on. Only the index is constructed.\n"); ri_idx_destroy(ri); - continue; // no query files + continue; // no query files, just creating the index } ret = 0; // if (!(opt.flag & MM_F_FRAG_MODE)) { //TODO: enable frag mode directly from options @@ -494,7 +505,8 @@ int main(int argc, char *argv[]) } // n_parts = idx_rdr->n_parts; ri_idx_reader_close(idx_rdr); - if(pore_vals)free(pore_vals); + if(pore.pore_vals)free(pore.pore_vals); + if(pore.pore_inds)free(pore.pore_inds); if (fflush(stdout) == EOF) { perror("[ERROR] failed to write the results"); diff --git a/src/revent.c b/src/revent.c index 7f74d8f..65984df 100644 --- a/src/revent.c +++ b/src/revent.c @@ -3,6 +3,7 @@ #include #include #include +#include "rutils.h" //Some of the functions here are adopted from the Sigmap implementation (https://github.com/haowenz/sigmap/tree/c9a40483264c9514587a36555b5af48d3f054f6f). We have optimized the Sigmap implementation to work with the hash tables efficiently. @@ -19,8 +20,11 @@ typedef struct ri_detect_s { int valid_peak; }ri_detect_t; -static inline void comp_prefix_prefixsq(const float *sig, uint32_t s_len, float* prefix_sum, float* prefix_sum_square) { - +static inline void comp_prefix_prefixsq(const float *sig, + const uint32_t s_len, + float* prefix_sum, + float* prefix_sum_square) +{ assert(s_len > 0); prefix_sum[0] = 0.0f; @@ -31,22 +35,17 @@ static inline void comp_prefix_prefixsq(const float *sig, uint32_t s_len, float* } } -static inline float* comp_tstat(void *km, const float *prefix_sum, const float *prefix_sum_square, uint32_t s_len, uint32_t w_len) { - +static inline float* comp_tstat(void *km, + const float *prefix_sum, + const float *prefix_sum_square, + const uint32_t s_len, + const uint32_t w_len) +{ const float eta = FLT_MIN; - - // rh_kvec_t(float) tstat = {0,0,0}; - // rh_kv_resize(float, 0, tstat, s_len+1); - // rh_kv_pushp(float, 0, tstat, &s); - float* tstat = (float*)ri_kcalloc(km, s_len+1, sizeof(float)); - // Quick return: - // t-test not defined for number of points less than 2 - // need at least as many points as twice the window length if (s_len < 2*w_len || w_len < 2) return tstat; - // fudge boundaries memset(tstat, 0, w_len*sizeof(float)); - // get to work on the rest + for (uint32_t i = w_len; i <= s_len - w_len; ++i) { float sum1 = prefix_sum[i]; float sumsq1 = prefix_sum_square[i]; @@ -58,7 +57,7 @@ static inline float* comp_tstat(void *km, const float *prefix_sum, const float * float sumsq2 = prefix_sum_square[i + w_len] - prefix_sum_square[i]; float mean1 = sum1 / w_len; float mean2 = sum2 / w_len; - float combined_var = sumsq1 / w_len - mean1 * mean1 + sumsq2 / w_len - mean2 * mean2; + float combined_var = (sumsq1/w_len - mean1*mean1 + sumsq2/w_len - mean2*mean2)/w_len; // Prevent problem due to very small variances combined_var = fmaxf(combined_var, eta); // t-stat @@ -66,7 +65,7 @@ static inline float* comp_tstat(void *km, const float *prefix_sum, const float * // special case where there are two samples of equal size with // differing variance const float delta_mean = mean2 - mean1; - tstat[i] = fabs(delta_mean) / sqrt(combined_var / w_len); + tstat[i] = fabs(delta_mean) / sqrt(combined_var); } // fudge boundaries memset(tstat+s_len-w_len+1, 0, (w_len)*sizeof(float)); @@ -74,30 +73,43 @@ static inline float* comp_tstat(void *km, const float *prefix_sum, const float * return tstat; } -static inline uint32_t gen_peaks(ri_detect_t *short_detector, ri_detect_t *long_detector, const float peak_height, uint32_t* peaks) { - - assert(short_detector->s_len == long_detector->s_len); +// static inline float calculate_adaptive_peak_height(const float *prefix_sum, const float *prefix_sum_square, uint32_t current_index, uint32_t window_length, float base_peak_height) { +// // Ensure we don't go beyond signal bounds +// uint32_t start_index = current_index > window_length ? current_index - window_length : 0; +// uint32_t end_index = current_index + window_length; - uint32_t curInd = 0; +// float sum = prefix_sum[end_index] - prefix_sum[start_index]; +// float sumsq = prefix_sum_square[end_index] - prefix_sum_square[start_index]; +// float mean = sum / (end_index - start_index); +// float variance = (sumsq / (end_index - start_index)) - (mean * mean); +// float stddev = sqrtf(variance); + +// // Example adaptive strategy: Increase peak height in high-variance regions +// return base_peak_height * (1 + stddev); +// } + +static inline uint32_t gen_peaks(ri_detect_t **detectors, + const uint32_t n_detectors, + const float peak_height, + const float *prefix_sum, + const float *prefix_sum_square, + uint32_t* peaks) { - uint32_t ndetector = 2; - ri_detect_t *detectors[ndetector]; // = {short_detector, long_detector}; - detectors[0] = short_detector; - detectors[1] = long_detector; - for (uint32_t i = 0; i < short_detector->s_len; i++) { - for (uint32_t k = 0; k < ndetector; k++) { + uint32_t curInd = 0; + for (uint32_t i = 0; i < detectors[0]->s_len; i++) { + for (uint32_t k = 0; k < n_detectors; k++) { ri_detect_t *detector = detectors[k]; - // Carry on if we've been masked out if (detector->masked_to >= i) continue; float current_value = detector->sig[i]; + // float adaptive_peak_height = calculate_adaptive_peak_height(prefix_sum, prefix_sum_square, i, detector->window_length, peak_height); + if (detector->peak_pos == detector->DEF_PEAK_POS) { - // CASE 1: We've not yet recorded a maximum - if (current_value < detector->peak_value) { - // Either record a deeper minimum... + // CASE 1: We've not yet recorded any maximum + if (current_value < detector->peak_value) { // A deeper minimum: detector->peak_value = current_value; - } else if (current_value - detector->peak_value > peak_height) { // TODO(Haowen): this might cause overflow, need to fix this - // ...or we've seen a qualifying maximum + } else if (current_value - detector->peak_value > peak_height) { + // ...or a qualifying maximum: detector->peak_value = current_value; detector->peak_pos = i; // otherwise, wait to rise high enough to be considered a peak @@ -109,22 +121,22 @@ static inline uint32_t gen_peaks(ri_detect_t *short_detector, ri_detect_t *long_ detector->peak_value = current_value; detector->peak_pos = i; } - // Dominate other tstat signals if we're going to fire at some point - if (detector == short_detector) { - if (detector->peak_value > detector->threshold) { - long_detector->masked_to = detector->peak_pos + detector->window_length; - long_detector->peak_pos = long_detector->DEF_PEAK_POS; - long_detector->peak_value = long_detector->DEF_PEAK_VAL; - long_detector->valid_peak = 0; + // Tell other detectors no need to check for a peak until a certain point + if (detector->peak_value > detector->threshold) { + for(int n_d = k+1; n_d < n_detectors; n_d++){ + detectors[n_d]->masked_to = detector->peak_pos + detectors[0]->window_length; + detectors[n_d]->peak_pos = detectors[n_d]->DEF_PEAK_POS; + detectors[n_d]->peak_value = detectors[n_d]->DEF_PEAK_VAL; + detectors[n_d]->valid_peak = 0; } } - // Have we convinced ourselves we've seen a peak - if (detector->peak_value - current_value > peak_height && detector->peak_value > detector->threshold) { + // There is a good peak + if (detector->peak_value - current_value > peak_height && + detector->peak_value > detector->threshold) { detector->valid_peak = 1; } - // Finally, check the distance if this is a good peak + // Check if we are now further away from the current peak if (detector->valid_peak && (i - detector->peak_pos) > detector->window_length / 2) { - // Emit the boundary and reset peaks[curInd++] = detector->peak_pos; detector->peak_pos = detector->DEF_PEAK_POS; detector->peak_value = current_value; @@ -137,6 +149,36 @@ static inline uint32_t gen_peaks(ri_detect_t *short_detector, ri_detect_t *long_ return curInd; } +int compare_floats(const void* a, const void* b) { + const float* da = (const float*) a; + const float* db = (const float*) b; + return (*da > *db) - (*da < *db); +} + +float calculate_mean_of_filtered_segment(float* segment, + const uint32_t segment_length) +{ + // Calculate median and IQR + qsort(segment, segment_length, sizeof(float), compare_floats); // Assuming compare_floats is already defined + float q1 = segment[segment_length / 4]; + float q3 = segment[3 * segment_length / 4]; + float iqr = q3 - q1; + float lower_bound = q1 - iqr; + float upper_bound = q3 + iqr; + + float sum = 0.0; + uint32_t count = 0; + for (uint32_t i = 0; i < segment_length; i++) { + if (segment[i] >= lower_bound && segment[i] <= upper_bound) { + sum += segment[i]; + ++count; + } + } + + // Return the mean of the filtered segment + return count > 0 ? sum / count : 0; // Ensure we don't divide by zero +} + /** * @brief Generates events from peaks, prefix sums and s_len. * @@ -144,71 +186,128 @@ static inline uint32_t gen_peaks(ri_detect_t *short_detector, ri_detect_t *long_ * @param peaks Array of peak positions. * @param peak_size Size of peaks array. * @param prefix_sum Array of prefix sums. - * @param prefix_sum_square Array of prefix sums squared. * @param s_len Length of the signal. * @param n_events Pointer to the number of events generated. * @return float* Pointer to the array of generated events. */ static inline float* gen_events(void *km, + float* sig, const uint32_t *peaks, const uint32_t peak_size, - const float *prefix_sum, - const float *prefix_sum_square, const uint32_t s_len, uint32_t* n_events) { - uint32_t n_ev = 1; - double mean = 0, std_dev = 0, sum = 0, sum2 = 0; + uint32_t n_ev = 0; - for (uint32_t i = 1; i < peak_size; ++i) - if (peaks[i] > 0 && peaks[i] < s_len) - n_ev++; + for (uint32_t pi = 0; pi < peak_size; ++pi) + if (peaks[pi] > 0 && peaks[pi] < s_len) n_ev++; float* events = (float*)ri_kmalloc(km, n_ev*sizeof(float)); - float l_prefixsum = 0, l_peak = 0; - - for (uint32_t pi = 0; pi < n_ev - 1; pi++){ - events[pi] = (prefix_sum[peaks[pi]] - l_prefixsum)/(peaks[pi]-l_peak); - sum += events[pi]; - sum2 += events[pi]*events[pi]; - l_prefixsum = prefix_sum[peaks[pi]]; - l_peak = peaks[pi]; - } - events[n_ev-1] = (prefix_sum[s_len] - l_prefixsum)/(s_len-l_peak); - sum += events[n_ev-1]; - sum2 += events[n_ev-1]*events[n_ev-1]; + uint32_t start_idx = 0, segment_length = 0; - //normalization - mean = sum/n_ev; - std_dev = sqrt(sum2/n_ev - (mean)*(mean)); + for (uint32_t pi = 0, i = 0; pi < peak_size && i < n_ev; pi++){ + if (!(peaks[pi] > 0 && peaks[pi] < s_len)) continue; - for(uint32_t i = 0; i < n_ev; ++i){ - events[i] = (events[i]-mean)/std_dev; + segment_length = peaks[pi] - start_idx; + events[i++] = calculate_mean_of_filtered_segment(sig + start_idx, segment_length); + start_idx = peaks[pi]; } (*n_events) = n_ev; return events; } -float* detect_events(void *km, uint32_t s_len, const float* sig, uint32_t window_length1, uint32_t window_length2, float threshold1, float threshold2, float peak_height, uint32_t *n) // kt_for() callback +static inline float* normalize_signal(void *km, + const float* sig, + const uint32_t s_len, + double* mean_sum, + double* std_dev_sum, + uint32_t* n_events_sum, + uint32_t* n_sig) +{ + double sum = (*mean_sum), sum2 = (*std_dev_sum); + double mean = 0, std_dev = 0; + float* events = (float*)ri_kcalloc(km, s_len, sizeof(float)); + + for (uint32_t i = 0; i < s_len; ++i) { + sum += sig[i]; + sum2 += sig[i]*sig[i]; + } + + (*n_events_sum) += s_len; + (*mean_sum) = sum; + (*std_dev_sum) = sum2; + + mean = sum/(*n_events_sum); + std_dev = sqrt(sum2/(*n_events_sum) - (mean)*(mean)); + + float norm_val = 0; + int k = 0; + for(uint32_t i = 0; i < s_len; ++i){ + norm_val = (sig[i]-mean)/std_dev; + if(norm_val < 3 && norm_val > -3) events[k++] = norm_val; + } + + (*n_sig) = k; + + return events; +} + +float* detect_events(void *km, + const uint32_t s_len, + const float* sig, + const uint32_t window_length1, + const uint32_t window_length2, + const float threshold1, + const float threshold2, + const float peak_height, + double* mean_sum, + double* std_dev_sum, + uint32_t* n_events_sum, + uint32_t *n_events) { float* prefix_sum = (float*)ri_kcalloc(km, s_len+1, sizeof(float)); float* prefix_sum_square = (float*)ri_kcalloc(km, s_len+1, sizeof(float)); - comp_prefix_prefixsq(sig, s_len, prefix_sum, prefix_sum_square); - float* tstat1 = comp_tstat(km, prefix_sum, prefix_sum_square, s_len, window_length1); - float* tstat2 = comp_tstat(km, prefix_sum, prefix_sum_square, s_len, window_length2); - ri_detect_t short_detector = {.DEF_PEAK_POS = -1, .DEF_PEAK_VAL = FLT_MAX, .sig = tstat1, .s_len = s_len, .threshold = threshold1, - .window_length = window_length1, .masked_to = 0, .peak_pos = -1, .peak_value = FLT_MAX, .valid_peak = 0}; + //Normalize the signal + uint32_t n_signals = 0; + float* norm_signals = normalize_signal(km, sig, s_len, mean_sum, std_dev_sum, n_events_sum, &n_signals); + if(n_signals == 0) return 0; + comp_prefix_prefixsq(norm_signals, n_signals, prefix_sum, prefix_sum_square); + + float* tstat1 = comp_tstat(km, prefix_sum, prefix_sum_square, n_signals, window_length1); + float* tstat2 = comp_tstat(km, prefix_sum, prefix_sum_square, n_signals, window_length2); + ri_detect_t short_detector = {.DEF_PEAK_POS = -1, + .DEF_PEAK_VAL = FLT_MAX, + .sig = tstat1, + .s_len = n_signals, + .threshold = threshold1, + .window_length = window_length1, + .masked_to = 0, + .peak_pos = -1, + .peak_value = FLT_MAX, + .valid_peak = 0}; + + ri_detect_t long_detector = {.DEF_PEAK_POS = -1, + .DEF_PEAK_VAL = FLT_MAX, + .sig = tstat2, + .s_len = n_signals, + .threshold = threshold2, + .window_length = window_length2, + .masked_to = 0, + .peak_pos = -1, + .peak_value = FLT_MAX, + .valid_peak = 0}; + + uint32_t* peaks = (uint32_t*)ri_kmalloc(km, n_signals * sizeof(uint32_t)); + ri_detect_t *detectors[2] = {&short_detector, &long_detector}; + uint32_t n_peaks = gen_peaks(detectors, 2, peak_height, prefix_sum, prefix_sum_square, peaks); + ri_kfree(km, tstat1); ri_kfree(km, tstat2); ri_kfree(km, prefix_sum); ri_kfree(km, prefix_sum_square); - ri_detect_t long_detector = {.DEF_PEAK_POS = -1, .DEF_PEAK_VAL = FLT_MAX, .sig = tstat2, .s_len = s_len, .threshold = threshold2, - .window_length = window_length2, .masked_to = 0, .peak_pos = -1, .peak_value = FLT_MAX, .valid_peak = 0}; - uint32_t* peaks = (uint32_t*)ri_kmalloc(km, s_len * sizeof(uint32_t)); - uint32_t n_peaks = gen_peaks(&short_detector, &long_detector, peak_height, peaks); float* events = 0; - if(n_peaks > 0) events = gen_events(km, peaks, n_peaks, prefix_sum, prefix_sum_square, s_len, n); - ri_kfree(km, tstat1); ri_kfree(km, tstat2); ri_kfree(km, prefix_sum); ri_kfree(km, prefix_sum_square); ri_kfree(km, peaks); + if(n_peaks > 0) events = gen_events(km, norm_signals, peaks, n_peaks, n_signals, n_events); + ri_kfree(km, norm_signals); ri_kfree(km, peaks); return events; } diff --git a/src/revent.h b/src/revent.h index 13d32cd..a98dfc8 100644 --- a/src/revent.h +++ b/src/revent.h @@ -19,7 +19,18 @@ extern "C" { * * @return list of event values of length $n */ -float* detect_events(void *km, uint32_t s_len, const float* sig, uint32_t window_length1, uint32_t window_length2, float threshold1, float threshold2, float peak_height, uint32_t *n); +float* detect_events(void *km, + const uint32_t s_len, + const float* sig, + const uint32_t window_length1, + const uint32_t window_length2, + const float threshold1, + const float threshold2, + const float peak_height, + double* mean_sum, + double* std_dev_sum, + uint32_t* n_events_sum, + uint32_t *n_events); #ifdef __cplusplus } diff --git a/src/rindex.c b/src/rindex.c index 666edd3..936d15a 100644 --- a/src/rindex.c +++ b/src/rindex.c @@ -30,7 +30,6 @@ int ri_verbose = 1; typedef struct { int mini_batch_size; uint64_t batch_size, sum_len; - const float* pore_vals; mm_bseq_file_t* fp; ri_idx_t* ri; @@ -48,14 +47,16 @@ typedef struct { void ri_idx_stat(const ri_idx_t *ri) { - fprintf(stderr, "[M::%s] pore kmer size: %d; concatanated events: %d; quantization method (most sig. Q/least sig. lq): %d/%d; w: %d; n: %d; #seq: %d\n", __func__, ri->k, ri->e, ri->q, ri->lq, ri->w, ri->n, ri->n_seq); + fprintf(stderr, "[M::%s] pore kmer size: %d; concatanated events: %d; quantization bits: %d; w: %d; n: %d; #seq: %d\n", __func__, ri->k, ri->e, ri->q, ri->w, ri->n, ri->n_seq); } -ri_idx_t* ri_idx_init(float diff, int b, int w, int e, int n, int q, int lq, int k, int flag){ +ri_idx_t* ri_idx_init(float diff, int b, int w, int e, int n, int q, int k, float fine_min, float fine_max, float fine_range, int flag){ ri_idx_t* ri; ri = (ri_idx_t*)calloc(1, sizeof(ri_idx_t)); - ri->b = b, ri->w = w; ri->e = e; ri->n = n; ri->q = q; ri->lq = lq, ri->k = k, ri->flag = flag; + ri->b = b, ri->w = w; ri->e = e; ri->n = n; ri->q = q; ri->k = k, ri->flag = flag; ri->diff = diff; + ri->fine_min = fine_min, ri->fine_max = fine_max, ri->fine_range = fine_range; + ri->seq = NULL; ri->sig = NULL; ri->F = NULL; ri->R = NULL; ri->f_l_sig = NULL; ri->r_l_sig = NULL; ri->pore = NULL; ri->h = NULL; ri->B = (ri_idx_bucket_t*)calloc(1<b, sizeof(ri_idx_bucket_t)); ri->km = ri_km_init(); @@ -64,7 +65,7 @@ ri_idx_t* ri_idx_init(float diff, int b, int w, int e, int n, int q, int lq, int void ri_idx_destroy(ri_idx_t *ri){ uint32_t i; - if (ri == 0) return; + if (!ri) return; if (ri->h) kh_destroy(str, (khash_t(str)*)ri->h); if (ri->B) { for (i = 0; i < 1U<b; ++i) { @@ -75,25 +76,6 @@ void ri_idx_destroy(ri_idx_t *ri){ } if(ri->km) ri_km_destroy(ri->km); - else if(ri->n_seq && ri->seq){ - for (i = 0; i < ri->n_seq; ++i){ - free(ri->seq[i].name); - - if(ri->flag & RI_I_STORE_SIG){ - free(ri->F[i]); - free(ri->R[i]); - } - } - - if(ri->flag & RI_I_STORE_SIG){ - free(ri->F); - free(ri->f_l_sig); - free(ri->R); - free(ri->r_l_sig); - } - - free(ri->seq); - } free(ri->B); free(ri); } @@ -138,29 +120,34 @@ static void *worker_pipeline(void *shared, int step, void *in) int seq_c = s->seq[s->n_seq-1].rid + 1; - if(p->ri->flag & RI_I_STORE_SIG){ - p->ri->F = (float**)ri_krealloc(0, p->ri->F,seq_c * sizeof(float*)); - p->ri->f_l_sig = (uint32_t*)ri_krealloc(0, p->ri->f_l_sig,seq_c * sizeof(uint32_t)); - p->ri->R = (float**)ri_krealloc(0, p->ri->R,seq_c * sizeof(float*)); - p->ri->r_l_sig = (uint32_t*)ri_krealloc(0, p->ri->r_l_sig,seq_c * sizeof(uint32_t)); + if(p->ri->flag&RI_I_STORE_SIG){ + p->ri->F = (float**)ri_krealloc(p->ri->km, p->ri->F,seq_c * sizeof(float*)); + p->ri->f_l_sig = (uint32_t*)ri_krealloc(p->ri->km, p->ri->f_l_sig,seq_c * sizeof(uint32_t)); + + if(!(p->ri->flag&RI_I_REV_QUERY)){ + p->ri->R = (float**)ri_krealloc(p->ri->km, p->ri->R,seq_c * sizeof(float*)); + p->ri->r_l_sig = (uint32_t*)ri_krealloc(p->ri->km, p->ri->r_l_sig,seq_c * sizeof(uint32_t)); + } for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t* t = &s->seq[i]; if (t->l_seq > 0){ uint32_t s_len; uint32_t r_id = s->seq[i].rid; - p->ri->F[r_id] = (float*)ri_kcalloc(0, t->l_seq, sizeof(float)); + p->ri->F[r_id] = (float*)ri_kcalloc(p->ri->km, t->l_seq, sizeof(float)); float* s_values = p->ri->F[r_id]; - ri_seq_to_sig(t->seq, t->l_seq, p->pore_vals, p->ri->k, 0, &s_len, s_values); - ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->lq, p->ri->k, &s->a); + ri_seq_to_sig(t->seq, t->l_seq, p->ri->pore, p->ri->k, 0, &s_len, s_values); + ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->k, p->ri->fine_min, p->ri->fine_max, p->ri->fine_range, &s->a); p->ri->f_l_sig[r_id] = s_len; - p->ri->R[r_id] = (float*)ri_kcalloc(0, t->l_seq, sizeof(float)); - s_values = p->ri->R[r_id]; - ri_seq_to_sig(t->seq, t->l_seq, p->pore_vals, p->ri->k, 1, &s_len, s_values); - ri_sketch(0, s_values, t->rid, 1, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->lq, p->ri->k, &s->a); - p->ri->r_l_sig[r_id] = s_len; + if(!(p->ri->flag&RI_I_REV_QUERY)){ + p->ri->R[r_id] = (float*)ri_kcalloc(p->ri->km, t->l_seq, sizeof(float)); + s_values = p->ri->R[r_id]; + ri_seq_to_sig(t->seq, t->l_seq, p->ri->pore, p->ri->k, 1, &s_len, s_values); + ri_sketch(0, s_values, t->rid, 1, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->k, p->ri->fine_min, p->ri->fine_max, p->ri->fine_range, &s->a); + p->ri->r_l_sig[r_id] = s_len; + } } free(t->seq); free(t->name); } @@ -171,11 +158,13 @@ static void *worker_pipeline(void *shared, int step, void *in) uint32_t s_len; float* s_values = (float*)calloc(t->l_seq, sizeof(float)); - ri_seq_to_sig(t->seq, t->l_seq, p->pore_vals, p->ri->k, 0, &s_len, s_values); - ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->lq, p->ri->k, &s->a); + ri_seq_to_sig(t->seq, t->l_seq, p->ri->pore, p->ri->k, 0, &s_len, s_values); + ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->k, p->ri->fine_min, p->ri->fine_max, p->ri->fine_range, &s->a); - ri_seq_to_sig(t->seq, t->l_seq, p->pore_vals, p->ri->k, 1, &s_len, s_values); - ri_sketch(0, s_values, t->rid, 1, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->lq, p->ri->k, &s->a); + if(!(p->ri->flag&RI_I_REV_QUERY)){ + ri_seq_to_sig(t->seq, t->l_seq, p->ri->pore, p->ri->k, 1, &s_len, s_values); + ri_sketch(0, s_values, t->rid, 1, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->k, p->ri->fine_min, p->ri->fine_max, p->ri->fine_range, &s->a); + } free(s_values); } @@ -271,22 +260,20 @@ static void *worker_sig_pipeline(void *shared, int step, void *in) return s; } else if (step == 1) { // step 1: compute sketch step_t *s = (step_t*)in; + for (i = 0; i < s->n_seq; ++i) { + ri_sig_t* t = s->sig[i]; + if (t->l_sig > 0){ + uint32_t s_len = 0; + double s_sum = 0, s_std = 0; + uint32_t n_events_sum = 0; + float* s_values = detect_events(0, t->l_sig, t->sig, p->ri->window_length1, p->ri->window_length2, p->ri->threshold1, p->ri->threshold2, p->ri->peak_height, &s_sum, &s_std, &n_events_sum, &s_len); - //TODO Complete similar to worker_pipeline - // }else { - for (i = 0; i < s->n_seq; ++i) { - ri_sig_t* t = s->sig[i]; - if (t->l_sig > 0){ - uint32_t s_len = 0; - float* s_values = detect_events(0, t->l_sig, t->sig, p->ri->window_length1, p->ri->window_length2, p->ri->threshold1, p->ri->threshold2, p->ri->peak_height, &s_len); + ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->k, p->ri->fine_min, p->ri->fine_max, p->ri->fine_range, &s->a); - ri_sketch(0, s_values, t->rid, 0, s_len, p->ri->diff, p->ri->w, p->ri->e, p->ri->n, p->ri->q, p->ri->lq, p->ri->k, &s->a); - - if(s_values)free(s_values); - } - free(t->sig); free(t->name); + if(s_values)free(s_values); } - // } + free(t->sig); free(t->name); + } free(s->sig); s->sig = 0; return s; } else if (step == 2) { // dispatch sketch to buckets @@ -380,16 +367,27 @@ const uint64_t *ri_idx_get(const ri_idx_t *ri, uint64_t hashval, int *n){ void ri_idx_dump(FILE* idx_file, const ri_idx_t* ri){ - uint32_t pars[8], i; + uint32_t pars[7], i; - pars[0] = ri->w, pars[1] = ri->e, pars[2] = ri->n, pars[3] = ri->q, pars[4] = ri->lq, pars[5] = ri->k, pars[6] = ri->n_seq, pars[7] = ri->flag; + pars[0] = ri->w, pars[1] = ri->e, pars[2] = ri->n, pars[3] = ri->q, pars[4] = ri->k, pars[5] = ri->n_seq, pars[6] = ri->flag; fwrite(RI_IDX_MAGIC, 1, RI_IDX_MAGIC_BYTE, idx_file); - fwrite(pars, sizeof(uint32_t), 8, idx_file); + fwrite(pars, sizeof(uint32_t), 7, idx_file); fwrite(&ri->diff, sizeof(float), 1, idx_file); + fwrite(&ri->fine_min, sizeof(float), 1, idx_file); + fwrite(&ri->fine_max, sizeof(float), 1, idx_file); + fwrite(&ri->fine_range, sizeof(float), 1, idx_file); + fwrite(ri->pore, sizeof(ri_pore_t), 1, idx_file); + fwrite(ri->pore->pore_vals, sizeof(float), ri->pore->n_pore_vals, idx_file); + ri_kfree(ri->km, ri->pore->pore_vals); + ri->pore->pore_vals = 0; + fwrite(ri->pore->pore_inds, sizeof(ri_porei_t), ri->pore->n_pore_vals, idx_file); + ri_kfree(ri->km, ri->pore->pore_inds); + ri->pore->pore_inds = 0; + ri_kfree(ri->km, ri->pore); + // ri->pore = 0; for (i = 0; i < ri->n_seq; ++i) { - if(ri->flag & RI_I_SIG_TARGET){ if (ri->sig[i].name) { uint8_t l = strlen(ri->sig[i].name); @@ -415,18 +413,22 @@ void ri_idx_dump(FILE* idx_file, const ri_idx_t* ri){ if(ri->flag & RI_I_STORE_SIG){ fwrite(&(ri->f_l_sig[i]), 4, 1, idx_file); fwrite(ri->F[i], 4, ri->f_l_sig[i], idx_file); - ri_kfree(0, ri->F[i]); - fwrite(&(ri->r_l_sig[i]), 4, 1, idx_file); - fwrite(ri->R[i], 4, ri->r_l_sig[i], idx_file); - ri_kfree(0, ri->R[i]); + ri_kfree(ri->km, ri->F[i]); + if(!(ri->flag&RI_I_REV_QUERY)){ + fwrite(&(ri->r_l_sig[i]), 4, 1, idx_file); + fwrite(ri->R[i], 4, ri->r_l_sig[i], idx_file); + ri_kfree(ri->km, ri->R[i]); + } } } if(ri->flag & RI_I_STORE_SIG){ - ri_kfree(0, ri->F); - ri_kfree(0, ri->f_l_sig); - ri_kfree(0, ri->R); - ri_kfree(0, ri->r_l_sig); + ri_kfree(ri->km, ri->F); + ri_kfree(ri->km, ri->f_l_sig); + if(!(ri->flag&RI_I_REV_QUERY)){ + ri_kfree(ri->km, ri->R); + ri_kfree(ri->km, ri->r_l_sig); + } } for (i = 0; i < 1U<b; ++i) { @@ -459,22 +461,34 @@ ri_idx_t* ri_idx_load(FILE* idx_file){ if (fread(magic, 1, RI_IDX_MAGIC_BYTE, idx_file) != RI_IDX_MAGIC_BYTE) return 0; if (strncmp(magic, RI_IDX_MAGIC, RI_IDX_MAGIC_BYTE) != 0) return 0; - int pars[8]; - fread(&pars[0], sizeof(int), 8, idx_file); + int pars[7]; + fread(&pars[0], sizeof(int), 7, idx_file); - float diff; + float diff, fine_min, fine_max, fine_range; fread(&diff, sizeof(float), 1, idx_file); + fread(&fine_min, sizeof(float), 1, idx_file); + fread(&fine_max, sizeof(float), 1, idx_file); + fread(&fine_range, sizeof(float), 1, idx_file); - ri = ri_idx_init(diff, 14, pars[0], pars[1], pars[2], pars[3], pars[4], pars[5], pars[7]); - ri->n_seq = pars[6]; + ri = ri_idx_init(diff, 14, pars[0], pars[1], pars[2], pars[3], pars[4], fine_min, fine_max, fine_range, pars[6]); + ri->n_seq = pars[5]; if(ri->flag&RI_I_SIG_TARGET) ri->sig = (ri_sig_t*)ri_kcalloc(ri->km, ri->n_seq, sizeof(ri_sig_t)); else ri->seq = (ri_idx_seq_t*)ri_kcalloc(ri->km, ri->n_seq, sizeof(ri_idx_seq_t)); + ri->pore = (ri_pore_t*)ri_kmalloc(ri->km, sizeof(ri_pore_t)); + fread(ri->pore, sizeof(ri_pore_t), 1, idx_file); + ri->pore->pore_vals = (float*)ri_kmalloc(ri->km, ri->pore->n_pore_vals * sizeof(float)); + fread(ri->pore->pore_vals, sizeof(float), ri->pore->n_pore_vals, idx_file); + ri->pore->pore_inds = (ri_porei_t*)ri_kmalloc(ri->km, ri->pore->n_pore_vals * sizeof(ri_porei_t)); + fread(ri->pore->pore_inds, sizeof(ri_porei_t), ri->pore->n_pore_vals, idx_file); + if(ri->flag & RI_I_STORE_SIG){ - ri->F = (float**)ri_kcalloc(0, ri->n_seq, sizeof(float*)); - ri->f_l_sig = (uint32_t*)ri_kcalloc(0, ri->n_seq, sizeof(uint32_t)); - ri->R = (float**)ri_kcalloc(0, ri->n_seq, sizeof(float*)); - ri->r_l_sig = (uint32_t*)ri_kcalloc(0, ri->n_seq, sizeof(uint32_t)); + ri->F = (float**)ri_kcalloc(ri->km, ri->n_seq, sizeof(float*)); + ri->f_l_sig = (uint32_t*)ri_kcalloc(ri->km, ri->n_seq, sizeof(uint32_t)); + if(!(ri->flag&RI_I_REV_QUERY)){ + ri->R = (float**)ri_kcalloc(ri->km, ri->n_seq, sizeof(float*)); + ri->r_l_sig = (uint32_t*)ri_kcalloc(ri->km, ri->n_seq, sizeof(uint32_t)); + } } for (i = 0; i < ri->n_seq; ++i) { @@ -505,12 +519,14 @@ ri_idx_t* ri_idx_load(FILE* idx_file){ if(ri->flag & RI_I_STORE_SIG){ fread(&(ri->f_l_sig[i]), 4, 1, idx_file); - ri->F[i] = (float*)ri_kmalloc(0, ri->f_l_sig[i] * sizeof(float)); + ri->F[i] = (float*)ri_kmalloc(ri->km, ri->f_l_sig[i] * sizeof(float)); fread(ri->F[i], 4, ri->f_l_sig[i], idx_file); - fread(&(ri->r_l_sig[i]), 4, 1, idx_file); - ri->R[i] = (float*)ri_kmalloc(0, ri->r_l_sig[i] * sizeof(float)); - fread(ri->R[i], 4, ri->r_l_sig[i], idx_file); + if(!(ri->flag&RI_I_REV_QUERY)){ + fread(&(ri->r_l_sig[i]), 4, 1, idx_file); + ri->R[i] = (float*)ri_kmalloc(ri->km, ri->r_l_sig[i] * sizeof(float)); + fread(ri->R[i], 4, ri->r_l_sig[i], idx_file); + } } } for (i = 0; i < 1U<b; ++i) { @@ -580,19 +596,25 @@ void ri_idx_reader_close(ri_idx_reader_t* r){ free(r); } -ri_idx_t* ri_idx_gen(mm_bseq_file_t* fp, float* pore_vals, float diff, int b, int w, int e, int n, int q, int lq, int k, int flag, int mini_batch_size, int n_threads, uint64_t batch_size) +ri_idx_t* ri_idx_gen(mm_bseq_file_t* fp, ri_pore_t* pore, float diff, int b, int w, int e, int n, int q, int k, float fine_min, float fine_max, float fine_range, int flag, int mini_batch_size, int n_threads, uint64_t batch_size) { if(flag&RI_I_SIG_TARGET) return 0; pipeline_t pl; - if (fp == 0 || mm_bseq_eof(fp) || !pore_vals) return 0; + if (fp == 0 || mm_bseq_eof(fp) || !pore || !pore->pore_vals) return 0; memset(&pl, 0, sizeof(pipeline_t)); pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size; pl.batch_size = batch_size; pl.fp = fp; - pl.pore_vals = pore_vals; - pl.ri = ri_idx_init(diff, b, w, e, n, q, lq, k, flag); + pl.ri = ri_idx_init(diff, b, w, e, n, q, k, fine_min, fine_max, fine_range, flag); + + pl.ri->pore = (ri_pore_t*)ri_kmalloc(pl.ri->km, sizeof(ri_pore_t)); + memcpy(pl.ri->pore, pore, sizeof(ri_pore_t)); + pl.ri->pore->pore_vals = (float*)ri_kmalloc(pl.ri->km, pore->n_pore_vals * sizeof(float)); + memcpy(pl.ri->pore->pore_vals, pore->pore_vals, pore->n_pore_vals * sizeof(float)); + pl.ri->pore->pore_inds = (ri_porei_t*)ri_kmalloc(pl.ri->km, pore->n_pore_vals * sizeof(ri_porei_t)); + memcpy(pl.ri->pore->pore_inds, pore->pore_inds, pore->n_pore_vals * sizeof(ri_porei_t)); kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); ri_idx_post(pl.ri, n_threads); @@ -600,13 +622,13 @@ ri_idx_t* ri_idx_gen(mm_bseq_file_t* fp, float* pore_vals, float diff, int b, in return pl.ri; } -ri_idx_t* ri_idx_siggen(ri_sig_file_t** fp, char **f, int &cur_f, int n_f, float* pore_vals, float diff, int b, int w, int e, int n, int q, int lq, int k, uint32_t window_length1, uint32_t window_length2, float threshold1, float threshold2, float peak_height, int flag, int mini_batch_size, int n_threads, uint64_t batch_size) +ri_idx_t* ri_idx_siggen(ri_sig_file_t** fp, char **f, int &cur_f, int n_f, ri_pore_t* pore, float diff, int b, int w, int e, int n, int q, int k, float fine_min, float fine_max, float fine_range, uint32_t window_length1, uint32_t window_length2, float threshold1, float threshold2, float peak_height, int flag, int mini_batch_size, int n_threads, uint64_t batch_size) { if(!(flag&RI_I_SIG_TARGET)) return 0; pipeline_t pl; - if (fp == 0 || *fp == 0 || n_f <= 0 || cur_f > n_f || (cur_f == n_f && (*fp)->cur_read >= (*fp)->num_read) || !pore_vals) return 0; + if (fp == 0 || *fp == 0 || n_f <= 0 || cur_f > n_f || (cur_f == n_f && (*fp)->cur_read >= (*fp)->num_read) || !pore || !pore->pore_vals) return 0; memset(&pl, 0, sizeof(pipeline_t)); pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size; pl.batch_size = batch_size; @@ -614,8 +636,14 @@ ri_idx_t* ri_idx_siggen(ri_sig_file_t** fp, char **f, int &cur_f, int n_f, float pl.sf = f; pl.n_f = n_f; pl.cur_f = cur_f; - pl.pore_vals = pore_vals; - pl.ri = ri_idx_init(diff, b, w, e, n, q, lq, k, flag); + pl.ri = ri_idx_init(diff, b, w, e, n, q, k, fine_min, fine_max, fine_range, flag); + + pl.ri->pore = (ri_pore_t*)ri_kmalloc(pl.ri->km, sizeof(ri_pore_t)); + memcpy(pl.ri->pore, pore, sizeof(ri_pore_t)); + pl.ri->pore->pore_vals = (float*)ri_kmalloc(pl.ri->km, pore->n_pore_vals * sizeof(float)); + memcpy(pl.ri->pore->pore_vals, pore->pore_vals, pore->n_pore_vals * sizeof(float)); + pl.ri->pore->pore_inds = (ri_porei_t*)ri_kmalloc(pl.ri->km, pore->n_pore_vals * sizeof(ri_porei_t)); + memcpy(pl.ri->pore->pore_inds, pore->pore_inds, pore->n_pore_vals * sizeof(ri_porei_t)); pl.ri->window_length1 = window_length1; pl.ri->window_length2 = window_length2; @@ -633,23 +661,22 @@ ri_idx_t* ri_idx_siggen(ri_sig_file_t** fp, char **f, int &cur_f, int n_f, float return pl.ri; } -ri_idx_t* ri_idx_reader_read(ri_idx_reader_t* r, float* pore_vals, int n_threads){ +ri_idx_t* ri_idx_reader_read(ri_idx_reader_t* r, ri_pore_t* pore, int n_threads){ ri_idx_t *ri; if (r->is_idx) { ri = ri_idx_load(r->fp.idx); - } else if(r->opt.flag & RI_I_SIG_TARGET) { - - ri = ri_idx_siggen(&(r->sfp), r->sf, r->cur_f, r->n_f, pore_vals, r->opt.diff, r->opt.b, r->opt.w, r->opt.e, r->opt.n, r->opt.q, r->opt.lq, r->opt.k, r->opt.window_length1, r->opt.window_length2, r->opt.threshold1, r->opt.threshold2, r->opt.peak_height, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); - - } - else{ - ri = ri_idx_gen(r->fp.seq, pore_vals, r->opt.diff, r->opt.b, r->opt.w, r->opt.e, r->opt.n, r->opt.q, r->opt.lq, r->opt.k, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); + } else if(r->opt.flag&RI_I_SIG_TARGET) { + ri = ri_idx_siggen(&(r->sfp), r->sf, r->cur_f, r->n_f, pore, r->opt.diff, r->opt.b, r->opt.w, r->opt.e, r->opt.n, r->opt.q, r->opt.k, r->opt.fine_min, r->opt.fine_max, r->opt.fine_range, r->opt.window_length1, r->opt.window_length2, r->opt.threshold1, r->opt.threshold2, r->opt.peak_height, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); + } else{ + ri = ri_idx_gen(r->fp.seq, pore, r->opt.diff, r->opt.b, r->opt.w, r->opt.e, r->opt.n, r->opt.q, r->opt.k, r->opt.fine_min, r->opt.fine_max, r->opt.fine_range, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size); } + if (ri) { if (r->fp_out) ri_idx_dump(r->fp_out, ri); ri->index = r->n_parts++; } + return ri; } diff --git a/src/rindex.h b/src/rindex.h index d2d53bd..c74bd83 100644 --- a/src/rindex.h +++ b/src/rindex.h @@ -27,11 +27,14 @@ typedef struct ri_idx_seq_s{ } ri_idx_seq_t; typedef struct ri_idx_s{ - int32_t b, w, e, n, q, lq, k, flag; + int32_t b, w, e, n, q, k, flag; int32_t index; float diff; + float fine_min, fine_max, fine_range; struct ri_idx_bucket_s *B; // index (hidden) + ri_pore_t* pore; + void *km, *h; uint32_t window_length1; @@ -131,7 +134,7 @@ ri_idx_t* ri_idx_init(int b, int w, int e, int n, int q, int lq, int k, int flag * * @return rindex (index) */ -ri_idx_t *ri_idx_reader_read(ri_idx_reader_t *r, float* pore_vals, int n_threads); +ri_idx_t *ri_idx_reader_read(ri_idx_reader_t *r, ri_pore_t* pore, int n_threads); /** * Adds the values in an array to the index diff --git a/src/rmap.cpp b/src/rmap.cpp index b593c1d..01927de 100644 --- a/src/rmap.cpp +++ b/src/rmap.cpp @@ -188,21 +188,24 @@ void align_chain(mm_reg1_t *chain, mm128_t* anchors, const ri_idx_t *ri, const f } } -//returns n_regs // currently we report one mapping void ri_map_frag(const ri_idx_t *ri, const uint32_t s_len, const float *sig, ri_reg1_t* reg, ri_tbuf_t *b, const ri_mapopt_t *opt, - const char *qname) + const char *qname, + double* mean_sum, + double* std_dev_sum, + uint32_t* n_events_sum, + const uint32_t c_count = 0) { uint32_t n_events = 0; #ifdef PROFILERH double signal_t = ri_realtime(); #endif - float* events = detect_events(b->km, s_len, sig, opt->window_length1, opt->window_length2, opt->threshold1, opt->threshold2, opt->peak_height, &n_events); + float* events = detect_events(b->km, s_len, sig, opt->window_length1, opt->window_length2, opt->threshold1, opt->threshold2, opt->peak_height, mean_sum, std_dev_sum, n_events_sum, &n_events); #ifdef PROFILERH ri_signaltime += ri_realtime() - signal_t; #endif @@ -213,8 +216,10 @@ void ri_map_frag(const ri_idx_t *ri, } //Copy events to the end of reg->events with ri_krealloc - reg->events = (float*)ri_krealloc(b->km, reg->events, (reg->offset+n_events) * sizeof(float)); - memcpy(reg->events + reg->offset, events, n_events * sizeof(float)); + if(opt->flag&RI_M_DTW_EVALUATE_CHAINS){ + reg->events = (float*)ri_krealloc(b->km, reg->events, (reg->offset+n_events) * sizeof(float)); + memcpy(reg->events + reg->offset, events, n_events * sizeof(float)); + } #ifdef PROFILERH double sketch_t = ri_realtime(); @@ -222,7 +227,8 @@ void ri_map_frag(const ri_idx_t *ri, //Sketching mm128_v riv = {0,0,0}; - ri_sketch(b->km, events, 0, 0, n_events, ri->diff, ri->w, ri->e, ri->n, ri->q, ri->lq, ri->k, &riv); + ri_sketch(b->km, events, 0, 0, n_events, ri->diff, ri->w, ri->e, ri->n, ri->q, ri->k, ri->fine_min, ri->fine_max, ri->fine_range, &riv); + if(events){ri_kfree(b->km, events); events = NULL;} // if (opt->q_occ_frac > 0.0f) ri_seed_mz_flt(b->km, &riv, opt->mid_occ, opt->q_occ_frac); @@ -310,32 +316,6 @@ void ri_map_frag(const ri_idx_t *ri, //Set MAPQ TODO: integrate alignment score within mapq mm_set_mapq(b->km, reg->n_cregs, reg->creg, opt->min_chaining_score, rep_len, (opt->flag&RI_M_DTW_EVALUATE_CHAINS)?1:0); - // for(int i = 0; i < reg->n_cregs; ++i){ - - // int rid = reg->creg[i].rid; - // int rs = reg->creg[i].rev?(uint32_t)(ri->seq[rid].len+1-reg->creg[i].re):reg->creg[i].rs; - // float* ref = (reg->creg[i].rev)?ri->R[rid]:ri->F[rid]; - // uint32_t r_len = (reg->creg[i].rev)?ri->r_l_sig[rid]:ri->f_l_sig[rid]; - - // fprintf(stderr, "Chain: %d rid: %d mapq: %d score: %d align: %f cnt: %d s: %c rs: %d qs: %d qe: %d p?: %c\n", i, rid, reg->creg[i].mapq, reg->creg[i].score, reg->creg[i].alignment_score, reg->creg[i].cnt, "+-"[reg->creg[i].rev], rs, reg->creg[i].qs, reg->creg[i].qe, "01"[reg->creg[i].parent == i]); - - // // float* revents = ref + reg->creg[i].rs; - // // uint32_t rlen = reg->creg[i].re - reg->creg[i].rs + 1; - - // //print ref starting from rs until re - rs + 1 - // // for(int j = 0; j < rlen; ++j){ - // // fprintf(stderr, "%f ", revents[j]); - // // } fprintf(stderr, " 1\n"); - - // // float* qevents = reg->events + reg->creg[i].qs; - // // uint32_t qlen = reg->creg[i].qe - reg->creg[i].qs + 1; - - // //print reg->Events from qs until qe - qs + 1 - // // for(int j = 0; j < qlen; ++j){ - // // fprintf(stderr, "%f ", qevents[j]); - // // } fprintf(stderr, " 2\n"); - // } - if(seed_hits){ri_kfree(b->km, seed_hits); seed_hits = NULL;} if(u){ri_kfree(b->km, u); u = NULL;} @@ -359,9 +339,9 @@ static void map_worker_for(void *_data, ri_sig_t* sig = s->sig[i]; - uint32_t max_chunk = opt->max_num_chunk; uint32_t qlen = sig->l_sig; uint32_t l_chunk = (opt->chunk_size > qlen)?qlen:opt->chunk_size; + uint32_t max_chunk = (opt->flag&RI_M_NO_ADAPTIVE)?((qlen-1)/l_chunk)+1:opt->max_num_chunk; uint32_t s_qs, s_qe = l_chunk; uint32_t c_count = 0; @@ -369,14 +349,18 @@ static void map_worker_for(void *_data, double t = ri_realtime(); - for (s_qs = c_count = 0; s_qs < qlen && c_count < max_chunk && reg0->n_maps == 0; s_qs += l_chunk, ++c_count) { - // fprintf(stderr, "%s %d\n", sig->name, c_count); + double mean_sum = 0, std_dev_sum = 0; + uint32_t n_events_sum = 0; + + for (s_qs = c_count = 0; s_qs < qlen && c_count < max_chunk; s_qs += l_chunk, ++c_count) { s_qe = s_qs + l_chunk; if(s_qe > qlen) s_qe = qlen; if(reg0->creg){free(reg0->creg); reg0->creg = NULL; reg0->n_cregs = 0;} - ri_map_frag(s->p->ri, (const uint32_t)s_qe-s_qs, (const float*)&(sig->sig[s_qs]), reg0, b, opt, sig->name); + ri_map_frag(s->p->ri, (const uint32_t)s_qe-s_qs, (const float*)&(sig->sig[s_qs]), reg0, b, opt, sig->name, &mean_sum, &std_dev_sum, &n_events_sum); + + int n_chains = (opt->flag&RI_M_ALL_CHAINS || reg0->n_cregs < 1)?reg0->n_cregs:1; if (reg0->n_cregs == 1 && ((reg0->creg[0].mapq >= opt->min_mapq) || (opt->flag&RI_M_DTW_EVALUATE_CHAINS && reg0->creg[0].alignment_score >= opt->dtw_min_score))) { reg0->n_maps++; @@ -386,7 +370,6 @@ static void map_worker_for(void *_data, } //TODO make n_cregs a parameter of best n mappings - int n_chains = (opt->flag&RI_M_ALL_CHAINS || reg0->n_cregs < 1)?reg0->n_cregs:1; float meanC = 0, meanQ = 0; // if(opt->flag&RI_M_DTW_EVALUATE_CHAINS){ // uint32_t chain_cnt = 0; @@ -451,6 +434,7 @@ static void map_worker_for(void *_data, // fprintf(stderr, "Aligned ic: %d\n", ic); } } + if(reg0->n_maps > 0) break; } double mapping_time = ri_realtime() - t; @@ -465,9 +449,6 @@ static void map_worker_for(void *_data, if(!chains) {reg0->n_cregs = 0;} float mean_chain_score = 0; - for (uint32_t c_ind = 0; c_ind < reg0->n_cregs; ++c_ind) - mean_chain_score += chains[c_ind].score; - if(reg0->n_cregs > 0) mean_chain_score /= reg0->n_cregs; if(reg0->n_maps == 0 && reg0->creg && reg0->creg[0].mapq > opt->min_mapq){ reg0->n_maps++; @@ -500,7 +481,7 @@ static void map_worker_for(void *_data, reg0->read_id = sig->rid; reg0->read_name = sig->name; - reg0->maps[0].read_length = (uint32_t)(read_position_scale * reg0->offset); + reg0->maps[0].read_length = (s->p->ri->flag&RI_I_SIG_TARGET)?reg0->offset:(uint32_t)(read_position_scale * reg0->offset); reg0->maps[0].c_id = 0; reg0->maps[0].ref_id = 0; reg0->maps[0].read_start_position = 0; @@ -528,10 +509,10 @@ static void map_worker_for(void *_data, reg0->read_id = sig->rid; reg0->read_name = sig->name; - reg0->maps[m].read_length = (s->p->ri->flag&RI_I_SIG_TARGET)?(read_position_scale*reg0->offset):(uint32_t)(read_position_scale*chains[c_id].qe); + reg0->maps[m].read_length = (s->p->ri->flag&RI_I_SIG_TARGET)?(reg0->offset):(uint32_t)(read_position_scale*chains[c_id].qe); reg0->maps[m].ref_id = chains[c_id].rid; - reg0->maps[m].read_start_position = (uint32_t)(read_position_scale*chains[c_id].qs); - reg0->maps[m].read_end_position = (uint32_t)(read_position_scale*chains[c_id].qe); + reg0->maps[m].read_start_position = (s->p->ri->flag&RI_I_SIG_TARGET)?chains[c_id].qs:(uint32_t)(read_position_scale*chains[c_id].qs); + reg0->maps[m].read_end_position = (s->p->ri->flag&RI_I_SIG_TARGET)?chains[c_id].qe:(uint32_t)(read_position_scale*chains[c_id].qe); if(s->p->ri->flag&RI_I_SIG_TARGET) reg0->maps[m].fragment_start_position = chains[c_id].rev?(uint32_t)(s->p->ri->sig[chains[c_id].rid].l_sig+1-chains[c_id].re):chains[c_id].rs; else reg0->maps[m].fragment_start_position = chains[c_id].rev?(uint32_t)(s->p->ri->seq[chains[c_id].rid].len+1-chains[c_id].re):chains[c_id].rs; reg0->maps[m].fragment_length = (uint32_t)(chains[c_id].re - chains[c_id].rs + 1); @@ -558,16 +539,15 @@ static void map_worker_for(void *_data, ri_sig_t** ri_sig_read_frag(pipeline_mt *pl, int64_t chunk_size, int *n_) -{ - int64_t size = 0; - rhsig_v rsigv = {0,0,0}; - +{ *n_ = 0; if (pl->n_fp < 1) return 0; //Debugging for sweeping purposes - // if(pl->su_nreads >= 2000) return 0; + // if(pl->su_nreads >= 1000) return 0; + int64_t size = 0; + rhsig_v rsigv = {0,0,0}; rh_kv_resize(ri_sig_t*, 0, rsigv, 4000); while (pl->fp) { @@ -605,11 +585,12 @@ ri_sig_t** ri_sig_read_frag(pipeline_mt *pl, //Debugging for sweeping purposes // pl->su_nreads++; - // if(pl->su_nreads >= 2000) break; + // if(pl->su_nreads >= 1000) break; } ri_sig_t** a = 0; if(rsigv.n) a = rsigv.a; + else rh_kv_destroy(rsigv); *n_ = rsigv.n; return a; @@ -658,7 +639,7 @@ static void *map_worker_pipeline(void *shared, #endif //Debugging for sweeping purposes - // if(p->su_nreads >= 2000) p->su_stop = p->su_nreads; + // if(p->su_nreads >= 1000) p->su_stop = p->su_nreads; if(p->opt->flag & RI_M_SEQUENCEUNTIL && !p->su_stop){ const ri_idx_t *ri = p->ri; @@ -733,6 +714,7 @@ static void *map_worker_pipeline(void *shared, // if(reg0->tags) {free(reg0->tags); reg0->tags = NULL;} if(reg0->maps){free(reg0->maps); reg0->maps = NULL;} free(reg0); s->reg[k] = NULL; + fflush(stdout); } } @@ -777,8 +759,8 @@ int ri_map_file_frag(const ri_idx_t *idx, rh_kv_resize(char*, 0, fnames, 256); find_sfiles(fn[0], &fnames); pl.f = fnames.a; - if(!fnames.n || ((pl.fp = open_sig(pl.f[0])) == 0)) return -1; - if (pl.fp == 0) return -1; + if(!fnames.n || ((pl.fp = open_sig(pl.f[0])) == 0)){rh_kv_destroy(fnames); return -1;} + if (pl.fp == 0){rh_kv_destroy(fnames); return -1;} pl.fn = fn; pl.n_f = fnames.n; pl.cur_fp = 1; @@ -819,5 +801,7 @@ int ri_map_file_frag(const ri_idx_t *idx, fprintf(stderr, "\n[M::%s] File read: %.6f sec; Signal-to-event: %.6f sec; Sketching: %.6f sec; Seeding: %.6f sec; Chaining: %.6f sec; Mapping: %.6f sec; Mapping (multi-threaded): %.6f sec\n", __func__, ri_filereadtime, ri_signaltime, ri_sketchtime, ri_seedtime, ri_chaintime, ri_maptime, ri_maptime_multithread); #endif + rh_kv_destroy(fnames); + return 0; } \ No newline at end of file diff --git a/src/rmap.h b/src/rmap.h index c9451cc..bdefac6 100644 --- a/src/rmap.h +++ b/src/rmap.h @@ -41,8 +41,6 @@ typedef struct ri_reg1_s{ uint32_t n_prev_anchors; mm_reg1_t* creg; // This is for transition purposes. int n_cregs; - - uint32_t n_chains; } ri_reg1_t; typedef struct pipeline_ms{ diff --git a/src/roptions.c b/src/roptions.c index 33a03f8..55c8a97 100644 --- a/src/roptions.c +++ b/src/roptions.c @@ -4,12 +4,16 @@ void ri_idxopt_init(ri_idxopt_t *opt) { memset(opt, 0, sizeof(ri_idxopt_t)); - opt->e = 6; opt->w = 0; opt->q = 9; opt->lq = 3; opt->n = 0; opt->k = 6, opt->lev_col = 1; + opt->e = 8; opt->w = 0; opt->q = 4; opt->n = 0; opt->k = 6, opt->lev_col = 1; opt->b = 14; - opt->diff = 0.3f; + opt->diff = 0.35f; opt->mini_batch_size = 50000000; opt->batch_size = 4000000000ULL; + opt->fine_min = -2.0f; + opt->fine_max = 2.0f; + opt->fine_range = 0.4; + opt->window_length1 = 3; //--seg-window-length1 opt->window_length2 = 6; //--seg-window-length2 opt->threshold1 = 4.30265f; //--seg-threshold1 @@ -31,7 +35,9 @@ void ri_mapopt_init(ri_mapopt_t *opt) opt->sample_per_base = (float)opt->sample_rate / opt->bp_per_sec; //seeding - opt->mid_occ_frac = 5e-3f; //--mid-occ-frac + // opt->mid_occ_frac = 75e-4f; //--mid-occ-frac + opt->mid_occ_frac = 1e-2f; //--mid-occ-frac + // opt->mid_occ_frac = 5e-3f; //--mid-occ-frac opt->q_occ_frac = 1e-2f; //--q-occ-frac opt->min_mid_occ = 50; //--q-mid-occ [I1, I2] opt->max_mid_occ = 500000; //--q-mid-occ [I1, I2] @@ -60,6 +66,8 @@ void ri_mapopt_init(ri_mapopt_t *opt) opt->pri_ratio = 0.3f; opt->best_n = 0; + opt->top_n_mean = 0; //--top-n-mean + opt->alt_drop = 0.15f; //--alt-drop opt->w_bestmq=0.05f; //--w-bestmq @@ -72,10 +80,9 @@ void ri_mapopt_init(ri_mapopt_t *opt) opt->mini_batch_size = 500000000; //-K - opt->step_size = 1; //read_seeding_step_size + opt->step_size = 1; opt->min_events = 50; //--min-events opt->max_num_chunk = 10;//--max-chunks - // opt->min_chain_anchor = 10; opt->min_mapq = 2; //--min-mapq diff --git a/src/roptions.h b/src/roptions.h index b33d2e6..3b8ec41 100644 --- a/src/roptions.h +++ b/src/roptions.h @@ -11,6 +11,7 @@ #define RI_I_SYNCMER 0x8 #define RI_I_STORE_SIG 0x10 #define RI_I_SIG_TARGET 0x20 +#define RI_I_REV_QUERY 0x40 #define RI_M_SEQUENCEUNTIL 0x1 #define RI_M_RMQ 0x2 @@ -29,6 +30,9 @@ //Overlapping related #define RI_M_ALL_CHAINS 0x2000 +//Characterization related +#define RI_M_OUT_ALL_CHAINS 0x4000 + //DTW related #define RI_M_DTW_BORDER_CONSTRAINT_GLOBAL 0 #define RI_M_DTW_BORDER_CONSTRAINT_SPARSE 1 @@ -42,11 +46,12 @@ extern "C" { // indexing and mapping options typedef struct ri_idxopt_s{ - short b, w, e, n, q, lq, k, flag, lev_col; + short b, w, e, n, q, k, flag, lev_col; int64_t mini_batch_size; uint64_t batch_size; float diff; + float fine_min, fine_max, fine_range; uint32_t window_length1; uint32_t window_length2; @@ -97,6 +102,8 @@ typedef struct ri_mapopt_s{ float pri_ratio; int best_n; + int top_n_mean; + float alt_drop; //Mapping parameters diff --git a/src/rseed.c b/src/rseed.c index 84345c1..65f18fc 100644 --- a/src/rseed.c +++ b/src/rseed.c @@ -70,7 +70,7 @@ ri_seed_t *ri_seed_collect_all(void *km, const ri_idx_t *ri, const mm128_v *riv, const uint64_t *cr; ri_seed_t *q; mm128_t *p = &riv->a[i]; - uint32_t q_pos = (uint32_t)riv->a[i].y, q_span = p->x & span_mask; + uint32_t q_pos = (uint32_t)p->y, q_span = p->x & span_mask; int t; cr = ri_idx_get(ri, p->x>>RI_HASH_SHIFT, &t); if (t == 0) continue; diff --git a/src/rsig.c b/src/rsig.c index 20d7616..b6a2566 100644 --- a/src/rsig.c +++ b/src/rsig.c @@ -10,7 +10,13 @@ #include #include -void ri_seq_to_sig(const char *str, int len, const float* pore_vals, const int k, const int strand, uint32_t* s_len, float* s_values){ +void ri_seq_to_sig(const char *str, + int len, + const ri_pore_t* pore, + const int k, + const int strand, + uint32_t* s_len, + float* s_values){ int i, j, l, pos, n = 0; // uint64_t shift1 = 2 * (k - 1); @@ -18,20 +24,18 @@ void ri_seq_to_sig(const char *str, int len, const float* pore_vals, const int k double mean = 0, std_dev = 0, sum = 0, sum2 = 0, curval = 0; for (i = l = j = n = 0; i < len; ++i) { - if(strand) pos = len - i -1; + if(strand) pos = len-i-1; else pos = i; int c = seq_nt4_table[(uint8_t)str[pos]]; if (c < 4) { // not an ambiguous base if(!strand) kmer = (kmer << 2 | c) & mask; // forward k-mer // else kmer = (kmer >> 2) | (3ULL^c) << shift1; // reverse k-mer - //TODO: this is currently based on the ordering in the original ordering in the sigmap implementation. Change later to above else kmer = ((kmer << 2) | (3ULL^c)) & mask; // reverse k-mer - }else - kmer = (kmer << 2) & mask; //TODO: This is not the best approach. We are basically inserting 00 (A?) to kmer whenever c >= 4. Mask it instead + } if(i+1 < k) continue; - curval = pore_vals[kmer]; + curval = pore->pore_vals[kmer]; s_values[j++] = curval; sum += curval; sum2 += curval*curval; @@ -40,8 +44,7 @@ void ri_seq_to_sig(const char *str, int len, const float* pore_vals, const int k mean = sum/j; std_dev = sqrt(sum2/j - (mean)*(mean)); - for(i = 0; i < j; ++i) - s_values[i] = (s_values[i]-mean)/std_dev; + for(i = 0; i < j; ++i) s_values[i] = (s_values[i]-mean)/std_dev; *s_len = j; } @@ -298,7 +301,9 @@ void find_sfiles(const char *A, ri_char_v *fnames) if (strstr(A, ".fast5") || strstr(A, ".pod5") || strstr(A, ".pod") || strstr(A, ".slow5") || strstr(A, ".blow5")) { char** cur_fname; rh_kv_pushp(char*, 0, *fnames, &cur_fname); - (*cur_fname) = strdup(A); + (*cur_fname) = + strdup(A) + ; } return; } diff --git a/src/rsig.h b/src/rsig.h index 103b90a..a631bbf 100644 --- a/src/rsig.h +++ b/src/rsig.h @@ -92,13 +92,19 @@ ri_sig_file_t **open_sigs(int n, const char **fn); * * @param str sequence to convert to its expected event values * @param len length of the $str - * @param pore_vals expected event values for each possible k-mer + * @param pore a pore model including expected event values for each possible k-mer * @param pore_kmer k-mer size of each event * @param strand directin of strand. 1 is forward, 0 is reverse direction. * @param s_len length of $s_values * @param s_values expected event values of each k-mer in $str */ -void ri_seq_to_sig(const char *str, int len, const float* pore_vals, const int pore_kmer, const int strand, uint32_t* s_len, float* s_values); +void ri_seq_to_sig(const char *str, + int len, + const ri_pore_t* pore, + const int pore_kmer, + const int strand, + uint32_t* s_len, + float* s_values); /** * Reads the entire signal values of the next read from a file diff --git a/src/rsketch.c b/src/rsketch.c index 1ae1af6..661e7f4 100644 --- a/src/rsketch.c +++ b/src/rsketch.c @@ -2,6 +2,7 @@ #include #include "rh_kvec.h" #include +#include static inline uint64_t hash64(uint64_t key, uint64_t mask){ key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; @@ -14,167 +15,93 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask){ return key; } -// static inline uint64_t MurmurHash3(uint64_t key, uint64_t mask) { -// key = (key^(key >> 33)); -// key = (key*0xff51afd7ed558ccd); -// key = (key^(key >> 33)); -// key = (key*0xc4ceb9fe1a85ec53); -// key = (key^(key >> 33)); - -// return key&mask; -// } - -// #include //TODO: insert ifdef here to include simd headers - -//https://stackoverflow.com/a/21673221 -// static inline __m256i movemask_inverse(const uint32_t hash_value) { -// __m256i vmask = _mm256_set1_epi32(hash_value); -// const __m256i shuffle = _mm256_setr_epi64x(0x0000000000000000, -// 0x0101010101010101, 0x0202020202020202, 0x0303030303030303); -// vmask = _mm256_shuffle_epi8(vmask, shuffle); -// const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); -// vmask = _mm256_or_si256(vmask, bit_mask); -// return _mm256_cmpeq_epi8(vmask, _mm256_set1_epi64x(-1)); -// } - -// static inline void calc_blend_simd(__m256i* blndcnt_lsb, __m256i* blndcnt_msb, __m256i* ma, __m256i* mb, uint64_t val, const uint64_t mask, const int bits) { - -// (*blndcnt_lsb) = _mm256_adds_epi8((*blndcnt_lsb), _mm256_blendv_epi8((*ma), (*mb), movemask_inverse(val&mask))); -// uint64_t blendVal = (uint64_t)_mm256_movemask_epi8((*blndcnt_lsb))&mask; -// if(bits > 32){ -// (*blndcnt_msb) = _mm256_adds_epi8((*blndcnt_msb), _mm256_blendv_epi8((*ma), (*mb),movemask_inverse((val>>32)&mask))); -// blendVal |= ((uint64_t)_mm256_movemask_epi8((*blndcnt_msb)))<<32; -// } -// } - -// static inline uint64_t calc_blend_rm_simd(__m256i* blndcnt_lsb, __m256i* blndcnt_msb, __m256i* ma, __m256i* mb, uint64_t val, uint64_t remval, const uint64_t mask, const int bits) { - -// (*blndcnt_lsb) = _mm256_adds_epi8((*blndcnt_lsb), _mm256_blendv_epi8((*ma), (*mb), movemask_inverse(val&mask))); -// uint64_t blendVal = (uint64_t)_mm256_movemask_epi8((*blndcnt_lsb))&mask; -// //removal of the oldest item -// (*blndcnt_lsb) = _mm256_adds_epi8((*blndcnt_lsb), _mm256_blendv_epi8((*mb), (*ma), movemask_inverse(remval&mask))); - -// if(bits > 32){ -// (*blndcnt_msb) = _mm256_adds_epi8((*blndcnt_msb), _mm256_blendv_epi8((*ma), (*mb),movemask_inverse((val>>32)&mask))); -// blendVal |= ((uint64_t)_mm256_movemask_epi8((*blndcnt_msb)))<<32; - -// (*blndcnt_msb) = _mm256_adds_epi8((*blndcnt_msb),_mm256_blendv_epi8((*mb), (*ma), movemask_inverse((remval>>32)&mask))); -// } - -// return blendVal; -// } - -// void ri_sketch_blend_min(void *km, const float* s_values, uint32_t id, int strand, int len, int w, int e, int n, int q, int lq, int k, mm128_v *p){ -// } - -// void ri_sketch_blend(void *km, const float* s_values, uint32_t id, int strand, int len, int e, int n, int q, int lq, int k, mm128_v *p){ - -// uint64_t blendVal = 0; -// int blend_pos = 0; -// bool buffull = false; -// const uint64_t blendmask = (1ULL<<28)-1; - -// mm128_t blndBuf[n]; -// memset(blndBuf, 0, n*sizeof(mm128_t)); - -// //SIMD-related variables -// __m256i ma = _mm256_set1_epi8(1); -// __m256i mb = _mm256_set1_epi8(-1); -// __m256i blndcnt_lsb = _mm256_set1_epi8(0); -// __m256i blndcnt_msb = _mm256_set1_epi8(0); - -// int step = 1;//TODO: make this an argument -// uint32_t span = (k+e-1)*step; //for now single event is considered to span 6 bases. - -// const uint32_t quant_bit = lq+2; -// const uint32_t shift_r = 32-q; -// const uint64_t id_shift = (uint64_t)id<n + len/step); - -// mm128_t sigBuf[e]; -// memset(sigBuf, 0, e*sizeof(mm128_t)); - -// for (i = 0; i < len; i += step) { -// if(i > 0 && fabs(s_values[i] - s_values[l_sigpos]) < LAST_SIG_DIFF) continue; - -// l_sigpos = i; -// signal = *((uint32_t*)&s_values[i]); -// tmpQuantSignal = signal>>30<>shift_r)&mask_l_quant); - -// mm128_t info = { UINT64_MAX, UINT64_MAX }; - -// quantVal = (quantVal<= fine_min && signal <= fine_max) { + // Within [fine_min, fine_max], map to a sub-range [a, b] in [0, 1], + //then scale to [0, fine_range] for finer granularity + quantized = fine_range * ((normalized - a) / (b - a)); + } + else { + // Outside [fine_min, fine_max], split the rest of [0, 1] into two and map accordingly + if (normalized < 0.5) quantized = fine_range + coarse_coef1 * normalized; // Coarser granularity + else quantized = coarse_coef2 + coarse_coef1 * normalized; // Coarser granularity + } -// if(buffull){ -// blendVal = calc_blend_rm_simd(&blndcnt_lsb, &blndcnt_msb, &ma, &mb, hashVal, blndBuf[blend_pos].x, blendmask, 32); -// info.x = blendVal< 0 && (w > 0 && w < 256) && e*quant_bit <= 64); +void ri_sketch_min(void *km, + const float* s_values, + uint32_t id, + int strand, + int len, + float diff, + int w, + int e, + uint32_t quant_bit, + int k, + float fine_min, + float fine_max, + float fine_range, + mm128_v *p) +{ + assert(len > 0 && (w > 0 && w < 256) && e*quant_bit <= (64-RI_HASH_SHIFT)); int j, buf_pos, min_pos; mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX }; - // int step = 1;//TODO: make this an argument - // uint32_t span = (k+e-1)*step; uint32_t span = k+e-1; - const uint32_t shift_r = 32-q; - const uint64_t id_shift = (uint64_t)id<n + (len/step)/w); rh_kv_resize(mm128_t, km, *p, p->n + len/w); int sigBufFull = 0; - uint32_t i, l, sigBufPos = 0; + uint32_t f_pos, l, sigBufPos = 0; uint32_t l_sigpos = 0; //last signal position - uint32_t signal = 0, tmpQuantSignal = 0; + uint32_t f_tmpQuantSignal = 0; uint64_t quantVal = 0; mm128_t sigBuf[e]; memset(sigBuf, 0, e*sizeof(mm128_t)); - // for (i = l = buf_pos = min_pos = 0; i < len; i += step) { - for (i = l = buf_pos = min_pos = 0; i < len; ++i) { - if(i > 0 && fabs(s_values[i] - s_values[l_sigpos]) < diff) continue; + for (f_pos = l = buf_pos = min_pos = 0; f_pos < len; ++f_pos) { + if(f_pos > 0 && fabs(s_values[f_pos] - s_values[l_sigpos]) < diff) continue; l++; mm128_t info = { UINT64_MAX, UINT64_MAX }; - l_sigpos = i; - signal = *((uint32_t*)&s_values[i]); - tmpQuantSignal = signal>>30<>shift_r)&mask_l_quant); + l_sigpos = f_pos; - quantVal = (quantVal< 0 && e*quant_bit <= 64); +void ri_sketch_reg(void *km, + const float* s_values, + uint32_t id, + int strand, + int len, + float diff, + int e, + uint32_t quant_bit, + int k, + float fine_min, + float fine_max, + float fine_range, + mm128_v *p){ + + assert(len > 0 && (uint32_t)e*quant_bit <= 64); - // int step = 1;//TODO: make this an argument - // uint32_t span = (k+e-1)*step; uint32_t span = k+e-1; - uint32_t shift_r = 32-q; - const uint64_t id_shift = (uint64_t)id< 0 && e*quant_bit <= 64); - - // rh_kv_resize(mm128_t, km, *p, p->n + len/step); rh_kv_resize(mm128_t, km, *p, p->n + len); mm128_t sigBuf[e]; memset(sigBuf, 0, e*sizeof(mm128_t)); - // for (i = 0; i < len; i += step) { - for (i = 0; i < len; ++i) { - if((i > 0 && fabs(s_values[i] - s_values[l_sigpos]) < diff)) continue; + for (f_pos = 0; f_pos < len; ++f_pos) { + if((f_pos > 0 && fabs(s_values[f_pos] - s_values[l_sigpos]) < diff)) continue; + + l_sigpos = f_pos; - l_sigpos = i; - signal = *((uint32_t*)&s_values[i]); - tmpQuantSignal = signal>>30<>shift_r)&mask_l_quant); + f_tmpQuantSignal = dynamic_quantize(s_values[f_pos], fine_min, fine_max, fine_range, n_buckets)&mask_quant_bit; - sigBuf[sigBufPos].y = id_shift | (uint32_t)i<>quant_bit<>quant_bit< //for memset #include "rutils.h" -//A large float value can be used for masking if needed -// #define RI_MASK_SIGNAL 3.402823466e+32F - -// #define LAST_SIG_DIFF 0.3F - //To store sketches in vectors #define RI_HASH_SHIFT 6 #define RI_ID_SHIFT 32 @@ -43,7 +38,21 @@ extern "C" { * and strand indicates whether the seed comes from the top or the bottom $strand. * Callers may want to set "p->n = 0"; otherwise results are appended to p */ -void ri_sketch(void *km, const float* s_values, uint32_t id, int strand, int len, float diff, int w, int e, int n, int q, int lq, int k, mm128_v *p); +void ri_sketch(void *km, + const float* s_values, + uint32_t id, + int strand, + int len, + float diff, + int w, + int e, + int n, + uint32_t quant_bit, + int k, + float fine_min, + float fine_max, + float fine_range, + mm128_v *p); #ifdef __cplusplus } diff --git a/src/rutils.c b/src/rutils.c index 9bfe7db..a64bf71 100644 --- a/src/rutils.c +++ b/src/rutils.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -59,14 +60,54 @@ char* strsep(char** stringp, const char* delim) { return start; } -void load_pore(const char* fpore, const short k, const short lev_col, float** pore_vals){ +uint32_t rev_complement(uint32_t x, const short k) { + uint32_t y = 0; + for (short i = 0; i < k; i++) { + y = (y<<2) | ((x&3)^3); + x >>= 2; + } + return y; +} + +int compare_value_index_pairs(const void* a, const void* b) { + float diff = ((ri_porei_t*)a)->pore_val - ((ri_porei_t*)b)->pore_val; + return (diff < 0)?-1:(diff > 0); +} + +ri_porei_t* create_sorted_pairs(const ri_pore_t* pore) { + double sum = 0, sum2 = 0, mean, std_dev; + + // First pass: Calculate mean and standard deviation + for (uint32_t i = 0; i < pore->n_pore_vals; i++) { + sum += pore->pore_vals[i]; + sum2 += pore->pore_vals[i] * pore->pore_vals[i]; + } + mean = sum / pore->n_pore_vals; + std_dev = sqrt(sum2 / pore->n_pore_vals - mean * mean); + + // Allocate memory for pairs + ri_porei_t* pairs = (ri_porei_t*)malloc(pore->n_pore_vals * sizeof(ri_porei_t)); + + // Second pass: Normalize values and create pairs + for (uint32_t i = 0; i < pore->n_pore_vals; i++) { + pairs[i].pore_val = (pore->pore_vals[i] - mean) / std_dev; // Normalization + pairs[i].ind = i; + pairs[i].rev_ind = rev_complement(i, pore->k); + } + + // Sort pairs based on normalized pore_val + qsort(pairs, pore->n_pore_vals, sizeof(ri_porei_t), compare_value_index_pairs); + return pairs; +} + +void load_pore(const char* fpore, const short k, const short lev_col, ri_pore_t* pore){ FILE* fp = fopen(fpore, "r"); if(fp == NULL){ fprintf(stderr, "Error: cannot open file %s\n", fpore); return; } - (*pore_vals) = (float*)malloc(sizeof(float) * (1U<<(2*k))); + pore->pore_vals = (float*)malloc(sizeof(float) * (1U<<(2*k))); char line[1024]; char* token; int i = 0; @@ -78,10 +119,10 @@ void load_pore(const char* fpore, const short k, const short lev_col, float** po if(j++ == lev_col){ float value; if (sscanf(token, "%f", &value) == 1) { - (*pore_vals)[i] = value; + pore->pore_vals[i] = value; } else { fprintf(stderr, "Error: cannot convert '%s' to float\n", token); - free(pore_vals); pore_vals = NULL; + free(pore->pore_vals); pore->pore_vals = NULL; fclose(fp); return; } @@ -91,6 +132,10 @@ void load_pore(const char* fpore, const short k, const short lev_col, float** po i++; } fclose(fp); + + pore->n_pore_vals = 1U<<(2*k); + pore->k = k; + pore->pore_inds = create_sorted_pairs(pore); } // #define sort_key_128x(a) ((a).x) diff --git a/src/rutils.h b/src/rutils.h index 46f0b7d..0b66fad 100644 --- a/src/rutils.h +++ b/src/rutils.h @@ -22,6 +22,8 @@ typedef struct { uint32_t x, y; } mm64_t; typedef struct { size_t n, m; mm128_t *a; } mm128_v; typedef struct { size_t n, m; mm64_t *a; } mm64_v; typedef struct { size_t n, m; char **a; } ri_char_v; +typedef struct ri_porei_s{float pore_val; unsigned int ind; unsigned int rev_ind;} ri_porei_t; +typedef struct ri_pore_s{ri_porei_t* pore_inds; float* pore_vals; unsigned int n_pore_vals; short k; float max_val, min_val;} ri_pore_t; void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); @@ -32,7 +34,7 @@ double ri_realtime(void); double ri_cputime(void); long ri_peakrss(void); -void load_pore(const char* fpore, const short k, const short lev_col, float** pore_vals); +void load_pore(const char* fpore, const short k, const short lev_col, ri_pore_t* pore); #ifdef __cplusplus } diff --git a/test/README.md b/test/README.md index 1d15650..45514eb 100644 --- a/test/README.md +++ b/test/README.md @@ -6,11 +6,11 @@ We compare RawHash2 with [UNCALLED](https://github.com/skovaka/UNCALLED), [Sigma We list the links to download and compile each tool for comparison below: -* [UNCALLED](https://github.com/skovaka/UNCALLED/tree/74a5d4e5b5d02fb31d6e88926e8a0896dc3475cb) -* [Sigmap](https://github.com/haowenz/sigmap/commit/c9a40483264c9514587a36555b5af48d3f054f6f) +* [UNCALLED v2.3](https://github.com/skovaka/UNCALLED/tree/58dbec69f625e0343739d821788d536b578ea41d) +* [Sigmap v0.1](https://github.com/haowenz/sigmap/tree/c9a40483264c9514587a36555b5af48d3f054f6f) * [RawHash v1](https://github.com/CMU-SAFARI/RawHash/releases/tag/v1.0) -We use minimap2 to generate the ground truth mapping information by mapping basecalled seqeunces to their corresponding reference genomes. We use the following minimap2 version: +We use minimap2 (v2.24) to generate the ground truth mapping information by mapping basecalled seqeunces to their corresponding reference genomes. We use the following minimap2 version: * [Minimap2 v2.24](https://github.com/lh3/minimap2/releases/tag/v2.24) @@ -55,13 +55,13 @@ git clone --recursive https://github.com/CMU-SAFARI/RawHash.git rawhash2 && cd r #Cloning and compiling RawHash wget -qO- https://github.com/CMU-SAFARI/RawHash/releases/download/v1.0/RawHash-1.0.tar.gz | tar -xzv && cd rawhash && make && cp ./bin/rawhash ../bin/ && cd .. -#Cloning and compiling Sigmap -git clone --recursive https://github.com/haowenz/sigmap.git sigmap && cd sigmap && make && cp sigmap ../bin/ && cd .. +#Cloning and compiling Sigmap v0.1 commit c9a40483264c9514587a36555b5af48d3f054f6f +git clone --recursive https://github.com/haowenz/sigmap.git sigmap && cd sigmap && git checkout c9a40483264c9514587a36555b5af48d3f054f6f && make && cp sigmap ../bin/ && cd .. -#Cloning and compiling UNCALLED -git clone --recursive https://github.com/skovaka/UNCALLED.git uncalled && cd uncalled/submods/bwa && git pull origin master && cd ../../ && pip3 install . && cd .. +#Cloning and compiling UNCALLED v2.1 commit 58dbec69f625e0343739d821788d536b578ea41d +git clone --recursive https://github.com/skovaka/UNCALLED.git uncalled && cd uncalled && git checkout 58dbec69f625e0343739d821788d536b578ea41d && cd submods/bwa && git pull origin master && cd ../../ && pip3 install . && cd .. -#Downloading and compiling minimap2 +#Downloading and compiling minimap2 v2.24 wget https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24.tar.bz2; tar -xf minimap2-2.24.tar.bz2; rm minimap2-2.24.tar.bz2; mv minimap2-2.24 minimap2; cd minimap2 && make && cp minimap2 ../bin/ && cd .. #Step 2 Adding binaries to PATH diff --git a/test/evaluation/contamination/run_rawhash2.sh b/test/evaluation/contamination/run_rawhash2.sh index f835033..766eb4a 100644 --- a/test/evaluation/contamination/run_rawhash2.sh +++ b/test/evaluation/contamination/run_rawhash2.sh @@ -12,10 +12,9 @@ mkdir -p ${OUTDIR} #Viral preset (default for viral genomes) PREFIX="contamination" -PARAMS="--min-mapq 5 --min-score 20 --chain-skip-scale 0.3 --best-chains 5" -bash ../../scripts/run_rawhash2.sh ${OUTDIR} ${PREFIX} ${FAST5} ${REF} ${PORE} ${PRESET} ${THREAD} "${PARAMS}" > "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.out" 2> "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.err" +bash ../../scripts/run_rawhash2.sh ${OUTDIR} ${PREFIX} ${FAST5} ${REF} ${PORE} ${PRESET} ${THREAD} > "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.out" 2> "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.err" #Minimizers PREFIX="contamination_w3" -PARAMS="-w 3 --min-mapq 5 --min-score 20 --chain-skip-scale 0.3 --best-chains 5" +PARAMS="-w 3" bash ../../scripts/run_rawhash2.sh ${OUTDIR} ${PREFIX} ${FAST5} ${REF} ${PORE} ${PRESET} ${THREAD} "${PARAMS}" > "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.out" 2> "${OUTDIR}/${PREFIX}_rawhash2_${PRESET}.err" diff --git a/test/evaluation/relative_abundance/run_rawhash.sh b/test/evaluation/relative_abundance/run_rawhash.sh index c3617eb..7480aed 100644 --- a/test/evaluation/relative_abundance/run_rawhash.sh +++ b/test/evaluation/relative_abundance/run_rawhash.sh @@ -4,8 +4,8 @@ THREAD=$1 #relative_abundance OUTDIR="./rawhash/" -# FAST5="../../data/relative_abundance/fast5_files/" -FAST5="../../data/relative_abundance/test_fast5_files/" +FAST5="../../data/relative_abundance/fast5_files/" +# FAST5="../../data/relative_abundance/test_fast5_files/" REF="../../data/relative_abundance/ref.fa" PORE="../../../extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model" PRESET="fast" diff --git a/test/evaluation/relative_abundance/run_rawhash2.sh b/test/evaluation/relative_abundance/run_rawhash2.sh index 70070d0..5871779 100644 --- a/test/evaluation/relative_abundance/run_rawhash2.sh +++ b/test/evaluation/relative_abundance/run_rawhash2.sh @@ -4,8 +4,8 @@ THREAD=$1 #relative_abundance OUTDIR="./rawhash2/" -# FAST5="../../data/relative_abundance/fast5_files/" -FAST5="../../data/relative_abundance/test_fast5_files/" +FAST5="../../data/relative_abundance/fast5_files/" +# FAST5="../../data/relative_abundance/test_fast5_files/" REF="../../data/relative_abundance/ref.fa" PORE="../../../extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model" PRESET="fast" diff --git a/test/evaluation/relative_abundance/run_sigmap.sh b/test/evaluation/relative_abundance/run_sigmap.sh index e091aca..94146c3 100644 --- a/test/evaluation/relative_abundance/run_sigmap.sh +++ b/test/evaluation/relative_abundance/run_sigmap.sh @@ -4,8 +4,8 @@ THREAD=$1 #relative_abundance OUTDIR="./sigmap/" -# FAST5="../../data/relative_abundance/fast5_files/" -FAST5="../../data/relative_abundance/test_fast5_files/" +FAST5="../../data/relative_abundance/fast5_files/" +# FAST5="../../data/relative_abundance/test_fast5_files/" REF="../../data/relative_abundance/ref.fa" PORE="../../../extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model" PREFIX="relative_abundance" diff --git a/test/evaluation/relative_abundance/run_uncalled.sh b/test/evaluation/relative_abundance/run_uncalled.sh index 0fe6f78..a375ae0 100644 --- a/test/evaluation/relative_abundance/run_uncalled.sh +++ b/test/evaluation/relative_abundance/run_uncalled.sh @@ -4,8 +4,8 @@ THREAD=$1 #relative_abundance OUTDIR="./uncalled/" -# FAST5="../../data/relative_abundance/fast5_files/" -FAST5="../../data/relative_abundance/test_fast5_files/" +FAST5="../../data/relative_abundance/fast5_files/" +# FAST5="../../data/relative_abundance/test_fast5_files/" REF="../../data/relative_abundance/ref.fa" PREFIX="relative_abundance" mkdir -p ${OUTDIR} diff --git a/test/scripts/run_rawhash.sh b/test/scripts/run_rawhash.sh index f8224f0..807ab87 100644 --- a/test/scripts/run_rawhash.sh +++ b/test/scripts/run_rawhash.sh @@ -11,5 +11,5 @@ PRESETX=$6 #Default preset of rawhash for the run (e.g., viral) THREAD=$7 #Number of threads to use PARAMS=$8 #(optional -- you can keep it empty) custom parameters to set on top of the default parameters -# /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_rawhash_index_${PRESETX}.time" rawhash -x ${PRESETX} -t ${THREAD} ${PARAMS} -p "${PORE}" -d "${OUTDIR}/${PREFIX}_rawhash_${PRESETX}.ind" ${REF} +/usr/bin/time -vpo "${OUTDIR}/${PREFIX}_rawhash_index_${PRESETX}.time" rawhash -x ${PRESETX} -t ${THREAD} ${PARAMS} -p "${PORE}" -d "${OUTDIR}/${PREFIX}_rawhash_${PRESETX}.ind" ${REF} /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_rawhash_map_${PRESETX}.time" rawhash -x ${PRESETX} -t ${THREAD} ${PARAMS} -o "${OUTDIR}/${PREFIX}_rawhash_${PRESETX}.paf" "${OUTDIR}/${PREFIX}_rawhash_${PRESETX}.ind" ${SIGNALS} diff --git a/test/scripts/run_sigmap.sh b/test/scripts/run_sigmap.sh index 7f03fb4..4406072 100644 --- a/test/scripts/run_sigmap.sh +++ b/test/scripts/run_sigmap.sh @@ -10,5 +10,5 @@ PORE=$5 #Path to the k-mer model file THREAD=$6 #Number of threads to use PARAMS=$7 #(optional -- you can keep it empty) custom parameters to set on top of the default parameters -# /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_sigmap_index.time" sigmap -i -p "${PORE}" -o "${OUTDIR}/${PREFIX}_sigmap.ind" -r ${REF} ${PARAMS} +/usr/bin/time -vpo "${OUTDIR}/${PREFIX}_sigmap_index.time" sigmap -i -p "${PORE}" -o "${OUTDIR}/${PREFIX}_sigmap.ind" -r ${REF} ${PARAMS} /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_sigmap_map.time" sigmap -m -t ${THREAD} -r ${REF} -p "${PORE}" -o "${OUTDIR}/${PREFIX}_sigmap.paf" -x "${OUTDIR}/${PREFIX}_sigmap.ind" -s ${SIGNALS} ${PARAMS} diff --git a/test/scripts/run_uncalled.sh b/test/scripts/run_uncalled.sh index d2a37b0..0b4328f 100644 --- a/test/scripts/run_uncalled.sh +++ b/test/scripts/run_uncalled.sh @@ -9,5 +9,5 @@ REF=$4 #Path to the reference genome THREAD=$5 #Number of threads to use PARAMS=$6 #(optional -- you can keep it empty) custom parameters to set on top of the default parameters -# /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_uncalled_index.time" uncalled index ${PARAMS} -o "${OUTDIR}/${PREFIX}_uncalled.ind" ${REF} +/usr/bin/time -vpo "${OUTDIR}/${PREFIX}_uncalled_index.time" uncalled index ${PARAMS} -o "${OUTDIR}/${PREFIX}_uncalled.ind" ${REF} /usr/bin/time -vpo "${OUTDIR}/${PREFIX}_uncalled_map.time" uncalled map -t ${THREAD} ${PARAMS} "${OUTDIR}/${PREFIX}_uncalled.ind" ${SIGNALS} > "${OUTDIR}/${PREFIX}_uncalled.paf"