From 4a28493f7079c832b415be905e956cec844be784 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 13 Apr 2020 06:57:09 -0700 Subject: [PATCH 01/43] start making gap length configurable --- dozeu.h | 61 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/dozeu.h b/dozeu.h index b61638c..38e9dbc 100644 --- a/dozeu.h +++ b/dozeu.h @@ -219,7 +219,12 @@ struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t _p struct dz_mem_s { struct dz_mem_block_s blk; struct dz_stack_s stack; }; #define dz_mem_stack_rem(_mem) ( (size_t)((_mem)->stack.end - (_mem)->stack.top) ) -struct dz_s { int8_t matrix[32]; uint16_t giv[8], gev[8], xt, bonus, max_gap_len, _pad[9]; struct dz_forefront_s const *root; int8_t protein_matrix[]; }; +struct dz_s { + int8_t matrix[32]; + uint16_t giv[8], gev[8], riv[8], rev[8], xt, bonus, _pad[6]; // padding ensures memory alignment + //struct dz_forefront_s const *root; + int8_t protein_matrix[]; +}; dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); #define dz_mem(_self) ( (struct dz_mem_s *)(_self) - 1 ) @@ -301,7 +306,7 @@ static __dz_force_inline struct dz_mem_s *dz_mem_init( size_t size) { - size = dz_min2(DZ_MEM_INIT_SIZE, sizeof(struct dz_mem_s) + dz_roundup(size, DZ_MEM_ALIGN_SIZE)); + size = dz_max2(DZ_MEM_INIT_SIZE, sizeof(struct dz_mem_s) + dz_roundup(size, DZ_MEM_ALIGN_SIZE)); struct dz_mem_s *mem = (struct dz_mem_s *)dz_malloc(size); if(mem == NULL) { debug("failed to malloc memory"); @@ -361,6 +366,7 @@ void *dz_mem_malloc( struct dz_mem_s *mem, size_t size) { + // TODO: this doesn't seem to check to make sure the size allocated is big enough... if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } void *ptr = (void *)mem->stack.top; mem->stack.top += dz_roundup(size, sizeof(__m128i)); @@ -552,7 +558,7 @@ struct dz_s *dz_init_intl( int8_t const *score_matrix, /* match award in positive, mismatch penalty in negative. s(A,A) at [0], s(A,C) at [1], ... 
s(T,T) at [15] where s(ref_base, query_base) is a score function */ uint16_t gap_open, /* gap penalties in positive */ uint16_t gap_extend, - uint64_t max_gap_len, /* as X-drop threshold */ + //uint64_t max_gap_len, /* as X-drop threshold */ uint16_t full_length_bonus) /* end-to-end mapping bonus; only activated when compiled with -DDZ_FULL_LENGTH_BONUS */ { size_t const L = sizeof(__m128i) / sizeof(uint16_t), gi = gap_open, ge = gap_extend; @@ -595,31 +601,48 @@ struct dz_s *dz_init_intl( _mm_store_si128((__m128i *)self->gev, gev); self->xt = gi + ge * max_gap_len; /* X-drop threshold */ self->bonus = full_length_bonus; - self->max_gap_len = max_gap_len; /* save raw value */ - debug("gi(%u), ge(%u), xdrop_threshold(%u), full_length_bonus(%u), max_gap_len(%lu)", gi, ge, self->xt, self->bonus, self->max_gap_len); + //self->max_gap_len = max_gap_len; /* save raw value */ + debug("gi(%u), ge(%u), xdrop_threshold(%u), full_length_bonus(%u)", gi, ge, self->xt, self->bonus); /* create root head */ struct dz_cap_s *cap = (struct dz_cap_s *)dz_mem_malloc(mem, sizeof(struct dz_cap_s)); _mm_store_si128((__m128i *)cap, _mm_setzero_si128()); - - /* calc vector length; query = NULL for the first (root) column */ - max_gap_len = dz_roundup(max_gap_len, L); - struct dz_forefront_s w = { { 0, (uint32_t)(max_gap_len / L) }, 0, 0, 0, 0, 0, 0, NULL, NULL }; - struct dz_forefront_s *a = &w; - - /* malloc the first column */ - struct dz_swgv_s *dp = _begin_column_head(0, max_gap_len / L, 0, &a, 0); - - /* fill the root (the leftmost) column; first init vectors */ - __m128i s = _mm_setr_epi16(0, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); + + /* initialize and memoize the vectors we need to compute the root */ + __m128i riv = _mm_setr_epi16(0, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); + _mm_store_si128((__m128i *)this->riv, riv); + __m128i rev = _mm_setr_epi16(-gi, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); + _mm_store_si128((__m128i *) this->rev, rev); + + // TODO: moving this part down to dz_root + + /* fill the root (the leftmost) column; first init vectors */ + + /* calc vector length; query = NULL for the first (root) column */ + max_gap_len = dz_roundup(max_gap_len, L); + + struct dz_forefront_s w = { { 0, (uint32_t)(max_gap_len / L) }, 0, 0, 0, 0, 0, 0, NULL, NULL }; + struct dz_forefront_s *a = &w; + + /* malloc the first column */ + struct dz_swgv_s *dp = _begin_column_head(0, max_gap_len / L, 0, &a, 0); + + /* e, f, and s needed for _store_vector */ __m128i const e = _mm_set1_epi16(DZ_CELL_MIN), xtv = _mm_set1_epi16(-self->xt); /* until the X-drop test fails on all the cells in a vector */ + __m128i s = self->riv; for(size_t p = 0; p < max_gap_len / L; p++) { __m128i const f = s; - if(dz_unlikely(_test_xdrop(s, xtv))) { debug("p(%lu)", p); w.r.epos = p; break; } + if (dz_unlikely(_test_xdrop(s, xtv))) { + debug("p(%lu)", p); + w.r.epos = p; + break; + } _store_vector(&dp[p]); - if(p == 0) { s = _mm_setr_epi16(-gi, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); } + if (p == 0) { + s = self->rev; + } s = _mm_subs_epi16(s, _mm_slli_epi16(gev, 3)); } @@ -1125,7 +1148,7 @@ struct dz_forefront_s const *dz_extend_intl( __m128i const gev4 = _mm_slli_epi16(gev1, 2); __m128i const gev8 = _mm_slli_epi16(gev1, 3); - struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, 0, 0, 0, 0, NULL, NULL }; /* uint32_t spos, epos, max, inc; struct dz_query_s const *query; 
struct dz_cap_s const *cap; */ \ + struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, 0, 0, 0, 0, NULL, NULL }; /* uint32_t spos, epos, max, inc; struct dz_query_s const *query; struct dz_cap_s const *cap; */ w.rlen = rlen; w.rid = rid; w.query = query; From bccbe0b3241f150284ffe282bad12f4342b8ff99 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 13 Apr 2020 21:19:49 -0700 Subject: [PATCH 02/43] make max gap len configurable for each read --- dozeu.h | 242 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 152 insertions(+), 90 deletions(-) diff --git a/dozeu.h b/dozeu.h index 38e9dbc..025d719 100644 --- a/dozeu.h +++ b/dozeu.h @@ -186,6 +186,11 @@ struct dz_forefront_s { struct dz_query_s const *query; struct dz_cap_s const *mcap; }; +struct dz_alignment_init_s { + struct dz_forefront_s const *root; + uint16_t xt; +}; + dz_static_assert(sizeof(struct dz_swgv_s) % sizeof(__m128i) == 0); dz_static_assert(sizeof(struct dz_cap_s) % sizeof(__m128i) == 0); dz_static_assert(sizeof(struct dz_forefront_s) % sizeof(__m128i) == 0); @@ -221,14 +226,13 @@ struct dz_mem_s { struct dz_mem_block_s blk; struct dz_stack_s stack; }; struct dz_s { int8_t matrix[32]; - uint16_t giv[8], gev[8], riv[8], rev[8], xt, bonus, _pad[6]; // padding ensures memory alignment - //struct dz_forefront_s const *root; + uint16_t giv[8], gev[8], riv[8], rev[8], bonus, _pad[7]; // padding ensures memory alignment int8_t protein_matrix[]; }; dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); #define dz_mem(_self) ( (struct dz_mem_s *)(_self) - 1 ) -#define dz_root(_self) ( (struct dz_forefront_s const **)(&_self->root) ) +//#define dz_root(_self) ( (struct dz_forefront_s const **)(&_self->root) ) #define dz_is_terminated(_ff) ( dz_cff(_ff)->r.spos >= dz_cff(_ff)->r.epos ) #define dz_gt(_ff) ( dz_cff(_ff)->inc > 0 ) #define dz_geq(_ff) ( dz_cff(_ff)->mcap != NULL ) @@ -299,7 +303,9 @@ void dz_mem_flush( { mem->stack.curr = &mem->blk; mem->stack.top = (uint8_t *)dz_roundup((uintptr_t)(mem + 1), DZ_MEM_ALIGN_SIZE); - mem->stack.end = (uint8_t *)mem + mem->blk.size - DZ_MEM_MARGIN_SIZE; + // TODO: i think this margin is redundant, since dz_malloc adds the margin past the end + // of the requested size + mem->stack.end = (uint8_t *)mem + mem->blk.size;// - DZ_MEM_MARGIN_SIZE; return; } static __dz_force_inline @@ -338,7 +344,18 @@ uint64_t dz_mem_add_stack( size_t size) { debug("add_stack, ptr(%p)", mem->stack.curr->next); - if(mem->stack.curr->next == NULL) { + if(mem->stack.curr->next == NULL + || dz_unlikely(mem->stack.curr->next->size < size + dz_roundup(sizeof(struct dz_mem_block_s), DZ_MEM_ALIGN_SIZE))) { + if (mem->stack.curr->next != NULL) { + /* didn't allocate enough memory earlier, throw out all subsequent blocks so we can start again bigger */ + struct dz_mem_block_s *blk = mem->stack.curr->next; + while (blk != NULL) { + struct dz_mem_block_s *next = blk->next; + dz_free(blk); + blk = next; + } + mem->stack.curr->next = NULL; + } /* current stack is the forefront of the memory block chain, add new block */ size = dz_max2( size + dz_roundup(sizeof(struct dz_mem_block_s), DZ_MEM_ALIGN_SIZE), @@ -367,7 +384,10 @@ void *dz_mem_malloc( size_t size) { // TODO: this doesn't seem to check to make sure the size allocated is big enough... - if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } + // also, where does 4096 come from? + // also, don't we need to provide the size to ensure that a large enough block is allocated? 
+ //if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } + if(dz_mem_stack_rem(mem) < size) { dz_mem_add_stack(mem, size); } void *ptr = (void *)mem->stack.top; mem->stack.top += dz_roundup(size, sizeof(__m128i)); return(ptr); @@ -412,8 +432,8 @@ unittest() { #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ - /* allocate from heap */ \ - if(dz_mem_stack_rem(dz_mem(self)) < next_req) { dz_mem_add_stack(dz_mem(self), 0); } \ + /* allocate from heap TODO: shouldn't we make sure we get enough allocation? */ \ + if(dz_mem_stack_rem(dz_mem(self)) < next_req) { dz_mem_add_stack(dz_mem(self), next_req); } /* 0); }*/ \ /* push head-cap */ \ struct dz_cap_s *cap = _init_cap(_adj, 0xff, _forefronts, _n_forefronts); \ /* return array pointer */ \ @@ -428,7 +448,8 @@ unittest() { cap->rch = (_rch); /* record rch for the next column */ \ cap->rrem = (_rlen); /* record rlen for use in traceback */ \ if(dz_likely(dz_mem_stack_rem(dz_mem(self)) < next_req)) { \ - dz_mem_add_stack(dz_mem(self), 0); \ + /* TODO: editing to make sure we ask for enough memory */ \ + dz_mem_add_stack(dz_mem(self), next_req); /* 0); }*/ \ cap = _init_cap(0, _rch, &cap, 1); \ } \ debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).r.spos, (_w).r.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ @@ -561,7 +582,6 @@ struct dz_s *dz_init_intl( //uint64_t max_gap_len, /* as X-drop threshold */ uint16_t full_length_bonus) /* end-to-end mapping bonus; only activated when compiled with -DDZ_FULL_LENGTH_BONUS */ { - size_t const L = sizeof(__m128i) / sizeof(uint16_t), gi = gap_open, ge = gap_extend; /* static uint8_t const transpose[16] __attribute__(( aligned(16) )) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 @@ -595,62 +615,97 @@ struct dz_s *dz_init_intl( } #endif - __m128i const giv = _mm_set1_epi16(gi); - __m128i const gev = _mm_set1_epi16(ge); + __m128i const giv = _mm_set1_epi16(gap_open); + __m128i const gev = _mm_set1_epi16(gap_extend); _mm_store_si128((__m128i *)self->giv, giv); _mm_store_si128((__m128i *)self->gev, gev); - self->xt = gi + ge * max_gap_len; /* X-drop threshold */ + // TODO: see if i can remove this to cut down on state + //self->xt = 0; /* X-drop threshold */ self->bonus = full_length_bonus; //self->max_gap_len = max_gap_len; /* save raw value */ - debug("gi(%u), ge(%u), xdrop_threshold(%u), full_length_bonus(%u)", gi, ge, self->xt, self->bonus); + debug("gi(%u), ge(%u), full_length_bonus(%u)", gap_open, gap_extend, self->bonus); - /* create root head */ - struct dz_cap_s *cap = (struct dz_cap_s *)dz_mem_malloc(mem, sizeof(struct dz_cap_s)); - _mm_store_si128((__m128i *)cap, _mm_setzero_si128()); + // TODO: i'm pretty sure this cap is extraneous and never used + ///* create root head */ + //struct dz_cap_s *cap = (struct dz_cap_s *)dz_mem_malloc(mem, sizeof(struct dz_cap_s)); + //_mm_store_si128((__m128i *)cap, _mm_setzero_si128()); /* initialize and memoize the vectors we need to compute the root */ - __m128i riv = _mm_setr_epi16(0, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); - _mm_store_si128((__m128i *)this->riv, riv); - __m128i rev = _mm_setr_epi16(-gi, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); - _mm_store_si128((__m128i *) this->rev, rev); + __m128i riv = _mm_setr_epi16( + 0, + -(gap_open+gap_extend), + -(gap_open+2*gap_extend), + 
-(gap_open+3*gap_extend), + -(gap_open+4*gap_extend), + -(gap_open+5*gap_extend), + -(gap_open+6*gap_extend), + -(gap_open+7*gap_extend) + ); + _mm_store_si128((__m128i *)self->riv, riv); + __m128i rev = _mm_setr_epi16( + -gap_open, + -(gap_open+gap_extend), + -(gap_open+2*gap_extend), + -(gap_open+3*gap_extend), + -(gap_open+4*gap_extend), + -(gap_open+5*gap_extend), + -(gap_open+6*gap_extend), + -(gap_open+7*gap_extend) + ); + _mm_store_si128((__m128i *)self->rev, rev); - // TODO: moving this part down to dz_root + // precompute the end-cap, which will always live next to the dz_s in this mem block + struct dz_cap_s *cap = _init_cap(0, 0xff, NULL, 0); + return(self); +} +static __dz_vectorize +struct dz_alignment_init_s dz_align_init( + struct dz_s *self, + uint32_t max_gap_len) +{ /* fill the root (the leftmost) column; first init vectors */ - /* calc vector length; query = NULL for the first (root) column */ - max_gap_len = dz_roundup(max_gap_len, L); + /* calc vector length */ + size_t const L = sizeof(__m128i) / sizeof(uint16_t); + size_t max_gap_vec_len = dz_roundup(max_gap_len, L) / L; - struct dz_forefront_s w = { { 0, (uint32_t)(max_gap_len / L) }, 0, 0, 0, 0, 0, 0, NULL, NULL }; + struct dz_forefront_s w = { { 0, (uint32_t) max_gap_vec_len}, 0, 0, 0, 0, 0, 0, NULL, NULL }; struct dz_forefront_s *a = &w; /* malloc the first column */ - struct dz_swgv_s *dp = _begin_column_head(0, max_gap_len / L, 0, &a, 0); + struct dz_swgv_s *dp = _begin_column_head(0, max_gap_vec_len, 0, &a, 0); + + uint16_t xt = self->giv[0] + self->gev[0] * max_gap_len; + // calculate the x-drop threshold for this gap length + __m128i xtv = _mm_set1_epi16(-xt); /* e, f, and s needed for _store_vector */ - __m128i const e = _mm_set1_epi16(DZ_CELL_MIN), xtv = _mm_set1_epi16(-self->xt); - - /* until the X-drop test fails on all the cells in a vector */ - __m128i s = self->riv; - for(size_t p = 0; p < max_gap_len / L; p++) { - __m128i const f = s; - if (dz_unlikely(_test_xdrop(s, xtv))) { + __m128i const e = _mm_set1_epi16(DZ_CELL_MIN); + /* until the X-drop test fails on all the cells in a vector */ + __m128i s = _mm_load_si128((__m128i const *)self->riv); + for(size_t p = 0; p < max_gap_vec_len; p++) { + __m128i const f = s; + if (_test_xdrop(s, xtv)) { debug("p(%lu)", p); w.r.epos = p; break; } - _store_vector(&dp[p]); - if (p == 0) { - s = self->rev; + _store_vector(&dp[p]); + if (p == 0) { + s = _mm_load_si128((__m128i const *)self->rev); } - s = _mm_subs_epi16(s, _mm_slli_epi16(gev, 3)); - } - - /* done; create and return a forefront object */ - _end_column(dp, w.r.spos, w.r.epos); - self->root = _end_matrix(dp, &w, 0); - debug("self(%p), mem(%p), root(%p)", self, dz_mem(self), self->root); - return(self); + s = _mm_subs_epi16(s, _mm_slli_epi16(_mm_load_si128((__m128i const *)self->gev), 3)); + } + + /* done; create forefront object */ + _end_column(dp, w.r.spos, w.r.epos); + + /* package forefront and xt, return */ + dz_alignment_init_s aln_init; + aln_init.root = _end_matrix(dp, &w, 0); + aln_init.xt = xt; + return(aln_init); } #ifdef DZ_FULL_LENGTH_BONUS @@ -659,20 +714,20 @@ struct dz_s *dz_init( int8_t const *score_matrix, uint16_t gap_open, uint16_t gap_extend, - uint64_t max_gap_len, + //uint64_t max_gap_len, uint16_t full_length_bonus) { - return(dz_init_intl(score_matrix, gap_open, gap_extend, max_gap_len, full_length_bonus)); + return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); } #else static __dz_vectorize struct dz_s *dz_init( int8_t const *score_matrix, uint16_t 
gap_open, - uint16_t gap_extend, - uint64_t max_gap_len) + uint16_t gap_extend)//, + //uint64_t max_gap_len) { - return(dz_init_intl(score_matrix, gap_open, gap_extend, max_gap_len, 0)); + return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); } #endif @@ -689,8 +744,10 @@ static __dz_vectorize void dz_flush( struct dz_s *self) { + // point the mem back at the initial block dz_mem_flush(dz_mem(self)); - dz_mem(self)->stack.top = (uint8_t *)(self->root + 1); + // move stack pointers past the dz_s and the dz_cap_s that follows it + dz_mem(self)->stack.top = (uint8_t *)(((dz_cap_s*) (self + 1)) + 1); return; } @@ -787,15 +844,16 @@ static int8_t const dz_unittest_score_matrix[DZ_MAT_SIZE * DZ_MAT_SIZE] = { #endif #ifdef DZ_FULL_LENGTH_BONUS -#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1, 20, 0 +#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1, 0 #else -#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1, 20 +#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1 #endif +#define DZ_UNITTEST_MAX_GAP_LEN 20 unittest() { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); ut_assert(dz != NULL); - ut_assert(dz->root != NULL); + //ut_assert(dz->root != NULL); dz_destroy(dz); } @@ -1072,13 +1130,13 @@ unittest() { _end_column(cdp, w.r.spos, w.r.epos); \ cdp; \ }) -#define _fill_column(w, pdp, query, rt, rrem, init_s) ({ \ +#define _fill_column(w, pdp, query, rt, rrem, xt, init_s) ({ \ /* load the base */ \ _init_rch(query, rt, rrem); \ struct dz_swgv_s *cdp = _begin_column(w, rch, rrem); \ /* init vectors */ \ __m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN); \ - __m128i const xtv = _mm_set1_epi16(w.inc - self->xt); /* next offset == current max thus X-drop threshold is always -xt */ \ + __m128i const xtv = _mm_set1_epi16(w.inc - xt); /* next offset == current max thus X-drop threshold is always -xt */ \ /* until the bottommost vertically placed band... 
*/ \ uint32_t sspos = w.r.spos; /* save spos on the stack */ \ for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ @@ -1117,7 +1175,7 @@ struct dz_forefront_s const *dz_extend_intl( struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, - char const *ref, int32_t rlen, uint32_t rid, + char const *ref, int32_t rlen, uint32_t rid, uint16_t xt, uint16_t init_s) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); @@ -1162,7 +1220,7 @@ struct dz_forefront_s const *dz_extend_intl( uint8_t const *rt = (uint8_t const *)&ref[rrem - (rlen < 0)]; while(rrem != 0 && w.r.spos < w.r.epos) { - pdp = _fill_column(w, pdp, query, rt, rrem, init_s); rrem += dir; + pdp = _fill_column(w, pdp, query, rt, rrem, xt, init_s); rrem += dir; debug("rrem(%d), [%u, %u)", rrem, w.r.spos, w.r.epos); } return(_end_matrix(pdp, &w, rrem)); /* update max vector (pdp always points at the last vector) */ @@ -1180,19 +1238,19 @@ struct dz_forefront_s const *dz_extend( struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, - char const *ref, int32_t rlen, uint32_t rid) + char const *ref, int32_t rlen, uint32_t rid, uint16_t xt) { - return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, INT16_MIN)); + return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, INT16_MIN)); } static __dz_vectorize struct dz_forefront_s const *dz_scan( struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, - char const *ref, int32_t rlen, uint32_t rid) + char const *ref, int32_t rlen, uint32_t rid, uint16_t xt) { debug("dz_scan called"); - return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, 0)); + return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, 0)); } @@ -1223,29 +1281,30 @@ unittest( "extend.base" ) { struct dz_forefront_s const *forefront = NULL; /* nothing occurs */ - forefront = dz_extend(dz, q, NULL, 0, NULL, 0, 1); + forefront = dz_extend(dz, q, NULL, 0, NULL, 0, 1, 0); ut_assert(forefront == NULL); - forefront = dz_extend(dz, q, NULL, 0, "", 0, 2); + forefront = dz_extend(dz, q, NULL, 0, "", 0, 2, 0); ut_assert(forefront == NULL); - forefront = dz_extend(dz, q, dz_root(dz), 1, "", 0, 3); - ut_assert(forefront == *dz_root(dz)); + dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + forefront = dz_extend(dz, q, &aln_init.root, 1, "", 0, 3, aln_init.xt); + ut_assert(forefront == aln_init.root); /* extend */ - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("A", "\x0", "M"), 1, 4); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("A", "\x0", "M"), 1, 4, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(2, 2, 5)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 5); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 5, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(4, 4, 9)); if(trial == 1) { continue; } - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTT", "\x0\x2\x0\x3\x3\x3\x3", "MASLVQT"), 7, 6); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTT", "\x0\x2\x0\x3\x3\x3\x3", "MASLVQT"), 7, 6, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(9, 9, 28)); if(trial == 2) { continue; } - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTTC", 
"\x0\x2\x0\x3\x3\x3\x3\x1", "MASLVQTG"), 8, 7); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTTC", "\x0\x2\x0\x3\x3\x3\x3\x1", "MASLVQTG"), 8, 7, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(11, 11, 34)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTTCA", "\x0\x2\x0\x3\x3\x3\x3\x1\x0", "MASLVQTGK"), 9, 8); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTTCA", "\x0\x2\x0\x3\x3\x3\x3\x1\x0", "MASLVQTGK"), 9, 8, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(13, 13, 39)); (void)forefront; @@ -1278,35 +1337,36 @@ unittest( "extend.base.revcomp" ) { struct dz_forefront_s const *forefront = NULL; /* nothing occurs */ - forefront = dz_extend(dz, q, NULL, 0, NULL, 0, 1); + forefront = dz_extend(dz, q, NULL, 0, NULL, 0, 1, 0); ut_assert(forefront == NULL); - forefront = dz_extend(dz, q, NULL, 0, "", 0, 2); + forefront = dz_extend(dz, q, NULL, 0, "", 0, 2, 0); ut_assert(forefront == NULL); - forefront = dz_extend(dz, q, dz_root(dz), 1, "", 0, 3); - ut_assert(forefront == *dz_root(dz)); + dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + forefront = dz_extend(dz, q, &aln_init.root, 1, "", 0, 3, aln_init.xt); + ut_assert(forefront == aln_init.root); /* extend */ - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("A", "\x0", "M"), 1, 4); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("A", "\x0", "M"), 1, 4, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(2, 2, 5)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 5); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 5, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(4, 4, 9)); if(trial == 1) { continue; } - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTT", "\x0\x2\x0\x3\x3\x3\x3", "MASLVQT"), 7, 6); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTT", "\x0\x2\x0\x3\x3\x3\x3", "MASLVQT"), 7, 6, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(9, 9, 28)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel(&"AAAATCT"[7], &"\x0\x0\x0\x0\x3\x1\x3"[7], "MASLVQT"), -7, 6); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel(&"AAAATCT"[7], &"\x0\x0\x0\x0\x3\x1\x3"[7], "MASLVQT"), -7, 6, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(9, 9, 28)); if(trial == 2) { continue; } - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTTC", "\x0\x2\x0\x3\x3\x3\x3\x1", "MASLVQTG"), 8, 7); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTTC", "\x0\x2\x0\x3\x3\x3\x3\x1", "MASLVQTG"), 8, 7, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(11, 11, 34)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel(&"GAAAATCT"[8], &"\x2\x0\x0\x0\x0\x3\x1\x3"[8], "MASLVQTG"), -8, 7); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel(&"GAAAATCT"[8], &"\x2\x0\x0\x0\x0\x3\x1\x3"[8], "MASLVQTG"), -8, 7, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(11, 11, 28)); - forefront = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AGATTTTCA", "\x0\x2\x0\x3\x3\x3\x3\x1\x0", "MASLVQTGK"), 9, 8); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AGATTTTCA", "\x0\x2\x0\x3\x3\x3\x3\x1\x0", "MASLVQTGK"), 9, 8, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(13, 13, 39)); - forefront = dz_extend(dz, q, 
dz_root(dz), 1, dz_ut_sel(&"TGAAAATCT"[9], &"\x3\x2\x0\x0\x0\x0\x3\x1\x3"[9], "MASLVQTGK"), -9, 8); + forefront = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel(&"TGAAAATCT"[9], &"\x3\x2\x0\x0\x0\x0\x3\x1\x3"[9], "MASLVQTGK"), -9, 8, aln_init.xt); ut_assert(forefront != NULL && forefront->max == dz_ut_sel(13, 13, 28)); (void)forefront; @@ -1334,14 +1394,15 @@ unittest( "extend.small" ) { * \ / \ / * C CATT */ - forefronts[0] = dz_extend(dz, q, dz_root(dz), 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 1); + dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + forefronts[0] = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 1, aln_init.xt); ut_assert(forefronts[0] != NULL && forefronts[0]->max == dz_ut_sel(4, 4, 9)); - forefronts[1] = dz_extend(dz, q, &forefronts[0], 1, dz_ut_sel("C", "\x1", "T"), 1, 2); + forefronts[1] = dz_extend(dz, q, &forefronts[0], 1, dz_ut_sel("C", "\x1", "T"), 1, 2, aln_init.xt); ut_assert(forefronts[1] != NULL && forefronts[1]->max == dz_ut_sel(6, 6, 14)); if(trial == 1) { continue; } - forefronts[2] = dz_extend(dz, q, &forefronts[0], 2, dz_ut_sel("TTTT", "\x3\x3\x3\x3", "LVQT"), 4, 3); + forefronts[2] = dz_extend(dz, q, &forefronts[0], 2, dz_ut_sel("TTTT", "\x3\x3\x3\x3", "LVQT"), 4, 3, aln_init.xt); if(trial == 2) { ut_assert(forefronts[2] != NULL && forefronts[2]->max == dz_ut_sel(8, 8, 18)); continue; @@ -1349,10 +1410,10 @@ unittest( "extend.small" ) { ut_assert(forefronts[2] != NULL && forefronts[2]->max == dz_ut_sel(14, 14, 32)); if(trial == 3) { continue; } - forefronts[3] = dz_extend(dz, q, &forefronts[2], 1, dz_ut_sel("CATT", "\x1\x0\x3\x3", "CKAK"), 4, 4); + forefronts[3] = dz_extend(dz, q, &forefronts[2], 1, dz_ut_sel("CATT", "\x1\x0\x3\x3", "CKAK"), 4, 4, aln_init.xt); ut_assert(forefronts[3] != NULL && forefronts[3]->max == dz_ut_sel(22, 22, 43)); - forefronts[4] = dz_extend(dz, q, &forefronts[2], 2, dz_ut_sel("CTGA", "\x1\x3\x2\x0", "QLTL"), 4, 5); + forefronts[4] = dz_extend(dz, q, &forefronts[2], 2, dz_ut_sel("CTGA", "\x1\x3\x2\x0", "QLTL"), 4, 5, aln_init.xt); ut_assert(forefronts[4] != NULL && forefronts[4]->max == dz_ut_sel(30, 30, 61)); } dz_destroy(dz); @@ -1586,13 +1647,14 @@ unittest( "trace" ) { struct dz_forefront_s const *forefront = NULL; struct dz_alignment_s *aln = NULL; - forefront = dz_extend(dz, q, dz_root(dz), 1, "A", 1, 1); + dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + forefront = dz_extend(dz, q, &aln_init.root, 1, "A", 1, 1, aln_init.xt); aln = dz_trace(dz, forefront); - forefront = dz_extend(dz, q, &forefront, 1, "GC", 2, 2); + forefront = dz_extend(dz, q, &forefront, 1, "GC", 2, 2, aln_init.xt); aln = dz_trace(dz, forefront); - forefront = dz_extend(dz, q, &forefront, 1, "TTTT", 4, 3); + forefront = dz_extend(dz, q, &forefront, 1, "TTTT", 4, 3, aln_init.xt); aln = dz_trace(dz, forefront); (void)aln; From 848d9986498afa2ab253d808ea57125d354226d2 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 17 Apr 2020 11:21:47 -0700 Subject: [PATCH 03/43] start writing qual adj preprocessor flags --- dozeu.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/dozeu.h b/dozeu.h index 025d719..b27ea7d 100644 --- a/dozeu.h +++ b/dozeu.h @@ -44,6 +44,11 @@ extern "C" { #include #include +#ifndef DZ_INCLUDE_ONCE + +#define DZ_NUM_QUAL_SCORES 64 +#define DZ_QUAL_MATRIX_SIZE (16 * DZ_NUM_QUAL_SCORES) + #ifndef DZ_CIGAR_OP # define DZ_CIGAR_OP 0x04030201 #endif @@ -157,9 +162,14 @@ unittest() { 
debug("hello"); } #define DZ_CELL_MARGIN ( 32 ) #define DZ_CELL_MARGINED_MIN ( DZ_CELL_MIN + DZ_CELL_MARGIN ) #define DZ_CELL_MARGINED_MAX ( DZ_CELL_MAX - DZ_CELL_MARGIN ) - + /* query; preconverted query sequence; blen = roundup(qlen, L) / L; array must have 16-byte-length margin at the tail */ -struct dz_query_s { uint64_t blen; char const *q; int16_t bonus[2 * sizeof(__m128i) / sizeof(int16_t)]; uint8_t arr[]; }; +struct dz_query_s { + uint64_t blen; + char const *q; + int16_t bonus[2 * sizeof(__m128i) / sizeof(int16_t)]; + uint8_t arr[]; +}; dz_static_assert(sizeof(struct dz_query_s) % sizeof(__m128i) == 0); /* node (reference) */ @@ -496,6 +506,9 @@ unittest() { #define _init_bonus(_query) ; #define _add_bonus(_i, _v) ( (_v) ) #endif + +#endif // DZ_INCLUDE_ONCE +// TODO: qual: somewhere in here is the score accessing method #if defined(DZ_NUCL_ASCII) #define _init_rch(_query, _rt, _rrem) \ @@ -575,8 +588,15 @@ unittest() { * @fn dz_init, dz_destroy */ static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_s *dz_qual_adj_init_intl( +#else struct dz_s *dz_init_intl( +#endif int8_t const *score_matrix, /* match award in positive, mismatch penalty in negative. s(A,A) at [0], s(A,C) at [1], ... s(T,T) at [15] where s(ref_base, query_base) is a score function */ +#ifdef DZ_QUAL_ADJ + int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ +#endiff uint16_t gap_open, /* gap penalties in positive */ uint16_t gap_extend, //uint64_t max_gap_len, /* as X-drop threshold */ @@ -593,13 +613,27 @@ struct dz_s *dz_init_intl( debug("failed to malloc memory"); return(NULL); } + // TODO: what is the point of the latter 16 bytes all being zero? #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) - struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s)); + /* allocate with or without space afterwards for the quality adjusted matrices */ + #ifdef DZ_QUAL_ADJ + struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_QUAL_MATRIX_SIZE); + #else + struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s)); + #endif /* constants */ __m128i const tmat = _mm_loadu_si128((__m128i const *)score_matrix); _mm_store_si128((__m128i *)&self->matrix[0], tmat); _mm_store_si128((__m128i *)&self->matrix[16], _mm_setzero_si128()); + + #ifdef DZ_QUAL_ADJ + /* load the quality adjusted matrices immediately after the dz_s */ + __m128i *qual_adj_mats = (__m128i *) (self + 1); + for(uint64_t i = 0; i < DZ_NUM_QUAL_SCORES; ++i) { + _mm_store_si128(qual_adj_mats + i, _mm_loadu_si128(((__m128i const *)qual_adj_score_matrix) + i)); + } + #endif #else struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i)); @@ -741,13 +775,24 @@ void dz_destroy( return; } static __dz_vectorize +#ifdef DZ_QUAL_ADJ +dz_qual_adj_flush( +#else void dz_flush( +#endif struct dz_s *self) { // point the mem back at the initial block dz_mem_flush(dz_mem(self)); - // move stack pointers past the dz_s and the dz_cap_s that follows it - dz_mem(self)->stack.top = (uint8_t *)(((dz_cap_s*) (self + 1)) + 1); + + // move stack pointers past the dz_s, maybe the qual matrices, and the dz_cap_s that follows8 + void *bottom = (void *) (self + 1); + #ifdef DZ_QUAL_ADJ + bottom = (void *) (((int8_t const *) bottom) + DZ_QUAL_MATRIX_SIZE); + #endif + bottom = (void *) (((dz_cap_s *) bottom) + 1); + + dz_mem(self)->stack.top = bottom; return; } @@ -869,7 +914,13 @@ struct 
dz_query_s *dz_pack_query_forward( char const *query, size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); - struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(__m128i)); + // TODO: qual: make sure I'm doing this right + size_t pack_size = dz_roundup(qlen + 1, L) + sizeof(__m128i); + #ifdef DZ_QUAL_ADJ + struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + 2 * pack_size); + #else + struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); + #endif *q = (struct dz_query_s){ .blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L), .q = query, @@ -901,7 +952,8 @@ struct dz_query_s *dz_pack_query_forward( for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) { __m128i const qv = _mm_loadu_si128((__m128i const *)&query[i]); __m128i tv = _mm_shuffle_epi8(cv, _mm_and_si128(qv, fv)); - _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv; /* shift by one to make room for a base */ + _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); /* shift by one to make room for a base */ + pv = tv; } /* continue the same conversion on the remainings */ @@ -910,9 +962,15 @@ struct dz_query_s *dz_pack_query_forward( for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { q->arr[i + 1] = conv[(uint8_t)query[i] & 0x0f]; } + // TODO: does he need the + 1 here? maybe that's why he is buffering the extra sizeof(__m128i) in the alloc? for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) { q->arr[i + 1] = qS; } + + // TODO: qual: finish this fwd + #ifdef DZ_QUAL_ADJ + + #endif debug("qlen(%lu), q(%.*s)", qlen, (int)qlen, query); return(q); @@ -923,7 +981,13 @@ struct dz_query_s *dz_pack_query_reverse( char const *query, size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); - struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(__m128i)); + // TODO: qual: make sure I'm doing this right + size_t pack_size = dz_roundup(qlen + 1, L) + sizeof(__m128i); + #ifdef DZ_QUAL_ADJ + struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + 2 * pack_size); + #else + struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); + #endif *q = (struct dz_query_s){ .blen = qlen == 0 ? 
0 : (dz_roundup(qlen + 1, L) / L), .q = query, @@ -960,6 +1024,11 @@ struct dz_query_s *dz_pack_query_reverse( for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) { q->arr[i + 1] = qS; } + + // TODO: qual: finish this rev + #ifdef DZ_QUAL_ADJ + + #endif debug("qlen(%lu), q(%.*s)", qlen, (int)qlen, query); return(q); @@ -1665,6 +1734,7 @@ unittest( "trace" ) { }; /* extern "C" { */ #endif +#define DZ_INCLUDE_ONCE /** * end of dozeu.h */ From 7d5a519b9bd58dd161bb6f258654fe76900da33f Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 22 Apr 2020 11:19:40 -0700 Subject: [PATCH 04/43] first draft of qual adj flag --- dozeu.h | 321 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 267 insertions(+), 54 deletions(-) diff --git a/dozeu.h b/dozeu.h index b27ea7d..549a733 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,6 +1,6 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c -// #define DEBUG -// #define DZ_PRINT_VECTOR +//#define DEBUG +//#define DZ_PRINT_VECTOR /** * @file dozeu.h * @brief SIMD X-drop DP for read-to-graph alignment @@ -62,6 +62,10 @@ extern "C" { # define DZ_NUCL_ASCII # endif +#if defined(DZ_NUCL_2BIT) && defined(DZ_QUAL_ADJ) +#error "Base quality adjustment is only supported in ASCII configuration" +#endif + /* define DZ_N_AS_UNMATCHING_BASE to penalize Ns, otherwise scores for (x, N) and (N, x) are always set zero */ // #define DZ_N_AS_UNMATCHING_BASE enum dz_alphabet { @@ -73,9 +77,18 @@ enum dz_alphabet { rN = 0x90, qN = 0x90, qS = 0x02 /* pshufb instruction clears the column when the 7-th bit of the index is set */ #endif }; -#define dz_pair_score(_self, _q, _r, _i) ( (_self)->matrix[((_r) | (_q)->arr[(_i)]) & 0x1f] ) -#define dz_pair_eq(_self, _q, _r, _i) ( (uint32_t)((_q)->arr[(_i)]) == ((uint32_t)(_r)<<2) ) -#else +/* get the first index of the quals from a packed query (only valid if DZ_QUAL_ADJ is defined) */ +#define dz_quals(_query) ( (uint8_t const *) &(_query)->arr[(_query)->arr_end] ) +/* get the base quality adjusted matrix (only valid if DZ_QUAL_ADJ is defined) */ +#define dz_qual_matrix(_self) ( (int8_t const *)((_self) + 1) ) + +#define dz_pair_score(_self, _q, _r, _i) ( (_self)->matrix[((_r) | (_q)->arr[(_i)]) & 0x1f] ) + +#define dz_qual_adj_pair_score(_self, _q, _r, _i) ( dz_qual_matrix(_self)[(((uint32_t) dz_quals(_q)[(_i)]) << 5) | ((uint32_t)(_r)) | ((uint32_t)((_q)->arr[(_i)] & 0x1f))] ) + +#define dz_pair_eq(_self, _q, _r, _i) ( (uint32_t)((_q)->arr[(_i)]) == ((uint32_t)(_r)<<2) ) + +#else // ! 
DZ_PROTEIN /* protein */ # ifndef DZ_MAT_SIZE @@ -93,7 +106,7 @@ enum dz_alphabet { #define dz_trap() { *((volatile uint8_t *)NULL); } #ifdef DZ_PROTEIN dz_static_assert(DZ_MAT_SIZE <= 32); -#endif +#endif // DZ_PROTEIN #if (defined(DEBUG) || defined(UNITTEST)) && !defined(__cplusplus) # include "log.h" @@ -165,7 +178,8 @@ unittest() { debug("hello"); } /* query; preconverted query sequence; blen = roundup(qlen, L) / L; array must have 16-byte-length margin at the tail */ struct dz_query_s { - uint64_t blen; + uint32_t blen; + uint32_t arr_end; char const *q; int16_t bonus[2 * sizeof(__m128i) / sizeof(int16_t)]; uint8_t arr[]; @@ -241,6 +255,8 @@ struct dz_s { }; dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); #define dz_mem(_self) ( (struct dz_mem_s *)(_self) - 1 ) +/* get the base quality adjusted matrix (only valid if DZ_QUAL_ADJ is defined) */ +#define dz_qual_matrix(_self) ( (int8_t const *)((_self) + 1) ) //#define dz_root(_self) ( (struct dz_forefront_s const **)(&_self->root) ) #define dz_is_terminated(_ff) ( dz_cff(_ff)->r.spos >= dz_cff(_ff)->r.epos ) @@ -413,6 +429,8 @@ unittest() { dz_mem_destroy(mem); } } + +#endif // DZ_INCLUDE_ONCE /** * vector update macros @@ -506,24 +524,51 @@ unittest() { #define _init_bonus(_query) ; #define _add_bonus(_i, _v) ( (_v) ) #endif - -#endif // DZ_INCLUDE_ONCE -// TODO: qual: somewhere in here is the score accessing method #if defined(DZ_NUCL_ASCII) +#ifdef DZ_QUAL_ADJ #define _init_rch(_query, _rt, _rrem) \ - _init_bonus(_query); \ - uint32_t rch = conv[_rt[-_rrem] & 0x0f]; \ - /* debug("rch(%c, %u, %x)", _rt[-_rrem], rch, rch); */ \ - uint8_t const *parr = (_query)->arr; \ - __m128i const rv = _mm_set1_epi8(rch); + _init_bonus(_query); \ + uint32_t rch = conv[_rt[-_rrem] & 0x0f]; \ + /* debug("rch(%c, %u, %x)", _rt[-_rrem], rch, rch); */ \ + uint8_t const *parr = (_query)->arr; \ + uint8_t const *qarr = dz_quals(_query); \ + __m128i const rv = _mm_set1_epi8(rch); +#define _calc_score_profile(_i) ({ \ + /* construct the within-matrix and between-matrix indexes and then combine them */ \ + __m128i iv = _mm_or_si128(_mm_slli_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i const *)&qarr[(_i) * L])), 5), \ + _mm_cvtepi8_epi16(_mm_and_si128(_mm_or_si128(rv, _mm_loadl_epi64((__m128i const *)&parr[(_i) * L])), _mm_set1_epi8(0x1f)))); \ + /* get the scores from the quality matrices (unvectorized, unfortunately) */ \ + int8_t const *qual_mat = dz_qual_matrix(_self); \ + /* TODO: qual: is this the right order? 
*/ \ + __m128i sc = _mm_setr_epi16( \ + qual_mat[_mm_extract_epi16(iv, 7)], \ + qual_mat[_mm_extract_epi16(iv, 6)], \ + qual_mat[_mm_extract_epi16(iv, 5)], \ + qual_mat[_mm_extract_epi16(iv, 4)], \ + qual_mat[_mm_extract_epi16(iv, 3)], \ + qual_mat[_mm_extract_epi16(iv, 2)], \ + qual_mat[_mm_extract_epi16(iv, 1)], \ + qual_mat[_mm_extract_epi16(iv, 0)], \ + ); \ + sc; \ +}) +#else +#define _init_rch(_query, _rt, _rrem) \ + _init_bonus(_query); \ + uint32_t rch = conv[_rt[-_rrem] & 0x0f]; \ + /* debug("rch(%c, %u, %x)", _rt[-_rrem], rch, rch); */ \ + uint8_t const *parr = (_query)->arr; \ + __m128i const rv = _mm_set1_epi8(rch); + #define _calc_score_profile(_i) ({ \ __m128i qv = _mm_loadl_epi64((__m128i const *)&parr[(_i) * L]); \ __m128i sc = _mm_cvtepi8_epi16(_mm_shuffle_epi8(_mm_load_si128((__m128i const *)self->matrix), _mm_or_si128(rv, qv))); \ /* print_vector(_mm_cvtepi8_epi16(rv)); print_vector(_mm_cvtepi8_epi16(qv)); */ \ sc; \ }) +#endif #elif defined(DZ_NUCL_2BIT) #define _init_rch(_query, _rt, _rrem) \ _init_bonus(_query); \ @@ -596,7 +641,7 @@ struct dz_s *dz_init_intl( int8_t const *score_matrix, /* match award in positive, mismatch penalty in negative. s(A,A) at [0], s(A,C) at [1], ... s(T,T) at [15] where s(ref_base, query_base) is a score function */ #ifdef DZ_QUAL_ADJ int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ -#endiff +#endif uint16_t gap_open, /* gap penalties in positive */ uint16_t gap_extend, //uint64_t max_gap_len, /* as X-drop threshold */ @@ -617,7 +662,7 @@ struct dz_s *dz_init_intl( #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) /* allocate with or without space afterwards for the quality adjusted matrices */ #ifdef DZ_QUAL_ADJ - struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_QUAL_MATRIX_SIZE); + struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + 2 * DZ_QUAL_MATRIX_SIZE); #else struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s)); #endif @@ -631,7 +676,8 @@ struct dz_s *dz_init_intl( /* load the quality adjusted matrices immediately after the dz_s */ __m128i *qual_adj_mats = (__m128i *) (self + 1); for(uint64_t i = 0; i < DZ_NUM_QUAL_SCORES; ++i) { - _mm_store_si128(qual_adj_mats + i, _mm_loadu_si128(((__m128i const *)qual_adj_score_matrix) + i)); + _mm_store_si128(qual_adj_mats + (i * 2), _mm_loadu_si128(((__m128i const *)qual_adj_score_matrix) + i)); + _mm_store_si128(qual_adj_mats + (i * 2 + 1), _mm_setzero_si128()); } #endif #else @@ -693,8 +739,13 @@ struct dz_s *dz_init_intl( return(self); } + static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_alignment_init_s dz_qual_adj_align_init( +#else struct dz_alignment_init_s dz_align_init( +#endif struct dz_s *self, uint32_t max_gap_len) { @@ -744,27 +795,52 @@ struct dz_alignment_init_s dz_align_init( #ifdef DZ_FULL_LENGTH_BONUS static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_s *dz_qual_adj_init( +#else struct dz_s *dz_init( +#endif int8_t const *score_matrix, +#ifdef DZ_QUAL_ADJ + int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ +#endif uint16_t gap_open, uint16_t gap_extend, //uint64_t max_gap_len, uint16_t full_length_bonus) { +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)) +#else return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); +#endif } -#else + 
+#else // DZ_FULL_LENGTH_BONUS + static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_s *dz_qual_adj_init( +#else struct dz_s *dz_init( +#endif int8_t const *score_matrix, +#ifdef DZ_QUAL_ADJ + int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ +#endif uint16_t gap_open, uint16_t gap_extend)//, //uint64_t max_gap_len) { - return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); -} +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, 0)) +#else + return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); #endif - +} +#endif // DZ_FULL_LENGTH_BONUS + +#ifndef DZ_INCUDE_ONCE static __dz_vectorize void dz_destroy( struct dz_s *self) @@ -774,9 +850,11 @@ void dz_destroy( dz_mem_destroy(dz_mem(self)); return; } +#endif + static __dz_vectorize #ifdef DZ_QUAL_ADJ -dz_qual_adj_flush( +void dz_qual_adj_flush( #else void dz_flush( #endif @@ -785,16 +863,18 @@ void dz_flush( // point the mem back at the initial block dz_mem_flush(dz_mem(self)); - // move stack pointers past the dz_s, maybe the qual matrices, and the dz_cap_s that follows8 - void *bottom = (void *) (self + 1); + // move stack pointers past the dz_s, maybe the qual matrices, and the dz_cap_s that follows + void *bottom = (void *)(self + 1); #ifdef DZ_QUAL_ADJ - bottom = (void *) (((int8_t const *) bottom) + DZ_QUAL_MATRIX_SIZE); + bottom = (void *)(((int8_t *)bottom) + 2 * DZ_QUAL_MATRIX_SIZE); #endif - bottom = (void *) (((dz_cap_s *) bottom) + 1); + bottom = (void *)(((dz_cap_s *)bottom) + 1); - dz_mem(self)->stack.top = bottom; + dz_mem(self)->stack.top = (uint8_t *)bottom; return; } + +#ifndef DZ_INCUDE_ONCE #if defined(DZ_NUCL_ASCII) static size_t const dz_unittest_query_length = 560; @@ -901,6 +981,9 @@ unittest() { //ut_assert(dz->root != NULL); dz_destroy(dz); } + + +#endif // DZ_INCUDE_ONCE /** * @fn dz_pack_query, dz_pack_query_reverse @@ -909,9 +992,17 @@ unittest() { #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_query_s *dz_qual_adj_pack_query_forward( +#else struct dz_query_s *dz_pack_query_forward( +#endif struct dz_s *self, - char const *query, size_t qlen) + char const *query, +#ifdef DZ_QUAL_ADJ + uint8_t const *qual, +#endif + size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right @@ -922,7 +1013,8 @@ struct dz_query_s *dz_pack_query_forward( struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); #endif *q = (struct dz_query_s){ - .blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L), + .blen = (uint32_t) ((qlen == 0 ? 
0 : dz_roundup(qlen + 1, L) / L)), + .arr_end = (uint32_t) pack_size, .q = query, .bonus = { 0 } }; @@ -949,7 +1041,7 @@ struct dz_query_s *dz_pack_query_forward( __m128i const cv = _mm_load_si128((__m128i const *)conv); /* conversion table */ /* until the end of the query sequence */ - for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) { + for(size_t i = 0, end = dz_rounddown(qlen, sizeof(__m128i)); i < end; i += sizeof(__m128i)) { __m128i const qv = _mm_loadu_si128((__m128i const *)&query[i]); __m128i tv = _mm_shuffle_epi8(cv, _mm_and_si128(qv, fv)); _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); /* shift by one to make room for a base */ @@ -963,22 +1055,48 @@ struct dz_query_s *dz_pack_query_forward( q->arr[i + 1] = conv[(uint8_t)query[i] & 0x0f]; } // TODO: does he need the + 1 here? maybe that's why he is buffering the extra sizeof(__m128i) in the alloc? - for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) { + for(size_t i = qlen, end = dz_roundup(qlen + 1, sizeof(__m128i)); i < end; i++) { q->arr[i + 1] = qS; } - // TODO: qual: finish this fwd + // TODO: qual: probably could make this faster if we apply the <<5 once here and store in 16 bit values + #ifdef DZ_QUAL_ADJ - + __m128i const qmax = _mm_set1_epu8(DZ_NUM_QUAL_SCORES - 1); + __m128i qpv = _mm_setzero_si128(); + + /* until the end of the query sequence */ + for(size_t i = 0, end = dz_rounddown(qlen, sizeof(__m128i)); i < end; i += sizeof(__m128i)) { + __m128i const qtv = _mm_min_epu8(_mm_loadu_si128((__m128i const *)&qual[i]), qmax); + _mm_store_si128((__m128i *)&q->arr[pack_size + i], _mm_alignr_epi8(qtv, qpv, 15)); /* shift by one to make room for a base */ + qpv = qtv; + } + + /* continue the same conversion on the remainings */ + q->arr[pack_size + dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(qpv, 15); + for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { + q->arr[pack_size + i + 1] = dz_min2(qual[i], DZ_NUM_QUAL_SCORES - 1); + } + for(size_t i = qlen, end = dz_roundup(qlen + 1, sizeof(__m128i)); i < end; i++) { + q->arr[pack_size + i + 1] = 0; + } #endif debug("qlen(%lu), q(%.*s)", qlen, (int)qlen, query); return(q); } static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_query_s *dz_qual_adj_pack_query_reverse( +#else struct dz_query_s *dz_pack_query_reverse( - struct dz_s *self, - char const *query, size_t qlen) +#endif + struct dz_s *self, + char const *query, +#ifdef DZ_QUAL_ADJ + uint8_t const *qual, +#endif + size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right @@ -989,7 +1107,8 @@ struct dz_query_s *dz_pack_query_reverse( struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); #endif *q = (struct dz_query_s){ - .blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L), + .blen = (uint32_t) (qlen == 0 ? 
0 : (dz_roundup(qlen + 1, L) / L)), + .arr_end = (uint32_t) pack_size, .q = query, .bonus = { 0 } }; @@ -1010,7 +1129,7 @@ struct dz_query_s *dz_pack_query_reverse( __m128i const cv = _mm_load_si128((__m128i const *)conv), rv = _mm_load_si128((__m128i const *)rev); /* until the end of the query sequence */ - for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) { + for(size_t i = 0, end = dz_rounddown(qlen, sizeof(__m128i)); i < end; i += sizeof(__m128i)) { __m128i const qv = _mm_loadu_si128((__m128i const *)&query[qlen - 16 - i]); __m128i tv = _mm_shuffle_epi8(_mm_shuffle_epi8(cv, _mm_and_si128(qv, fv)), rv); _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv; /* shift by one to make room for a base */ @@ -1021,13 +1140,31 @@ struct dz_query_s *dz_pack_query_reverse( for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { q->arr[i + 1] = conv[(uint8_t)query[qlen - 1 - i] & 0x0f]; } - for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) { + for(size_t i = qlen, end = dz_roundup(qlen + 1, sizeof(__m128i)); i < end; i++) { q->arr[i + 1] = qS; } // TODO: qual: finish this rev #ifdef DZ_QUAL_ADJ + __m128i const qmax = _mm_set1_epu8(DZ_NUM_QUAL_SCORES - 1); + __m128i qpv = _mm_setzero_si128(); + + /* until the end of the query sequence */ + for(size_t i = 0, end = dz_rounddown(qlen, sizeof(__m128i)); i < end; i += sizeof(__m128i)) { + __m128i qtv = _mm_min_epu8(_mm_shuffle_epi8(__mm_loadu_si128((__m128i const *)&qual[qlen - 16 - i]), rv), qmax); + _mm_store_si128((__m128i *)&q->arr[pack_size + i], _mm_alignr_epi8(qtv, qpv, 15)); + qpv = qtv; /* shift by one to make room for a base */ + } + + /* continue the same conversion on the remainings */ + q->arr[pack_size + dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(qpv, 15); + for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { + q->arr[pack_size + i + 1] = dz_min2(qual[qlen - 1 - i], DZ_NUM_QUAL_SCORES - 1); + } + for(size_t i = qlen, end = dz_roundup(qlen + 1, sizeof(__m128i)); i < end; i++) { + q->arr[pack_size + i + 1] = 0; + } #endif debug("qlen(%lu), q(%.*s)", qlen, (int)qlen, query); @@ -1133,18 +1270,36 @@ struct dz_query_s *dz_pack_query_reverse( return(q); } #endif + static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_query_s *dz_qual_adj_pack_query( +#else struct dz_query_s *dz_pack_query( +#endif struct dz_s *self, - char const *query, int64_t qlen) + char const *query, +#ifdef DZ_QUAL_ADJ + const uint8_t *qual, +#endif + int64_t qlen) { if(qlen >= 0) { - return(dz_pack_query_forward(self, query, (size_t)qlen)); +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_pack_query_forward(self, query, qual, (size_t)qlen)); +#else + return(dz_pack_query_forward(self, query, (size_t)qlen)); +#endif } else { - return(dz_pack_query_reverse(self, &query[qlen], (size_t)-qlen)); +#ifdef DZ_QUAL_ADJ + return(dz_qual_pack_query_reverse(self, &query[qlen], &qual[qlen], (size_t)-qlen)); +#else + return(dz_pack_query_reverse(self, &query[qlen], (size_t)-qlen)); +#endif } } +#ifndef DZ_INCLUDE_ONCE unittest() { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); @@ -1154,6 +1309,8 @@ unittest() { dz_unused(q); dz_destroy(dz); } + +#endif // DZ_INCLUDE_ONCE #define _merge_column(w, adj, forefronts, n_forefronts, query, init_s) ({ \ for(size_t i = 0; i < n_forefronts; i++) { \ @@ -1209,10 +1366,15 @@ unittest() { /* until the bottommost vertically placed band... 
*/ \ uint32_t sspos = w.r.spos; /* save spos on the stack */ \ for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ - _load_vector(&pdp[p]); _update_vector(p); \ + _load_vector(&pdp[p]); \ + _update_vector(p); \ if(dz_unlikely(_test_xdrop(s, xtv))) { /* mark _unlikely to move out of the core loop */ \ /* drop either leading or trailing vector, skip the forefront extension when the forefront is clipped */ \ - if(p == w.r.spos) { w.r.spos++; cdp--; continue; } else { w.r.epos = p; goto dz_pp_cat(_forefront_, __LINE__); } \ + if(p == w.r.spos) { \ + w.r.spos++; cdp--; continue; \ + } else { \ + w.r.epos = p; goto dz_pp_cat(_forefront_, __LINE__); \ + } \ } \ _store_vector(&cdp[p]); \ } \ @@ -1235,18 +1397,23 @@ dz_pp_cat(_forefront_, __LINE__):; \ /* debug("update inc(%u), max(%u, %u, %p), cap(%p)", w.inc, w.max, w.max + w.inc, w.mcap, cap); */ \ cdp; \ }) - + /** * @fn dz_extend_intl */ static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_forefront_s const *dz_qual_adj_extend_intl( +#else struct dz_forefront_s const *dz_extend_intl( - struct dz_s *self, +#endif + struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, char const *ref, int32_t rlen, uint32_t rid, uint16_t xt, uint16_t init_s) { + size_t const L = sizeof(__m128i) / sizeof(uint16_t); if(n_forefronts == 0) { return(NULL); } /* invalid */ if(rlen == 0 && n_forefronts == 1) { return(forefronts[0]); } /* no need to merge incoming vectors */ @@ -1303,23 +1470,39 @@ struct dz_forefront_s const *dz_extend_intl( * NOTE: DZ_N_AS_UNMATCHING_BASE is not recommended when dz_scan is used */ static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_forefront_s const *dz_qual_adj_extend( +#else struct dz_forefront_s const *dz_extend( +#endif struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, char const *ref, int32_t rlen, uint32_t rid, uint16_t xt) { +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, INT16_MIN)); +#else return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, INT16_MIN)); +#endif } static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_forefront_s const *dz_qual_adj_scan( +#else struct dz_forefront_s const *dz_scan( +#endif struct dz_s *self, struct dz_query_s const *query, struct dz_forefront_s const **forefronts, size_t n_forefronts, char const *ref, int32_t rlen, uint32_t rid, uint16_t xt) { debug("dz_scan called"); +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, 0)); +#else return(dz_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, 0)); +#endif } @@ -1336,6 +1519,8 @@ struct dz_forefront_s const *dz_scan( #undef _merge_column #undef _fill_column +#ifndef DZ_INCLUDE_ONCE + /* short, exact matching sequences */ unittest( "extend.base" ) { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); @@ -1442,7 +1627,7 @@ unittest( "extend.base.revcomp" ) { } dz_destroy(dz); } -#endif +#endif // ! 
DZ_PROTEIN /* a small graph */ unittest( "extend.small" ) { @@ -1488,6 +1673,8 @@ unittest( "extend.small" ) { dz_destroy(dz); } +// TODO: qual: this can probably be shared with qual/non-qual + /** * @fn dz_calc_max_rpos */ @@ -1549,6 +1736,10 @@ uint64_t dz_calc_max_pos( debug("forefront(%p), pcap(%p), rrem(%d), rlen(%d)", forefront, pcap, (int32_t)pcap->rrem, (int32_t)forefront->rlen); return((dz_calc_max_rpos(self, forefront)<<32) | dz_calc_max_qpos(self, forefront)); } + +#endif // DZ_INCLUDE_ONCE + +// TODO: qual: need to adjust these traceback macros /* traceback macros */ #define _is_head(_cap) ( (_cap)->r.epos == 0 ) @@ -1596,17 +1787,21 @@ uint64_t dz_calc_max_pos( * @fn dz_trace */ static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_alignment_s *dz_qual_adj_trace( +#else struct dz_alignment_s *dz_trace( +#endif struct dz_s *self, struct dz_forefront_s const *forefront) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); - if(forefront->mcap == NULL) { return(NULL); } + if(forefront->mcap == NULL) { debug("mcap is null"); return(NULL); } /* detect pos */ uint64_t idx = dz_calc_max_qpos(self, forefront); /* vector index, cell index */ uint64_t ref_length = 0, query_length = idx; - + /* allocate aln object */ size_t aln_size = (sizeof(struct dz_alignment_s) + (forefront->rcnt + 6) * sizeof(struct dz_path_span_s) @@ -1616,26 +1811,32 @@ struct dz_alignment_s *dz_trace( struct dz_path_span_s *span = (struct dz_path_span_s *)(aln + 1) + forefront->rcnt + 4, *span_base = span; uint8_t *path = ((uint8_t *)(span + 2)) + forefront->rsum + idx + 1, *path_base = --path; _push_span(forefront->rid); *path = '\0'; /* make sure readable as C string */ - + /* load max column pointers */ struct dz_cap_s const *pcap = forefront->mcap, *cap = NULL; struct dz_query_s const *query = forefront->query; int32_t score = _s(s, pcap, idx), cnt[4] = { 0 }; - + uint8_t debug_ref[1024], debug_query[1024]; uint8_t *drp = debug_ref, *dqp = debug_query; + + #ifdef DZ_QUAL_ADJ + # define _pair_score(_self, _q, _r, _i) ( dz_qual_adj_pair_score((_self), (_q), (_r), (_i)) ) + #else + # define _pair_score(_self, _q, _r, _i) ( dz_pair_score((_self), (_q), (_r), (_i)) ) + #endif /* traceback loop */ #define _debug(_label) { \ debug("test %s, idx(%lu), rbase(%d, %c), qbase(%c), c(%d), e(%d), f(%d), s(%d), score(%d, %lu), diag(%d)", \ #_label, idx, rch, '-', '-', /*"ACGTNNNNNNNNNNNN"[rch & 0xff], "ANNNCNNNGNNNTNNN"[query->arr[idx]],*/ \ score, _s(e, cap, idx), _s(f, cap, idx), _s(s, pcap, idx - 1), \ - dz_pair_score(self, query, rch, idx), (uint64_t)dz_pair_eq(self, query, rch, idx), \ - (score == (_s(s, pcap, idx - 1) + dz_pair_score(self, query, rch, idx)))); \ + _pair_score(self, query, rch, idx), (uint64_t)dz_pair_eq(self, query, rch, idx), \ + (score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx)))); \ } #define _match(_idx) { \ if(dz_inside(pcap->r.spos, _vector_idx(idx - 1), pcap->r.epos) \ - && score == (_s(s, pcap, idx - 1) + dz_pair_score(self, query, rch, idx))) { \ + && score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx))) { \ uint64_t eq = dz_pair_eq(self, query, rch, idx); \ *--path = DZ_CIGAR_OP>>(eq<<3); cnt[eq]++; \ score = _s(s, pcap, idx - 1); idx--; rch = _load_prev_cap(s, score, _idx); \ @@ -1694,6 +1895,12 @@ _trace_tail:; } span[span_length].offset = path_length; return(aln); + + #undef _pair_score + #undef _debug + #undef _match + #undef _ins + #undef _del } #undef _is_head @@ -1707,6 +1914,8 @@ _trace_tail:; #undef _idx_dsc #undef _load_prev_cap +#ifndef 
DZ_INCLUDE_ONCE + /* short, exact matching sequences */ unittest( "trace" ) { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); @@ -1730,10 +1939,14 @@ unittest( "trace" ) { dz_destroy(dz); } +#endif // DZ_INCLUDE_ONCE + #ifdef __cplusplus }; /* extern "C" { */ #endif +// make sure all of the non-duplicated functions are only included once if we also include +// include the quality adjusted versions #define DZ_INCLUDE_ONCE /** * end of dozeu.h From d598123c01b2bf109c9af6f9de2ffc85e31a3457 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 22 Apr 2020 15:13:07 -0700 Subject: [PATCH 05/43] only include unittests without qual adj --- dozeu.h | 107 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 36 deletions(-) diff --git a/dozeu.h b/dozeu.h index 549a733..d0e969f 100644 --- a/dozeu.h +++ b/dozeu.h @@ -306,6 +306,11 @@ void dz_free( free((void *)((uint8_t *)ptr - DZ_MEM_MARGIN_SIZE)); return; } + +#endif // DZ_INCLUDE_ONCE + + +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) unittest() { size_t size[] = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 }; @@ -318,7 +323,11 @@ unittest() { dz_free(p); } } + +#endif +#ifndef DZ_INCLUDE_ONCE + /** * @fn dz_mem_init, dz_mem_destroy, dz_mem_add_stack, dz_mem_malloc, dz_mem_flush * @brief stack chain @@ -418,7 +427,11 @@ void *dz_mem_malloc( mem->stack.top += dz_roundup(size, sizeof(__m128i)); return(ptr); } + +#endif // DZ_INCLUDE_ONCE +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) + unittest() { size_t size[] = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 }; for(size_t i = 0; i < sizeof(size) / sizeof(size_t); i++) { @@ -430,7 +443,7 @@ unittest() { } } -#endif // DZ_INCLUDE_ONCE +#endif /** * vector update macros @@ -540,18 +553,16 @@ unittest() { __m128i iv = _mm_or_si128(_mm_slli_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i const *)&qarr[(_i) * L])), 5), \ _mm_cvtepi8_epi16(_mm_and_si128(_mm_or_si128(rv, _mm_loadl_epi64((__m128i const *)&parr[(_i) * L])), _mm_set1_epi8(0x1f)))); \ /* get the scores from the quality matrices (unvectorized, unfortunately) */ \ - int8_t const *qual_mat = dz_qual_matrix(_self); \ + int8_t const *qual_mat = dz_qual_matrix(self); \ /* TODO: qual: is this the right order? 
*/ \ - __m128i sc = _mm_setr_epi16( \ - qual_mat[_mm_extract_epi16(iv, 7)], \ - qual_mat[_mm_extract_epi16(iv, 6)], \ - qual_mat[_mm_extract_epi16(iv, 5)], \ - qual_mat[_mm_extract_epi16(iv, 4)], \ - qual_mat[_mm_extract_epi16(iv, 3)], \ - qual_mat[_mm_extract_epi16(iv, 2)], \ - qual_mat[_mm_extract_epi16(iv, 1)], \ - qual_mat[_mm_extract_epi16(iv, 0)], \ - ); \ + __m128i sc = _mm_setr_epi16(qual_mat[_mm_extract_epi16(iv, 7)], \ + qual_mat[_mm_extract_epi16(iv, 6)], \ + qual_mat[_mm_extract_epi16(iv, 5)], \ + qual_mat[_mm_extract_epi16(iv, 4)], \ + qual_mat[_mm_extract_epi16(iv, 3)], \ + qual_mat[_mm_extract_epi16(iv, 2)], \ + qual_mat[_mm_extract_epi16(iv, 1)], \ + qual_mat[_mm_extract_epi16(iv, 0)]); \ sc; \ }) #else @@ -739,13 +750,16 @@ struct dz_s *dz_init_intl( return(self); } + +#ifndef DZ_INCLUDE_ONCE +// TODO: qual: i think this doesn't change between qual/non-qual static __dz_vectorize -#ifdef DZ_QUAL_ADJ -struct dz_alignment_init_s dz_qual_adj_align_init( -#else +//#ifdef DZ_QUAL_ADJ +//struct dz_alignment_init_s dz_qual_adj_align_init( +//#else struct dz_alignment_init_s dz_align_init( -#endif +//#endif struct dz_s *self, uint32_t max_gap_len) { @@ -792,6 +806,8 @@ struct dz_alignment_init_s dz_align_init( aln_init.xt = xt; return(aln_init); } + +#endif // DZ_INCLUDE_ONCE #ifdef DZ_FULL_LENGTH_BONUS static __dz_vectorize @@ -810,7 +826,7 @@ struct dz_s *dz_init( uint16_t full_length_bonus) { #ifdef DZ_QUAL_ADJ - return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)) + return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)); #else return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); #endif @@ -833,14 +849,14 @@ struct dz_s *dz_init( //uint64_t max_gap_len) { #ifdef DZ_QUAL_ADJ - return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, 0)) + return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, 0)); #else return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); #endif } #endif // DZ_FULL_LENGTH_BONUS -#ifndef DZ_INCUDE_ONCE +#ifndef DZ_INCLUDE_ONCE static __dz_vectorize void dz_destroy( struct dz_s *self) @@ -850,7 +866,7 @@ void dz_destroy( dz_mem_destroy(dz_mem(self)); return; } -#endif +#endif // DZ_INCLUDE_ONCE static __dz_vectorize #ifdef DZ_QUAL_ADJ @@ -874,8 +890,8 @@ void dz_flush( return; } -#ifndef DZ_INCUDE_ONCE - +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) + #if defined(DZ_NUCL_ASCII) static size_t const dz_unittest_query_length = 560; static char const *dz_unittest_query = @@ -966,15 +982,24 @@ static int8_t const dz_unittest_score_matrix[DZ_MAT_SIZE * DZ_MAT_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -#endif +#endif // DZ_PROTEIN + +#endif // !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) +#ifndef DZ_UNITTEST_SCORE_PARAMS #ifdef DZ_FULL_LENGTH_BONUS #define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1, 0 #else #define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1 #endif +#endif + +#ifndef DZ_UNITTEST_MAX_GAP_LEN #define DZ_UNITTEST_MAX_GAP_LEN 20 +#endif +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) + unittest() { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); ut_assert(dz != NULL); @@ -983,7 +1008,7 @@ unittest() { } -#endif // DZ_INCUDE_ONCE +#endif /** * @fn dz_pack_query, dz_pack_query_reverse @@ -1062,7 +1087,7 @@ struct dz_query_s 
*dz_pack_query_forward( // TODO: qual: probably could make this faster if we apply the <<5 once here and store in 16 bit values #ifdef DZ_QUAL_ADJ - __m128i const qmax = _mm_set1_epu8(DZ_NUM_QUAL_SCORES - 1); + __m128i const qmax = _mm_set1_epi8(DZ_NUM_QUAL_SCORES - 1); __m128i qpv = _mm_setzero_si128(); /* until the end of the query sequence */ @@ -1147,12 +1172,12 @@ struct dz_query_s *dz_pack_query_reverse( // TODO: qual: finish this rev #ifdef DZ_QUAL_ADJ - __m128i const qmax = _mm_set1_epu8(DZ_NUM_QUAL_SCORES - 1); + __m128i const qmax = _mm_set1_epi8(DZ_NUM_QUAL_SCORES - 1); __m128i qpv = _mm_setzero_si128(); /* until the end of the query sequence */ for(size_t i = 0, end = dz_rounddown(qlen, sizeof(__m128i)); i < end; i += sizeof(__m128i)) { - __m128i qtv = _mm_min_epu8(_mm_shuffle_epi8(__mm_loadu_si128((__m128i const *)&qual[qlen - 16 - i]), rv), qmax); + __m128i qtv = _mm_min_epu8(_mm_shuffle_epi8(_mm_loadu_si128((__m128i const *)&qual[qlen - 16 - i]), rv), qmax); _mm_store_si128((__m128i *)&q->arr[pack_size + i], _mm_alignr_epi8(qtv, qpv, 15)); qpv = qtv; /* shift by one to make room for a base */ } @@ -1292,14 +1317,14 @@ struct dz_query_s *dz_pack_query( #endif } else { #ifdef DZ_QUAL_ADJ - return(dz_qual_pack_query_reverse(self, &query[qlen], &qual[qlen], (size_t)-qlen)); + return(dz_qual_adj_pack_query_reverse(self, &query[qlen], &qual[qlen], (size_t)-qlen)); #else return(dz_pack_query_reverse(self, &query[qlen], (size_t)-qlen)); #endif } } -#ifndef DZ_INCLUDE_ONCE +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) unittest() { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); @@ -1310,7 +1335,7 @@ unittest() { dz_destroy(dz); } -#endif // DZ_INCLUDE_ONCE +#endif #define _merge_column(w, adj, forefronts, n_forefronts, query, init_s) ({ \ for(size_t i = 0; i < n_forefronts; i++) { \ @@ -1519,7 +1544,7 @@ struct dz_forefront_s const *dz_scan( #undef _merge_column #undef _fill_column -#ifndef DZ_INCLUDE_ONCE +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) /* short, exact matching sequences */ unittest( "extend.base" ) { @@ -1627,7 +1652,8 @@ unittest( "extend.base.revcomp" ) { } dz_destroy(dz); } -#endif // ! 
DZ_PROTEIN + +#endif // !DZ_PROTEIN /* a small graph */ unittest( "extend.small" ) { @@ -1672,8 +1698,12 @@ unittest( "extend.small" ) { } dz_destroy(dz); } + +#endif // !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) // TODO: qual: this can probably be shared with qual/non-qual + +#ifndef DZ_INCLUDE_ONCE /** * @fn dz_calc_max_rpos @@ -1739,8 +1769,6 @@ uint64_t dz_calc_max_pos( #endif // DZ_INCLUDE_ONCE -// TODO: qual: need to adjust these traceback macros - /* traceback macros */ #define _is_head(_cap) ( (_cap)->r.epos == 0 ) #define _dp(_cap) ( dz_cswgv(_cap) - (_cap)->r.epos ) @@ -1914,7 +1942,7 @@ _trace_tail:; #undef _idx_dsc #undef _load_prev_cap -#ifndef DZ_INCLUDE_ONCE +#if !defined(DZ_UNITTESTS_INCLUDED) && !defined(DZ_QUAL_ADJ) /* short, exact matching sequences */ unittest( "trace" ) { @@ -1939,7 +1967,7 @@ unittest( "trace" ) { dz_destroy(dz); } -#endif // DZ_INCLUDE_ONCE +#endif #ifdef __cplusplus }; /* extern "C" { */ @@ -1947,7 +1975,14 @@ unittest( "trace" ) { // make sure all of the non-duplicated functions are only included once if we also include // include the quality adjusted versions +#ifndef DZ_INCLUDE_ONCE #define DZ_INCLUDE_ONCE +#endif + +// make sure the unittests only get included once and with non-qual-adjusted functions +#if !defined(DZ_QUAL_ADJ) && !defined(DZ_UNITTESTS_INCLUDED) +#define DZ_UNITTESTS_INCLUDED +#endif /** * end of dozeu.h */ From b5e9f917d90f150c85d91e0251427cca001b43fd Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 23 Apr 2020 19:52:14 -0700 Subject: [PATCH 06/43] debug quality adjusted xdrop alignment --- dozeu.h | 153 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 68 deletions(-) diff --git a/dozeu.h b/dozeu.h index d0e969f..fdfcd7f 100644 --- a/dozeu.h +++ b/dozeu.h @@ -181,7 +181,7 @@ struct dz_query_s { uint32_t blen; uint32_t arr_end; char const *q; - int16_t bonus[2 * sizeof(__m128i) / sizeof(int16_t)]; + int16_t bonus[16]; uint8_t arr[]; }; dz_static_assert(sizeof(struct dz_query_s) % sizeof(__m128i) == 0); @@ -555,14 +555,15 @@ unittest() { /* get the scores from the quality matrices (unvectorized, unfortunately) */ \ int8_t const *qual_mat = dz_qual_matrix(self); \ /* TODO: qual: is this the right order? 
*/ \ - __m128i sc = _mm_setr_epi16(qual_mat[_mm_extract_epi16(iv, 7)], \ - qual_mat[_mm_extract_epi16(iv, 6)], \ - qual_mat[_mm_extract_epi16(iv, 5)], \ - qual_mat[_mm_extract_epi16(iv, 4)], \ - qual_mat[_mm_extract_epi16(iv, 3)], \ - qual_mat[_mm_extract_epi16(iv, 2)], \ + __m128i sc = _mm_setr_epi16(qual_mat[_mm_extract_epi16(iv, 0)], \ qual_mat[_mm_extract_epi16(iv, 1)], \ - qual_mat[_mm_extract_epi16(iv, 0)]); \ + qual_mat[_mm_extract_epi16(iv, 2)], \ + qual_mat[_mm_extract_epi16(iv, 3)], \ + qual_mat[_mm_extract_epi16(iv, 4)], \ + qual_mat[_mm_extract_epi16(iv, 5)], \ + qual_mat[_mm_extract_epi16(iv, 6)], \ + qual_mat[_mm_extract_epi16(iv, 7)]); \ + print_vector(sc)\ sc; \ }) #else @@ -576,7 +577,8 @@ unittest() { #define _calc_score_profile(_i) ({ \ __m128i qv = _mm_loadl_epi64((__m128i const *)&parr[(_i) * L]); \ __m128i sc = _mm_cvtepi8_epi16(_mm_shuffle_epi8(_mm_load_si128((__m128i const *)self->matrix), _mm_or_si128(rv, qv))); \ - /* print_vector(_mm_cvtepi8_epi16(rv)); print_vector(_mm_cvtepi8_epi16(qv)); */ \ + print_vector(sc); \ + /* print_vector(_mm_cvtepi8_epi16(rv)); print_vector(_mm_cvtepi8_epi16(qv)); */ \ sc; \ }) #endif @@ -750,18 +752,66 @@ struct dz_s *dz_init_intl( return(self); } - + +#ifdef DZ_FULL_LENGTH_BONUS +static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_s *dz_qual_adj_init( +#else +struct dz_s *dz_init( +#endif + int8_t const *score_matrix, +#ifdef DZ_QUAL_ADJ + int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ +#endif + uint16_t gap_open, + uint16_t gap_extend, + //uint64_t max_gap_len, + uint16_t full_length_bonus) +{ +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_init_intl(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)); +#else + return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); +#endif +} + +#else // DZ_FULL_LENGTH_BONUS + +static __dz_vectorize +#ifdef DZ_QUAL_ADJ +struct dz_s *dz_qual_adj_init( +#else +struct dz_s *dz_init( +#endif + int8_t const *score_matrix, +#ifdef DZ_QUAL_ADJ + int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ +#endif + uint16_t gap_open, + uint16_t gap_extend)//, + //uint64_t max_gap_len) +{ +#ifdef DZ_QUAL_ADJ + return(dz_qual_adj_init_intl(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, 0)); +#else + return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); +#endif +} +#endif // DZ_FULL_LENGTH_BONUS + #ifndef DZ_INCLUDE_ONCE + // TODO: qual: i think this doesn't change between qual/non-qual - + static __dz_vectorize //#ifdef DZ_QUAL_ADJ //struct dz_alignment_init_s dz_qual_adj_align_init( //#else struct dz_alignment_init_s dz_align_init( -//#endif - struct dz_s *self, - uint32_t max_gap_len) + //#endif + struct dz_s *self, + uint32_t max_gap_len) { /* fill the root (the leftmost) column; first init vectors */ @@ -806,55 +856,8 @@ struct dz_alignment_init_s dz_align_init( aln_init.xt = xt; return(aln_init); } - + #endif // DZ_INCLUDE_ONCE - -#ifdef DZ_FULL_LENGTH_BONUS -static __dz_vectorize -#ifdef DZ_QUAL_ADJ -struct dz_s *dz_qual_adj_init( -#else -struct dz_s *dz_init( -#endif - int8_t const *score_matrix, -#ifdef DZ_QUAL_ADJ - int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ -#endif - uint16_t gap_open, - uint16_t gap_extend, - //uint64_t max_gap_len, - uint16_t full_length_bonus) -{ -#ifdef DZ_QUAL_ADJ - 
return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)); -#else - return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); -#endif -} - -#else // DZ_FULL_LENGTH_BONUS - -static __dz_vectorize -#ifdef DZ_QUAL_ADJ -struct dz_s *dz_qual_adj_init( -#else -struct dz_s *dz_init( -#endif - int8_t const *score_matrix, -#ifdef DZ_QUAL_ADJ - int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ -#endif - uint16_t gap_open, - uint16_t gap_extend)//, - //uint64_t max_gap_len) -{ -#ifdef DZ_QUAL_ADJ - return(dz_qual_adj_init(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, 0)); -#else - return(dz_init_intl(score_matrix, gap_open, gap_extend, 0)); -#endif -} -#endif // DZ_FULL_LENGTH_BONUS #ifndef DZ_INCLUDE_ONCE static __dz_vectorize @@ -1029,20 +1032,29 @@ struct dz_query_s *dz_pack_query_forward( #endif size_t qlen) { + debug("entering pack forward"); size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right size_t pack_size = dz_roundup(qlen + 1, L) + sizeof(__m128i); + debug("asking for %d bytes", sizeof(struct dz_query_s) + 2 * pack_size); #ifdef DZ_QUAL_ADJ struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + 2 * pack_size); #else struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); #endif - *q = (struct dz_query_s){ - .blen = (uint32_t) ((qlen == 0 ? 0 : dz_roundup(qlen + 1, L) / L)), - .arr_end = (uint32_t) pack_size, - .q = query, - .bonus = { 0 } - }; + q->blen = qlen == 0 ? 0 : dz_roundup(qlen + 1, L) / L; + q->arr_end = pack_size; + q->q = query; + + _mm_store_si128((__m128i *)q->bonus, _mm_setzero_si128()); + _mm_store_si128(((__m128i *)q->bonus) + 1, _mm_setzero_si128()); +// *q = (struct dz_query_s){ +// .blen = (uint32_t) ((qlen == 0 ? 
0 : dz_roundup(qlen + 1, L) / L)), +// .arr_end = (uint32_t) pack_size, +// .q = query, +// .bonus = { 0 } +// }; + debug("allocated and initialized"); /* * tentative support of full-length bonus; precaclulated bonus score is saved at q->bonus[0] and q->bonus[1]; [0] for non-tail vector and [1] for tail vector */ @@ -1072,6 +1084,7 @@ struct dz_query_s *dz_pack_query_forward( _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); /* shift by one to make room for a base */ pv = tv; } + debug("packed seq main loop"); /* continue the same conversion on the remainings */ // _mm_store_si128((__m128i *)&q->arr[dz_rounddown(qlen, sizeof(__m128i))], _mm_srli_si128(pv, 15)); @@ -1084,6 +1097,7 @@ struct dz_query_s *dz_pack_query_forward( q->arr[i + 1] = qS; } + debug("packed seq tail"); // TODO: qual: probably could make this faster if we apply the <<5 once here and store in 16 bit values #ifdef DZ_QUAL_ADJ @@ -1097,6 +1111,8 @@ struct dz_query_s *dz_pack_query_forward( qpv = qtv; } + debug("packed qual main loop"); + /* continue the same conversion on the remainings */ q->arr[pack_size + dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(qpv, 15); for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { @@ -1505,6 +1521,7 @@ struct dz_forefront_s const *dz_extend( struct dz_forefront_s const **forefronts, size_t n_forefronts, char const *ref, int32_t rlen, uint32_t rid, uint16_t xt) { + debug("entering extend (external)"); #ifdef DZ_QUAL_ADJ return(dz_qual_adj_extend_intl(self, query, forefronts, n_forefronts, ref, rlen, rid, xt, INT16_MIN)); #else From f38c0ffec8b95071993b05c62992aa4d5ef32227 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Sun, 26 Apr 2020 17:01:32 -0700 Subject: [PATCH 07/43] preserve memory alignment when packing qual values --- dozeu.h | 72 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/dozeu.h b/dozeu.h index fdfcd7f..60587a7 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,5 +1,7 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c +//#ifdef DZ_QUAL_ADJ //#define DEBUG +//#endif //#define DZ_PRINT_VECTOR /** * @file dozeu.h @@ -108,7 +110,7 @@ enum dz_alphabet { dz_static_assert(DZ_MAT_SIZE <= 32); #endif // DZ_PROTEIN -#if (defined(DEBUG) || defined(UNITTEST)) && !defined(__cplusplus) +#if (defined(DEBUG) || defined(UNITTEST)) # include "log.h" # define UNITTEST_ALIAS_MAIN 0 # define UNITTEST_UNIQUE_ID 3213 @@ -275,8 +277,28 @@ dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); (int16_t)_mm_extract_epi16(v, 1), \ (int16_t)_mm_extract_epi16(v, 0)); \ } +#define print_vector8(v) { \ +debug("%s (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", #v, \ + (int)_mm_extract_epi8(v, 15), \ + (int)_mm_extract_epi8(v, 14), \ + (int)_mm_extract_epi8(v, 13), \ + (int)_mm_extract_epi8(v, 12), \ + (int)_mm_extract_epi8(v, 11), \ + (int)_mm_extract_epi8(v, 10), \ + (int)_mm_extract_epi8(v, 9), \ + (int)_mm_extract_epi8(v, 8), \ + (int)_mm_extract_epi8(v, 7), \ + (int)_mm_extract_epi8(v, 6), \ + (int)_mm_extract_epi8(v, 5), \ + (int)_mm_extract_epi8(v, 4), \ + (int)_mm_extract_epi8(v, 3), \ + (int)_mm_extract_epi8(v, 2), \ + (int)_mm_extract_epi8(v, 1), \ + (int)_mm_extract_epi8(v, 0)); \ +} #else #define print_vector(v) ; +#define print_vector8(v) ; #endif /** @@ -364,10 +386,10 @@ void dz_mem_destroy( struct dz_mem_s *mem) { struct dz_mem_block_s *blk = mem->blk.next; - debug("cleanup memory chain, blk(%p)", blk); + debug("cleanup memory chain, blk(%p)", blk); 
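/* Editor's aside (not part of the surrounding patch): the hunks in this region
 * rework dozeu's arena allocator so that a request larger than the remaining
 * stack space grows the arena by at least the requested size (the old code grew
 * only when fewer than 4096 bytes remained and passed 0 to the grow routine).
 * A minimal self-contained sketch of that pattern, using hypothetical eg_*
 * names rather than dozeu's real API: */
#include <stdint.h>
#include <stdlib.h>

typedef struct eg_blk { struct eg_blk *next; size_t size; } eg_blk_t;
typedef struct { eg_blk_t *first, *curr; uint8_t *top, *end; } eg_arena_t;

static int eg_arena_grow(eg_arena_t *a, size_t need)
{
	size_t bsize = sizeof(eg_blk_t) + (need > 4096 ? need : 4096);	/* never smaller than the request */
	eg_blk_t *b = (eg_blk_t *)malloc(bsize);
	if(b == NULL) { return(1); }
	b->next = NULL; b->size = bsize;
	if(a->curr == NULL) { a->first = b; } else { a->curr->next = b; }
	a->curr = b;
	a->top = (uint8_t *)(b + 1);
	a->end = (uint8_t *)b + bsize;
	return(0);
}

static void *eg_arena_malloc(eg_arena_t *a, size_t size)
{
	size = (size + 15) & ~(size_t)15;	/* keep 16-byte (__m128i) alignment */
	if(a->curr == NULL || (size_t)(a->end - a->top) < size) {
		if(eg_arena_grow(a, size)) { return(NULL); }	/* pass the request through, not 0 */
	}
	void *p = (void *)a->top; a->top += size;
	return(p);
}
/* End of editor's aside; the patch hunk continues below. */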
while(blk != NULL) { struct dz_mem_block_s *next = blk->next; - debug("free blk(%p), next(%p)", blk, next); + debug("free blk(%p), next(%p)", blk, next); dz_free(blk); blk = next; } dz_free(mem); @@ -378,7 +400,7 @@ uint64_t dz_mem_add_stack( struct dz_mem_s *mem, size_t size) { - debug("add_stack, ptr(%p)", mem->stack.curr->next); + debug("add_stack, ptr(%p)", mem->stack.curr->next); if(mem->stack.curr->next == NULL || dz_unlikely(mem->stack.curr->next->size < size + dz_roundup(sizeof(struct dz_mem_block_s), DZ_MEM_ALIGN_SIZE))) { if (mem->stack.curr->next != NULL) { @@ -397,7 +419,7 @@ uint64_t dz_mem_add_stack( 2 * mem->stack.curr->size ); struct dz_mem_block_s *blk = (struct dz_mem_block_s *)dz_malloc(size); - debug("malloc called, blk(%p)", blk); + debug("malloc called, blk(%p)", blk); if(blk == NULL) { return(1); } /* link new node to the forefront of the current chain */ @@ -422,10 +444,12 @@ void *dz_mem_malloc( // also, where does 4096 come from? // also, don't we need to provide the size to ensure that a large enough block is allocated? //if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } - if(dz_mem_stack_rem(mem) < size) { dz_mem_add_stack(mem, size); } + if(dz_mem_stack_rem(mem) < size) { + dz_mem_add_stack(mem, size); + } void *ptr = (void *)mem->stack.top; mem->stack.top += dz_roundup(size, sizeof(__m128i)); - return(ptr); + return(ptr); } #endif // DZ_INCLUDE_ONCE @@ -1032,30 +1056,22 @@ struct dz_query_s *dz_pack_query_forward( #endif size_t qlen) { - debug("entering pack forward"); size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right - size_t pack_size = dz_roundup(qlen + 1, L) + sizeof(__m128i); - debug("asking for %d bytes", sizeof(struct dz_query_s) + 2 * pack_size); + size_t pack_size = dz_roundup(qlen + 1, sizeof(__m128i)); #ifdef DZ_QUAL_ADJ struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + 2 * pack_size); #else struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + pack_size); #endif - q->blen = qlen == 0 ? 0 : dz_roundup(qlen + 1, L) / L; - q->arr_end = pack_size; - q->q = query; - - _mm_store_si128((__m128i *)q->bonus, _mm_setzero_si128()); - _mm_store_si128(((__m128i *)q->bonus) + 1, _mm_setzero_si128()); -// *q = (struct dz_query_s){ -// .blen = (uint32_t) ((qlen == 0 ? 0 : dz_roundup(qlen + 1, L) / L)), -// .arr_end = (uint32_t) pack_size, -// .q = query, -// .bonus = { 0 } -// }; - debug("allocated and initialized"); - /* + *q = (struct dz_query_s){ + .blen = (uint32_t) ((qlen == 0 ? 
0 : dz_roundup(qlen + 1, L) / L)), + .arr_end = (uint32_t) pack_size, + .q = query, + .bonus = { 0 } + }; + + /* * tentative support of full-length bonus; precaclulated bonus score is saved at q->bonus[0] and q->bonus[1]; [0] for non-tail vector and [1] for tail vector */ q->bonus[L + (qlen % L)] = self->bonus; @@ -1084,7 +1100,6 @@ struct dz_query_s *dz_pack_query_forward( _mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); /* shift by one to make room for a base */ pv = tv; } - debug("packed seq main loop"); /* continue the same conversion on the remainings */ // _mm_store_si128((__m128i *)&q->arr[dz_rounddown(qlen, sizeof(__m128i))], _mm_srli_si128(pv, 15)); @@ -1097,7 +1112,6 @@ struct dz_query_s *dz_pack_query_forward( q->arr[i + 1] = qS; } - debug("packed seq tail"); // TODO: qual: probably could make this faster if we apply the <<5 once here and store in 16 bit values #ifdef DZ_QUAL_ADJ @@ -1110,9 +1124,7 @@ struct dz_query_s *dz_pack_query_forward( _mm_store_si128((__m128i *)&q->arr[pack_size + i], _mm_alignr_epi8(qtv, qpv, 15)); /* shift by one to make room for a base */ qpv = qtv; } - - debug("packed qual main loop"); - + /* continue the same conversion on the remainings */ q->arr[pack_size + dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(qpv, 15); for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) { @@ -1141,7 +1153,7 @@ struct dz_query_s *dz_pack_query_reverse( { size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right - size_t pack_size = dz_roundup(qlen + 1, L) + sizeof(__m128i); + size_t pack_size = dz_roundup(qlen + 1, sizeof(__m128i)); #ifdef DZ_QUAL_ADJ struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + 2 * pack_size); #else From afd770866c803f275cebe60222c23ced2979bbda Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 28 Apr 2020 13:33:54 -0700 Subject: [PATCH 08/43] fix bugs in DP and traceback --- dozeu.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/dozeu.h b/dozeu.h index 60587a7..2428436 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,5 +1,5 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c -//#ifdef DZ_QUAL_ADJ +//#ifndef DZ_QUAL_ADJ //#define DEBUG //#endif //#define DZ_PRINT_VECTOR @@ -112,6 +112,11 @@ enum dz_alphabet { #if (defined(DEBUG) || defined(UNITTEST)) # include "log.h" +#else +# define debug(...) ; +#endif + +#ifdef UNITTEST # define UNITTEST_ALIAS_MAIN 0 # define UNITTEST_UNIQUE_ID 3213 # include "unittest.h" @@ -120,7 +125,6 @@ unittest() { debug("hello"); } #else # define unittest(...) static void dz_pp_cat(dz_unused_, __LINE__)(void) # define ut_assert(...) ; -# define debug(...) ; # define trap() ; #endif @@ -497,7 +501,7 @@ unittest() { #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ - /* allocate from heap TODO: shouldn't we make sure we get enough allocation? 
*/ \ + /* allocate from heap */ \ if(dz_mem_stack_rem(dz_mem(self)) < next_req) { dz_mem_add_stack(dz_mem(self), next_req); } /* 0); }*/ \ /* push head-cap */ \ struct dz_cap_s *cap = _init_cap(_adj, 0xff, _forefronts, _n_forefronts); \ @@ -635,12 +639,15 @@ unittest() { #define _load_vector(_p) \ __m128i e = _mm_load_si128((__m128i const *)(&dz_cswgv(_p)->e)); /* print_vector(e); */ \ __m128i s = _mm_load_si128((__m128i const *)(&dz_cswgv(_p)->s)); /* print_vector(s); */ + + #define _update_vector(_p) { \ __m128i sc = _calc_score_profile(_p); \ __m128i te = _mm_subs_epi16(_mm_max_epi16(e, _mm_subs_epi16(s, giv)), gev1); \ /* print_vector(_mm_alignr_epi8(s, ps, 14)); print_vector(sc); */ \ - __m128i ts = _mm_max_epi16(te, _mm_adds_epi16(sc, _mm_alignr_epi8(s, ps, 14))); ps = s; \ - __m128i tf = _mm_max_epi16(_mm_subs_epi16(ts, giv), _mm_subs_epi16(_mm_alignr_epi8(minv, f, 14), gev1)); \ + __m128i ts = _mm_max_epi16(te, _mm_adds_epi16(sc, _mm_alignr_epi8(s, ps, 14))); \ + ps = s; \ + __m128i tf = _mm_subs_epi16(_mm_max_epi16(_mm_subs_epi16(_mm_alignr_epi8(ts, pts, 14), giv), _mm_alignr_epi8(minv, f, 14)), gev1); \ tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 14), gev1)); \ tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 12), gev2)); \ tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 8), gev4)); \ @@ -648,6 +655,7 @@ unittest() { maxv = _mm_max_epi16(maxv, _add_bonus(_p, ts)); \ /* print_vector(te); print_vector(_add_bonus(_p, ts)); print_vector(tf); print_vector(maxv);*/ \ e = te; f = tf; s = ts; \ + pts = ts; \ } #define _store_vector(_p) { \ _mm_store_si128((__m128i *)(&dz_swgv(_p)->e), e); \ @@ -852,7 +860,7 @@ struct dz_alignment_init_s dz_align_init( uint16_t xt = self->giv[0] + self->gev[0] * max_gap_len; // calculate the x-drop threshold for this gap length __m128i xtv = _mm_set1_epi16(-xt); - + /* e, f, and s needed for _store_vector */ __m128i const e = _mm_set1_epi16(DZ_CELL_MIN); /* until the X-drop test fails on all the cells in a vector */ @@ -1414,7 +1422,7 @@ unittest() { _init_rch(query, rt, rrem); \ struct dz_swgv_s *cdp = _begin_column(w, rch, rrem); \ /* init vectors */ \ - __m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN); \ + __m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN), pts = _mm_set1_epi16(INT16_MIN); \ __m128i const xtv = _mm_set1_epi16(w.inc - xt); /* next offset == current max thus X-drop threshold is always -xt */ \ /* until the bottommost vertically placed band... 
*/ \ uint32_t sspos = w.r.spos; /* save spos on the stack */ \ @@ -1892,16 +1900,16 @@ struct dz_alignment_s *dz_trace( (score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx)))); \ } #define _match(_idx) { \ - if(dz_inside(pcap->r.spos, _vector_idx(idx - 1), pcap->r.epos) \ - && score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx))) { \ - uint64_t eq = dz_pair_eq(self, query, rch, idx); \ - *--path = DZ_CIGAR_OP>>(eq<<3); cnt[eq]++; \ - score = _s(s, pcap, idx - 1); idx--; rch = _load_prev_cap(s, score, _idx); \ - continue; \ + if(dz_inside(pcap->r.spos, _vector_idx(idx - 1), pcap->r.epos) \ + && score == (_s(s, pcap, idx - 1) + dz_pair_score(self, query, rch, idx))) { \ + uint64_t eq = dz_pair_eq(self, query, rch, idx); \ + *--path = DZ_CIGAR_OP>>(eq<<3); cnt[eq]++; \ + score = _s(s, pcap, idx - 1); idx--; rch = _load_prev_cap(s, score, _idx); \ + continue; \ } \ } #define _ins(_idx) { \ - if(_vector_idx(idx - 1) >= cap->r.spos && score == _s(f, cap, idx)) { \ + if(dz_inside(cap->r.spos, _vector_idx(idx - 1), cap->r.epos)&& score == _s(f, cap, idx)) { \ _debug(I); \ while(_vector_idx(idx - 1) >= cap->r.spos && score != _s(s, cap, idx - 1) - self->gev[0] - self->giv[0]) { \ *--path = (DZ_CIGAR_OP>>16) & 0xff; cnt[2]++; score = _s(f, cap, idx - 1); idx--; _debug(I); \ @@ -1914,9 +1922,9 @@ struct dz_alignment_s *dz_trace( if(dz_inside(pcap->r.spos, _vector_idx(idx), pcap->r.epos) && score == _s(e, cap, idx)) { \ _debug(D); \ while(dz_inside(pcap->r.spos, _vector_idx(idx), pcap->r.epos) && score == _s(e, pcap, idx) - self->gev[0]) { \ - *--path = (DZ_CIGAR_OP>>24) & 0xff; cnt[3]++; score = _s(e, pcap, idx); _load_prev_cap(e, score, _idx); _debug(D); \ + *--path = (DZ_CIGAR_OP>>24) & 0xff; cnt[3]++; score = _s(e, pcap, idx); _load_prev_cap(e, score, _idx); _debug(D); \ } \ - *--path = (DZ_CIGAR_OP>>24) & 0xff; cnt[3]++; score = _s(s, pcap, idx); rch = _load_prev_cap(s, score, _idx); \ + *--path = (DZ_CIGAR_OP>>24) & 0xff; cnt[3]++; score = _s(s, pcap, idx); rch = _load_prev_cap(s, score, _idx); \ continue; \ } \ } From dac13b1d53655604b2dc5ad3ef929da62eb1a606 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Tue, 28 Apr 2020 15:12:57 -0700 Subject: [PATCH 09/43] fix traceback bug --- dozeu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dozeu.h b/dozeu.h index 2428436..a04db8a 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,5 +1,5 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c -//#ifndef DZ_QUAL_ADJ +//#ifdef DZ_QUAL_ADJ //#define DEBUG //#endif //#define DZ_PRINT_VECTOR @@ -1901,7 +1901,7 @@ struct dz_alignment_s *dz_trace( } #define _match(_idx) { \ if(dz_inside(pcap->r.spos, _vector_idx(idx - 1), pcap->r.epos) \ - && score == (_s(s, pcap, idx - 1) + dz_pair_score(self, query, rch, idx))) { \ + && score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx))) { \ uint64_t eq = dz_pair_eq(self, query, rch, idx); \ *--path = DZ_CIGAR_OP>>(eq<<3); cnt[eq]++; \ score = _s(s, pcap, idx - 1); idx--; rch = _load_prev_cap(s, score, _idx); \ @@ -1909,7 +1909,7 @@ struct dz_alignment_s *dz_trace( } \ } #define _ins(_idx) { \ - if(dz_inside(cap->r.spos, _vector_idx(idx - 1), cap->r.epos)&& score == _s(f, cap, idx)) { \ + if(dz_inside(cap->r.spos, _vector_idx(idx - 1), cap->r.epos) && score == _s(f, cap, idx)) { \ _debug(I); \ while(_vector_idx(idx - 1) >= cap->r.spos && score != _s(s, cap, idx - 1) - self->gev[0] - self->giv[0]) { \ *--path = (DZ_CIGAR_OP>>16) & 0xff; cnt[2]++; score = _s(f, cap, idx - 1); idx--; _debug(I); \ From 
180db30841524a5c56db257a5a5129ed8716e9d3 Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Sun, 9 Feb 2020 13:21:59 -0800
Subject: [PATCH 10/43] Remove unnecessary ifdef around __SSE4_1__

Dozeu already depends on the SSE4.1 API, this wasn't doing any good.
---
 dozeu.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/dozeu.h b/dozeu.h
index a04db8a..b7c8352 100644
--- a/dozeu.h
+++ b/dozeu.h
@@ -142,12 +142,7 @@ unittest() { debug("hello"); }
 #ifndef __x86_64__
 # error "x86_64 is required"
 #endif
-#ifndef __SSE4_1__
-# warning "SSE4.1 is automatically enabled in dozeu.h, please check compatibility to the system."
-# define __dz_vectorize __attribute__(( target( "sse4.1" ) ))
-#else
-# define __dz_vectorize /* follow the compiler options */
-#endif
+#define __dz_vectorize /* follow the compiler options */
 
 /* inlining (FIXME: add appropriate __force_inline flag) */
 #define __dz_force_inline inline
@@ -166,11 +161,7 @@ unittest() { debug("hello"); }
 #define dz_loadu_u64(p) ({ uint8_t const *_p = (uint8_t const *)(p); *((uint64_t const *)_p); })
 #define dz_storeu_u64(p, e) { uint8_t *_p = (uint8_t *)(p); *((uint64_t *)(_p)) = (e); }
 
-#ifdef __SSE4_1__
-# define dz_is_all_zero(x) ( _mm_test_all_zeros((x), (x)) == 1 )
-#else
-# define dz_is_all_zero(x) ( _mm_movemask_epi8((x)) == 0 )
-#endif
+#define dz_is_all_zero(x) ( _mm_test_all_zeros((x), (x)) == 1 )
 
 #define DZ_MEM_MARGIN_SIZE ( 256 )
 #define DZ_MEM_ALIGN_SIZE ( 16 )

From f21411d31fa138488b2d056ba487a43fd48c63d6 Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Sun, 9 Feb 2020 12:53:11 -0800
Subject: [PATCH 11/43] Use <smmintrin.h> instead of <x86intrin.h>

The latter is unavailable on many compilers.
---
 dozeu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dozeu.h b/dozeu.h
index b7c8352..06f2ca9 100644
--- a/dozeu.h
+++ b/dozeu.h
@@ -44,7 +44,7 @@ extern "C" {
 #include
 #include
 #include
-#include <x86intrin.h>
+#include <smmintrin.h>
 
 #ifndef DZ_INCLUDE_ONCE
 
From 73c1d2b2b4494cee1b0d7335f8f78fcfd687f281 Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Sun, 9 Feb 2020 13:04:27 -0800
Subject: [PATCH 12/43] Allow alternate implementations of the SSE4.1 API.

---
 dozeu.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dozeu.h b/dozeu.h
index 06f2ca9..8058c49 100644
--- a/dozeu.h
+++ b/dozeu.h
@@ -44,7 +44,15 @@ extern "C" {
 #include
 #include
 #include
+
+/* _MM_FROUND_CUR_DIRECTION is defined in smmintrin.h (the include
+   file for SSE4.1), so if it's already defined we assume that the
+   SSE4.1 API is available and we can skip including <smmintrin.h>.
+   This allows Dozeu to use alternative implementations (e.g.,
+   ) without depending on them. */
+#if !defined(_MM_FROUND_CUR_DIRECTION)
 #include <smmintrin.h>
+#endif
 
 #ifndef DZ_INCLUDE_ONCE
 
From 0ba4f19db001a703a7e10cf3268e025735ecf349 Mon Sep 17 00:00:00 2001
From: "Michael R. 
Crusoe" Date: Thu, 25 Jun 2020 16:22:54 +0200 Subject: [PATCH 13/43] use the SIMD Everywhere library --- .gitmodules | 4 ++++ dozeu.h | 13 ++----------- simde | 1 + 3 files changed, 7 insertions(+), 11 deletions(-) create mode 100644 .gitmodules create mode 160000 simde diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d7ab9db --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "simde"] + path = simde + url = https://github.com/nemequ/simde-no-tests + branch = 2931676 diff --git a/dozeu.h b/dozeu.h index 8058c49..61acc9d 100644 --- a/dozeu.h +++ b/dozeu.h @@ -45,14 +45,8 @@ extern "C" { #include #include -/* _MM_FROUND_CUR_DIRECTION is defined in smmintrin.h (the include - file for SSE4.1), so if it's already defined we assume that the - SSE4.1 API is available and we can skip including . - This allows Dozeu to use alternative implementations (e.g., - ) without depending on them. */ -#if !defined(_MM_FROUND_CUR_DIRECTION) -#include -#endif +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse4.1.h" #ifndef DZ_INCLUDE_ONCE @@ -147,9 +141,6 @@ unittest() { debug("hello"); } #define dz_ut_sel(a, b, c) ( (DZ_UNITTEST_INDEX == 0) ? (a) : ((DZ_UNITTEST_INDEX == 1) ? (b) : (c)) ) /* vectorize */ -#ifndef __x86_64__ -# error "x86_64 is required" -#endif #define __dz_vectorize /* follow the compiler options */ /* inlining (FIXME: add appropriate __force_inline flag) */ diff --git a/simde b/simde new file mode 160000 index 0000000..2931676 --- /dev/null +++ b/simde @@ -0,0 +1 @@ +Subproject commit 2931676b5fc30ab3f782759c75f534a36cd2fad4 From d87c266ab68bcafd27685e2f06e55586d10628c1 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 1 Jul 2020 15:12:59 -0700 Subject: [PATCH 14/43] fix small bug in dp --- dozeu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/dozeu.h b/dozeu.h index a04db8a..e915989 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1431,6 +1431,7 @@ unittest() { _update_vector(p); \ if(dz_unlikely(_test_xdrop(s, xtv))) { /* mark _unlikely to move out of the core loop */ \ /* drop either leading or trailing vector, skip the forefront extension when the forefront is clipped */ \ + pts = _mm_set1_epi16(INT16_MIN); \ if(p == w.r.spos) { \ w.r.spos++; cdp--; continue; \ } else { \ From 9d1e9053eb595cff9f8cdb6e2a212e7f84dccff8 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 10 Jul 2020 16:48:26 -0700 Subject: [PATCH 15/43] incorporate full length bonus into dp --- dozeu.h | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/dozeu.h b/dozeu.h index e915989..c6b417a 100644 --- a/dozeu.h +++ b/dozeu.h @@ -79,14 +79,20 @@ enum dz_alphabet { rN = 0x90, qN = 0x90, qS = 0x02 /* pshufb instruction clears the column when the 7-th bit of the index is set */ #endif }; +#ifdef DZ_FULL_LENGTH_BONUS +#define dz_end_bonus(_self, _query, _i) ((_i) / 8 == (_query)->blen - 1 ? 
(_query)->bonus[8 + ((_i) & 7)] : 0) +#else +#define dz_end_bonus(_self, _query, _i) 0; +#endif + /* get the first index of the quals from a packed query (only valid if DZ_QUAL_ADJ is defined) */ #define dz_quals(_query) ( (uint8_t const *) &(_query)->arr[(_query)->arr_end] ) /* get the base quality adjusted matrix (only valid if DZ_QUAL_ADJ is defined) */ #define dz_qual_matrix(_self) ( (int8_t const *)((_self) + 1) ) -#define dz_pair_score(_self, _q, _r, _i) ( (_self)->matrix[((_r) | (_q)->arr[(_i)]) & 0x1f] ) +#define dz_pair_score(_self, _q, _r, _i) ( (_self)->matrix[((_r) | (_q)->arr[(_i)]) & 0x1f] + dz_end_bonus(_self, _q, _i)) -#define dz_qual_adj_pair_score(_self, _q, _r, _i) ( dz_qual_matrix(_self)[(((uint32_t) dz_quals(_q)[(_i)]) << 5) | ((uint32_t)(_r)) | ((uint32_t)((_q)->arr[(_i)] & 0x1f))] ) +#define dz_qual_adj_pair_score(_self, _q, _r, _i) ( dz_qual_matrix(_self)[(((uint32_t) dz_quals(_q)[(_i)]) << 5) | ((uint32_t)(_r)) | ((uint32_t)((_q)->arr[(_i)] & 0x1f))] + dz_end_bonus(_self, _q, _i) ) #define dz_pair_eq(_self, _q, _r, _i) ( (uint32_t)((_q)->arr[(_i)]) == ((uint32_t)(_r)<<2) ) @@ -96,7 +102,7 @@ enum dz_alphabet { # ifndef DZ_MAT_SIZE # define DZ_MAT_SIZE ( 32 ) # endif -#define dz_pair_score(_self, _q, _r, _i) ( (int8_t)((_q)->arr[(_r) * (_q)->blen * L + (_i)]) ) +#define dz_pair_score(_self, _q, _r, _i) ( (int8_t)((_q)->arr[(_r) * (_q)->blen * L + (_i)]) + dz_end_bonus(_self, _q, _i)) #define dz_pair_eq(_self, _q, _r, _i) ( (uint32_t)((_q)->q[(_i) - 1] & 0x1f) == (uint32_t)(_r) ) #endif @@ -642,7 +648,7 @@ unittest() { #define _update_vector(_p) { \ - __m128i sc = _calc_score_profile(_p); \ + __m128i sc = _add_bonus(_p, _calc_score_profile(_p)); \ __m128i te = _mm_subs_epi16(_mm_max_epi16(e, _mm_subs_epi16(s, giv)), gev1); \ /* print_vector(_mm_alignr_epi8(s, ps, 14)); print_vector(sc); */ \ __m128i ts = _mm_max_epi16(te, _mm_adds_epi16(sc, _mm_alignr_epi8(s, ps, 14))); \ @@ -652,7 +658,7 @@ unittest() { tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 12), gev2)); \ tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 8), gev4)); \ ts = _mm_max_epi16(ts, tf); print_vector(ts); \ - maxv = _mm_max_epi16(maxv, _add_bonus(_p, ts)); \ + maxv = _mm_max_epi16(maxv, ts); \ /* print_vector(te); print_vector(_add_bonus(_p, ts)); print_vector(tf); print_vector(maxv);*/ \ e = te; f = tf; s = ts; \ pts = ts; \ @@ -1748,10 +1754,8 @@ unittest( "extend.small" ) { */ static __dz_vectorize int64_t dz_calc_max_rpos( - struct dz_s *self, struct dz_forefront_s const *forefront) { - (void)self; struct dz_cap_s const *pcap = forefront->mcap; int32_t rpos = (pcap == dz_ccap(forefront)) ? 
0 : pcap->rrem; return((int64_t)rpos); /* signed expansion */ @@ -1761,11 +1765,8 @@ int64_t dz_calc_max_rpos( * @fn dz_calc_max_qpos */ static __dz_vectorize -uint64_t dz_calc_max_qpos( - struct dz_s *self, - struct dz_forefront_s const *forefront) +uint64_t dz_calc_max_qpos(struct dz_forefront_s const *forefront) { - (void)self; size_t const L = sizeof(__m128i) / sizeof(uint16_t); #define _dp(_cap) ( (struct dz_swgv_s const *)(_cap) - (_cap)->r.epos ) @@ -1773,7 +1774,7 @@ uint64_t dz_calc_max_qpos( struct dz_cap_s const *pcap = forefront->mcap; __m128i const maxv = _mm_set1_epi16(forefront->inc); for(uint64_t p = pcap->r.spos; p < pcap->r.epos; p++) { - __m128i const s = _add_bonus(p, _mm_load_si128(&_dp(pcap)[p].s)); print_vector(s); + __m128i const s = _mm_load_si128(&_dp(pcap)[p].s); print_vector(s); uint64_t eq = _mm_movemask_epi8(_mm_cmpeq_epi16(s, maxv)); if(eq != 0) { /* tzcntq is faster but avoid using it b/c it requires relatively newer archs */ @@ -1795,14 +1796,12 @@ uint64_t dz_calc_max_qpos( * @brief rpos (signed int) in upper 32bit, qpos (unsigned int) in lower 32bit */ static __dz_vectorize -uint64_t dz_calc_max_pos( - struct dz_s *self, - struct dz_forefront_s const *forefront) +uint64_t dz_calc_max_pos(struct dz_forefront_s const *forefront) { struct dz_cap_s const *pcap = forefront->mcap; if(pcap == NULL) { debug("pcap == NULL, rlen(%d)", (int32_t)forefront->rlen); return(((uint64_t)forefront->rlen)<<32); } /* is root */ debug("forefront(%p), pcap(%p), rrem(%d), rlen(%d)", forefront, pcap, (int32_t)pcap->rrem, (int32_t)forefront->rlen); - return((dz_calc_max_rpos(self, forefront)<<32) | dz_calc_max_qpos(self, forefront)); + return((dz_calc_max_rpos(forefront)<<32) | dz_calc_max_qpos(forefront)); } #endif // DZ_INCLUDE_ONCE @@ -1865,7 +1864,7 @@ struct dz_alignment_s *dz_trace( if(forefront->mcap == NULL) { debug("mcap is null"); return(NULL); } /* detect pos */ - uint64_t idx = dz_calc_max_qpos(self, forefront); /* vector index, cell index */ + uint64_t idx = dz_calc_max_qpos(forefront); /* vector index, cell index */ uint64_t ref_length = 0, query_length = idx; /* allocate aln object */ @@ -1894,13 +1893,14 @@ struct dz_alignment_s *dz_trace( /* traceback loop */ #define _debug(_label) { \ - debug("test %s, idx(%lu), rbase(%d, %c), qbase(%c), c(%d), e(%d), f(%d), s(%d), score(%d, %lu), diag(%d)", \ + debug("test %s, idx(%lu), rbase(%d, %c), qbase(%c), c(%d), e(%d), f(%d), s(%d), score(%d, %d, %lu), diag(%d)", \ #_label, idx, rch, '-', '-', /*"ACGTNNNNNNNNNNNN"[rch & 0xff], "ANNNCNNNGNNNTNNN"[query->arr[idx]],*/ \ score, _s(e, cap, idx), _s(f, cap, idx), _s(s, pcap, idx - 1), \ _pair_score(self, query, rch, idx), (uint64_t)dz_pair_eq(self, query, rch, idx), \ - (score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx)))); \ + dz_end_bonus(self, query, idx), (score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx)))); \ } #define _match(_idx) { \ + _debug(M); \ if(dz_inside(pcap->r.spos, _vector_idx(idx - 1), pcap->r.epos) \ && score == (_s(s, pcap, idx - 1) + _pair_score(self, query, rch, idx))) { \ uint64_t eq = dz_pair_eq(self, query, rch, idx); \ @@ -1931,7 +1931,12 @@ struct dz_alignment_s *dz_trace( } uint32_t rch = _load_prev_cap(s, score, _idx_asc); - while(1) { _debug(M); _match(_idx_asc); _ins(_idx_asc); _del(_idx_asc); dz_trap(); } + while(1) { + _match(_idx_asc); + _ins(_idx_asc); + _del(_idx_asc); + dz_trap(); + } _trace_tail:; span++; /* adjust root span */ @@ -1947,7 +1952,7 @@ _trace_tail:; aln->path_length = path_length; 
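/* Editor's aside (not part of the surrounding patch): dz_calc_max_pos(), shown
 * earlier in this patch, packs the reference-side position (signed) into the
 * upper 32 bits and the query-side position (unsigned) into the lower 32 bits
 * of a single uint64_t. A caller would unpack the pair roughly as below; the
 * eg_* names are illustrative only. */
#include <stdint.h>

static inline int32_t eg_unpack_rpos(uint64_t packed) { return((int32_t)(packed >> 32)); }
static inline uint32_t eg_unpack_qpos(uint64_t packed) { return((uint32_t)(packed & 0xffffffffu)); }
/* End of editor's aside; the patch hunk continues below. */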
aln->ref_length = ref_length; aln->query_length = query_length; - aln->rrem = dz_calc_max_rpos(self, forefront); + aln->rrem = dz_calc_max_rpos(forefront); aln->score = forefront->max; aln->mismatch_count = cnt[0]; aln->match_count = cnt[1]; From 10d70a7388db38fea600c3867c7262fe70432be1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 13 Jul 2020 17:07:52 -0700 Subject: [PATCH 16/43] Use a simde that builds on Clang 11 --- simde | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simde b/simde index 2931676..3a4c5bb 160000 --- a/simde +++ b/simde @@ -1 +1 @@ -Subproject commit 2931676b5fc30ab3f782759c75f534a36cd2fad4 +Subproject commit 3a4c5bbc84320d846b35944a0b022b67b386413b From d0e6efe1b7e32cda0942c003e4d07150011a8698 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 13 Jul 2020 17:33:57 -0700 Subject: [PATCH 17/43] Avoid complaints about extern C templates from simde --- dozeu.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/dozeu.h b/dozeu.h index cc6359a..5e26aef 100644 --- a/dozeu.h +++ b/dozeu.h @@ -26,9 +26,6 @@ * 2. Myers bit vector (describes vertical (horizontal) tiling) * Gene Myers, A fast bit-vector algorithm for approximate string matching based on dynamic programming, JACM (1999) */ -#ifdef __cplusplus -extern "C" { -#endif /* make sure POSIX APIs are properly activated */ #if defined(__linux__) && !defined(_POSIX_C_SOURCE) @@ -39,15 +36,20 @@ extern "C" { # define _BSD_SOURCE #endif +#define SIMDE_ENABLE_NATIVE_ALIASES +#include "simde/x86/sse4.1.h" + +// Only turn on extern "C" after including code that can use C++ features. +#ifdef __cplusplus +extern "C" { +#endif + #include #include #include #include #include -#define SIMDE_ENABLE_NATIVE_ALIASES -#include "simde/x86/sse4.1.h" - #ifndef DZ_INCLUDE_ONCE #define DZ_NUM_QUAL_SCORES 64 @@ -2016,6 +2018,9 @@ unittest( "trace" ) { #if !defined(DZ_QUAL_ADJ) && !defined(DZ_UNITTESTS_INCLUDED) #define DZ_UNITTESTS_INCLUDED #endif + +#undef SIMDE_ENABLE_NATIVE_ALIASES + /** * end of dozeu.h */ From 89c8e1e5b906e830bbefe9753fe0069053a0e384 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 27 Jul 2020 19:06:04 -0700 Subject: [PATCH 18/43] handle xdrops between two non-xdrop vectors --- dozeu.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/dozeu.h b/dozeu.h index 5e26aef..71da0a4 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1424,6 +1424,7 @@ unittest() { __m128i const xtv = _mm_set1_epi16(w.inc - xt); /* next offset == current max thus X-drop threshold is always -xt */ \ /* until the bottommost vertically placed band... 
*/ \ uint32_t sspos = w.r.spos; /* save spos on the stack */ \ + uint64_t first_drop = w.r.epos;\ for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ _load_vector(&pdp[p]); \ _update_vector(p); \ @@ -1433,12 +1434,19 @@ unittest() { if(p == w.r.spos) { \ w.r.spos++; cdp--; continue; \ } else { \ - w.r.epos = p; goto dz_pp_cat(_forefront_, __LINE__); \ + first_drop = dz_min2(p, first_drop); \ } \ - } \ + } else { \ + first_drop = w.r.epos; /* reset the xdrop to indicate that any xdrops we found are not contiguous */ \ + } \ _store_vector(&cdp[p]); \ } \ - /* if reached the forefront of the query sequence, finish the extension */ \ + if (first_drop < w.r.epos) { \ + /* we xdropped a contiguous block of vectors at the end of this column */\ + w.r.epos = first_drop; \ + goto dz_pp_cat(_forefront_, __LINE__); \ + } \ + /* if reached the forefront of the query sequence, finish the extension */ \ if(w.r.epos < query->blen) { \ /* forefront extension; clip the column length if too long */ \ __m128i e = minv, s = minv; _update_vector(w.r.epos); \ From d8c2d867d1929f0b640b9dc13a810ccc1e34a257 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 12 Oct 2020 12:57:50 -0700 Subject: [PATCH 19/43] allow full length bonus to be selected per read --- dozeu.h | 103 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 36 deletions(-) diff --git a/dozeu.h b/dozeu.h index 71da0a4..cfe7ef2 100644 --- a/dozeu.h +++ b/dozeu.h @@ -254,7 +254,7 @@ struct dz_mem_s { struct dz_mem_block_s blk; struct dz_stack_s stack; }; struct dz_s { int8_t matrix[32]; - uint16_t giv[8], gev[8], riv[8], rev[8], bonus, _pad[7]; // padding ensures memory alignment + uint16_t giv[8], gev[8], riv[8], rev[8]; // padding ensures memory alignment int8_t protein_matrix[]; }; dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); @@ -686,9 +686,7 @@ struct dz_s *dz_init_intl( int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ #endif uint16_t gap_open, /* gap penalties in positive */ - uint16_t gap_extend, - //uint64_t max_gap_len, /* as X-drop threshold */ - uint16_t full_length_bonus) /* end-to-end mapping bonus; only activated when compiled with -DDZ_FULL_LENGTH_BONUS */ + uint16_t gap_extend) { /* static uint8_t const transpose[16] __attribute__(( aligned(16) )) = { @@ -742,17 +740,8 @@ struct dz_s *dz_init_intl( __m128i const gev = _mm_set1_epi16(gap_extend); _mm_store_si128((__m128i *)self->giv, giv); _mm_store_si128((__m128i *)self->gev, gev); - // TODO: see if i can remove this to cut down on state - //self->xt = 0; /* X-drop threshold */ - self->bonus = full_length_bonus; - //self->max_gap_len = max_gap_len; /* save raw value */ - debug("gi(%u), ge(%u), full_length_bonus(%u)", gap_open, gap_extend, self->bonus); - - // TODO: i'm pretty sure this cap is extraneous and never used - ///* create root head */ - //struct dz_cap_s *cap = (struct dz_cap_s *)dz_mem_malloc(mem, sizeof(struct dz_cap_s)); - //_mm_store_si128((__m128i *)cap, _mm_setzero_si128()); - + debug("gi(%u), ge(%u)", gap_open, gap_extend); + /* initialize and memoize the vectors we need to compute the root */ __m128i riv = _mm_setr_epi16( 0, @@ -795,14 +784,12 @@ struct dz_s *dz_init( int8_t const *qual_adj_score_matrix, /* series of DZ_NUM_QUAL_SCORES adjusted score matrices for each phred base qual (no offset) */ #endif uint16_t gap_open, - uint16_t gap_extend, - //uint64_t max_gap_len, - uint16_t full_length_bonus) + uint16_t gap_extend) { 
#ifdef DZ_QUAL_ADJ - return(dz_qual_adj_init_intl(score_matrix, qual_adj_score_matrix, gap_open, gap_extend, full_length_bonus)); + return(dz_qual_adj_init_intl(score_matrix, qual_adj_score_matrix, gap_open, gap_extend)); #else - return(dz_init_intl(score_matrix, gap_open, gap_extend, full_length_bonus)); + return(dz_init_intl(score_matrix, gap_open, gap_extend)); #endif } @@ -1021,7 +1008,7 @@ static int8_t const dz_unittest_score_matrix[DZ_MAT_SIZE * DZ_MAT_SIZE] = { #ifndef DZ_UNITTEST_SCORE_PARAMS #ifdef DZ_FULL_LENGTH_BONUS -#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1, 0 +#define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1 #else #define DZ_UNITTEST_SCORE_PARAMS dz_unittest_score_matrix, 5, 1 #endif @@ -1060,7 +1047,10 @@ struct dz_query_s *dz_pack_query_forward( #ifdef DZ_QUAL_ADJ uint8_t const *qual, #endif - size_t qlen) +#ifdef DZ_FULL_LENGTH_BONUS + int8_t full_length_bonus, +#endif + size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); // TODO: qual: make sure I'm doing this right @@ -1080,8 +1070,9 @@ struct dz_query_s *dz_pack_query_forward( /* * tentative support of full-length bonus; precaclulated bonus score is saved at q->bonus[0] and q->bonus[1]; [0] for non-tail vector and [1] for tail vector */ - q->bonus[L + (qlen % L)] = self->bonus; - + #ifdef DZ_FULL_LENGTH_BONUS + q->bonus[L + (qlen % L)] = full_length_bonus; + #endif /* * ASCII to 2-bit conversion table (shifted by two bits to be used as the upper (row) shuffle index) * @ABC_DEFG_HIJK_LMNO @@ -1154,6 +1145,9 @@ struct dz_query_s *dz_pack_query_reverse( char const *query, #ifdef DZ_QUAL_ADJ uint8_t const *qual, +#endif +#ifdef DZ_FULL_LENGTH_BONUS + int8_t full_length_bonus, #endif size_t qlen) { @@ -1171,7 +1165,9 @@ struct dz_query_s *dz_pack_query_reverse( .q = query, .bonus = { 0 } }; - q->bonus[L + (qlen % L)] = self->bonus; + #ifdef DZ_FULL_LENGTH_BONUS + q->bonus[L + (qlen % L)] = full_length_bonus; + #endif static uint8_t const conv[16] __attribute__(( aligned(16) )) = { #ifdef DZ_NUCL_ASCII @@ -1234,7 +1230,11 @@ struct dz_query_s *dz_pack_query_reverse( static __dz_vectorize struct dz_query_s *dz_pack_query_forward( struct dz_s *self, - char const *query, size_t qlen) + char const *query, +#ifdef DZ_FULL_LENGTH_BONUS + int8_t full_length_bonus, +#endif + size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(__m128i)); @@ -1243,7 +1243,10 @@ struct dz_query_s *dz_pack_query_forward( .q = query, .bonus = { 0 } }; - q->bonus[L + (qlen % L)] = self->bonus; + + #ifdef DZ_FULL_LENGTH_BONUS + q->bonus[L + (qlen % L)] = full_length_bonus; + #endif for(uint64_t j = 0; j < DZ_MAT_SIZE; j++) { size_t const clen = dz_roundup(qlen + 1, L); @@ -1280,7 +1283,11 @@ struct dz_query_s *dz_pack_query_forward( static __dz_vectorize struct dz_query_s *dz_pack_query_reverse( struct dz_s *self, - char const *query, size_t qlen) + char const *query, +#ifdef DZ_FULL_LENGTH_BONUS + int8_t full_length_bonus, +#endif + size_t qlen) { size_t const L = sizeof(__m128i) / sizeof(uint16_t); struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(__m128i)); @@ -1289,7 +1296,9 @@ struct dz_query_s *dz_pack_query_reverse( .q = query, .bonus = { 0 } }; - q->bonus[L + (qlen % L)] = self->bonus; + #ifdef DZ_FULL_LENGTH_BONUS + q->bonus[L + (qlen % L)] 
= full_length_bonus; + #endif for(uint64_t j = 0; j < DZ_MAT_SIZE; j++) { size_t const clen = dz_roundup(qlen + 1, L); @@ -1340,20 +1349,23 @@ struct dz_query_s *dz_pack_query( char const *query, #ifdef DZ_QUAL_ADJ const uint8_t *qual, +#endif +#ifdef DZ_FULL_LENGTH_BONUS + int8_t full_length_bonus, #endif int64_t qlen) { if(qlen >= 0) { #ifdef DZ_QUAL_ADJ - return(dz_qual_adj_pack_query_forward(self, query, qual, (size_t)qlen)); + return(dz_qual_adj_pack_query_forward(self, query, qual, full_length_bonus, (size_t)qlen)); #else - return(dz_pack_query_forward(self, query, (size_t)qlen)); + return(dz_pack_query_forward(self, query, full_length_bonus, (size_t)qlen)); #endif } else { #ifdef DZ_QUAL_ADJ - return(dz_qual_adj_pack_query_reverse(self, &query[qlen], &qual[qlen], (size_t)-qlen)); + return(dz_qual_adj_pack_query_reverse(self, &query[qlen], &qual[qlen], full_length_bonus, (size_t)-qlen)); #else - return(dz_pack_query_reverse(self, &query[qlen], (size_t)-qlen)); + return(dz_pack_query_reverse(self, &query[qlen], full_length_bonus, (size_t)-qlen)); #endif } } @@ -1364,7 +1376,12 @@ unittest() { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); ut_assert(dz != NULL); - struct dz_query_s *q = dz_pack_query(dz, dz_unittest_query, dz_unittest_query_length); + struct dz_query_s *q = dz_pack_query(dz, dz_unittest_query, +#ifdef DZ_FULL_LENGTH_BONUS + 0, +#endif + + dz_unittest_query_length); dz_unused(q); dz_destroy(dz); } @@ -1597,6 +1614,9 @@ unittest( "extend.base" ) { for(size_t trial = 0; trial < 3; trial++) { struct dz_query_s const *q = dz_pack_query(dz, dz_unittest_query, +#ifdef DZ_FULL_LENGTH_BONUS + 0, +#endif trial == 0 ? dz_unittest_query_length : trial == 1 ? 3 : 10 @@ -1656,7 +1676,11 @@ unittest( "extend.base.revcomp" ) { size_t length = trial == 0 ? dz_unittest_query_length : trial == 1 ? 3 : 10; - struct dz_query_s const *q = dz_pack_query_reverse(dz, &revcomp[dz_unittest_query_length - length], length); + struct dz_query_s const *q = dz_pack_query_reverse(dz, &revcomp[dz_unittest_query_length - length], +#ifdef DZ_FULL_LENGTH_BONUS + 0, +#endif + length); struct dz_forefront_s const *forefront = NULL; /* nothing occurs */ @@ -1706,6 +1730,9 @@ unittest( "extend.small" ) { for(size_t trial = 0; trial < 4; trial++) { struct dz_query_s const *q = dz_pack_query(dz, dz_unittest_query, +#ifdef DZ_FULL_LENGTH_BONUS + 0, +#endif trial == 0 ? dz_unittest_query_length : trial == 1 ? 3 : trial == 2 ? 4 @@ -1992,7 +2019,11 @@ unittest( "trace" ) { struct dz_s *dz = dz_init(DZ_UNITTEST_SCORE_PARAMS); ut_assert(dz != NULL); - struct dz_query_s *q = dz_pack_query(dz, dz_unittest_query, dz_unittest_query_length); + struct dz_query_s *q = dz_pack_query(dz, dz_unittest_query, +#ifdef DZ_FULL_LENGTH_BONUS + 0, +#endif + dz_unittest_query_length); struct dz_forefront_s const *forefront = NULL; struct dz_alignment_s *aln = NULL; From 01ba10ed11cd7cdc2090efd559ce942d62c2d808 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 21 Jan 2021 18:06:00 -0800 Subject: [PATCH 20/43] fix traceback over x-dropped vectors --- dozeu.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dozeu.h b/dozeu.h index cfe7ef2..026f49d 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1440,8 +1440,9 @@ unittest() { __m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN), pts = _mm_set1_epi16(INT16_MIN); \ __m128i const xtv = _mm_set1_epi16(w.inc - xt); /* next offset == current max thus X-drop threshold is always -xt */ \ /* until the bottommost vertically placed band... 
*/ \ - uint32_t sspos = w.r.spos; /* save spos on the stack */ \ uint64_t first_drop = w.r.epos;\ + uint64_t colspos = w.r.spos;\ + uint64_t colepos = w.r.epos;\ for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ _load_vector(&pdp[p]); \ _update_vector(p); \ @@ -1449,7 +1450,7 @@ unittest() { /* drop either leading or trailing vector, skip the forefront extension when the forefront is clipped */ \ pts = _mm_set1_epi16(INT16_MIN); \ if(p == w.r.spos) { \ - w.r.spos++; cdp--; continue; \ + w.r.spos++; \ } else { \ first_drop = dz_min2(p, first_drop); \ } \ @@ -1469,13 +1470,13 @@ unittest() { __m128i e = minv, s = minv; _update_vector(w.r.epos); \ do { \ if(_test_xdrop(s, xtv)) { break; } \ - _store_vector(&cdp[w.r.epos]); w.r.epos++; \ + _store_vector(&cdp[w.r.epos]); w.r.epos++; colepos++; \ f = _mm_subs_epi16(f, gev8); s = _mm_subs_epi16(s, gev8); \ } while(w.r.epos < query->blen); \ } \ dz_pp_cat(_forefront_, __LINE__):; \ /* create cap object that contains [spos, epos) range (for use in the traceback routine) */ \ - struct dz_cap_s *cap = _end_column(cdp, w.r.spos, w.r.epos); \ + struct dz_cap_s *cap = _end_column(cdp, colspos, colepos); \ int32_t inc = _hmax_vector(maxv); \ if(dz_cmp_max(inc, w.inc)) { w.inc = inc; w.mcap = cap; }/* update max; the actual score (absolute score accumulated from the origin) is expressed as max + inc; 2 x cmov */ \ /* FIXME: rescue overflow */ \ From 54578f15432e95aec7f66fb616884ef01d4bd8ad Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 22 Jan 2021 23:05:23 -0800 Subject: [PATCH 21/43] maintain two separate ranges for forward and backward in forefront --- dozeu.h | 83 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/dozeu.h b/dozeu.h index 026f49d..216519a 100644 --- a/dozeu.h +++ b/dozeu.h @@ -207,10 +207,12 @@ struct dz_cap_s { /* followed by dz_forefront_s; spos and epos are sha uint32_t rch; int32_t rrem; }; struct dz_forefront_s { - struct dz_range_s r; + struct dz_range_s r; // the range that applies to the preceding column uint32_t rid; int32_t rlen; + struct dz_range_s fr; // the range that should continue forward uint32_t rsum, rcnt; int32_t max, inc; + uint8_t _pad[8]; struct dz_query_s const *query; struct dz_cap_s const *mcap; }; @@ -510,7 +512,7 @@ unittest() { /* push cap info */ \ struct dz_cap_s *cap = dz_cap(dz_mem(self)->stack.top); \ /* calculate sizes */ \ - size_t next_req = _calc_next_size((_w).r.spos, (_w).r.epos, 0); \ + size_t next_req = _calc_next_size((_w).fr.spos, (_w).fr.epos, 0); \ /* allocate from heap */ \ cap->rch = (_rch); /* record rch for the next column */ \ cap->rrem = (_rlen); /* record rlen for use in traceback */ \ @@ -519,9 +521,9 @@ unittest() { dz_mem_add_stack(dz_mem(self), next_req); /* 0); }*/ \ cap = _init_cap(0, _rch, &cap, 1); \ } \ - debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).r.spos, (_w).r.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ + debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ /* return array pointer */ \ - (struct dz_swgv_s *)(cap + 1) - (_w).r.spos; \ + (struct dz_swgv_s *)(cap + 1) - (_w).fr.spos; \ }) #define _end_column(_p, _spos, _epos) ({ \ /* write back the stack pointer and return a cap */ \ @@ -534,16 +536,16 @@ unittest() { #define _end_matrix(_p, _wp, _rrem) ({ \ /* create forefront object */ \ struct dz_forefront_s *forefront 
= dz_forefront(&(dz_swgv(_p))[(_wp)->r.epos]); \ - /* if((_wp)->r.spos >= (_wp)->r.epos) { (_wp)->r.epos = 0; } */ \ forefront->rid = (_wp)->rid; \ forefront->rlen = (_wp)->rlen; \ + forefront->fr = (_wp)->fr; \ forefront->rsum = (_wp)->rsum + ((_wp)->rlen < 0 ? -1 : 1) * ((_wp)->rlen - (_rrem)); \ forefront->rcnt = (_wp)->rcnt + 1; \ forefront->max = (_wp)->max + (_wp)->inc; \ forefront->inc = (_wp)->inc; \ forefront->query = (_wp)->query; \ forefront->mcap = (_wp)->mcap; \ - debug("create forefront(%p), [%u, %u), max(%d), inc(%d), rlen(%d), rrem(%d)", forefront, (_wp)->r.spos, (_wp)->r.epos, (_wp)->max, (_wp)->inc, (int32_t)(_wp)->rlen, (int32_t)(_rrem)); \ + debug("create forefront(%p), [%u, %u), [%u, %u), max(%d), inc(%d), rlen(%d), query(%p), rrem(%d)", forefront, (_wp)->r.spos, (_wp)->r.epos, (_wp)->fr.spos, (_wp)->fr.epos, (_wp)->max, (_wp)->inc, (int32_t)(_wp)->rlen, (_wp)->query, (int32_t)(_rrem)); \ /* write back stack pointer */ \ dz_mem(self)->stack.top = (uint8_t *)(forefront + 1); \ /* return the forefront pointer */ \ @@ -836,13 +838,13 @@ struct dz_alignment_init_s dz_align_init( size_t const L = sizeof(__m128i) / sizeof(uint16_t); size_t max_gap_vec_len = dz_roundup(max_gap_len, L) / L; - struct dz_forefront_s w = { { 0, (uint32_t) max_gap_vec_len}, 0, 0, 0, 0, 0, 0, NULL, NULL }; + struct dz_forefront_s w = { { 0, (uint32_t) max_gap_vec_len}, 0, 0, { 0, (uint32_t) max_gap_vec_len}, 0, 0, 0, 0, NULL, NULL }; struct dz_forefront_s *a = &w; /* malloc the first column */ struct dz_swgv_s *dp = _begin_column_head(0, max_gap_vec_len, 0, &a, 0); - uint16_t xt = self->giv[0] + self->gev[0] * max_gap_len; + int16_t xt = self->giv[0] + self->gev[0] * max_gap_len; // calculate the x-drop threshold for this gap length __m128i xtv = _mm_set1_epi16(-xt); @@ -863,6 +865,7 @@ struct dz_alignment_init_s dz_align_init( } s = _mm_subs_epi16(s, _mm_slli_epi16(_mm_load_si128((__m128i const *)self->gev), 3)); } + w.fr.epos = w.r.epos; /* done; create forefront object */ _end_column(dp, w.r.spos, w.r.epos); @@ -1391,16 +1394,16 @@ unittest() { #define _merge_column(w, adj, forefronts, n_forefronts, query, init_s) ({ \ for(size_t i = 0; i < n_forefronts; i++) { \ /* update max and pos */ \ - w.r.spos = dz_min2(w.r.spos, forefronts[i]->r.spos); \ - w.r.epos = dz_max2(w.r.epos, forefronts[i]->r.epos); \ + w.fr.spos = dz_min2(w.fr.spos, forefronts[i]->fr.spos); \ + w.fr.epos = dz_max2(w.fr.epos, forefronts[i]->fr.epos); \ w.rsum = dz_max2(w.rsum, forefronts[i]->rsum); \ w.rcnt = dz_max2(w.rcnt, forefronts[i]->rcnt); \ if(init_s != 0) { w.max = dz_max2(w.max, forefronts[i]->max); } \ - debug("i(%lu, %p), [%u, %u), inc(%d), max(%d)", i, forefronts[i], forefronts[i]->r.spos, forefronts[i]->r.epos, forefronts[i]->inc, forefronts[i]->max); \ + debug("i(%lu, %p), r [%u, %u), fr [%u, %u), inc(%d), max(%d)", i, forefronts[i], forefronts[i]->r.spos, forefronts[i]->r.epos, forefronts[i]->fr.spos, forefronts[i]->fr.epos, forefronts[i]->inc, forefronts[i]->max); \ } \ - w.r.spos = w.r.spos > INT32_MAX ? 0 : w.r.spos; \ - w.r.epos = dz_min2(w.r.epos, query->blen); /* make sure epos and p-iteration index is no larger than blen */ \ - debug("start extension [%u, %u), inc(%d), max(%d), stack(%p, %p), rlen(%d)", w.r.spos, w.r.epos, w.inc, w.max, dz_mem(self)->stack.top, dz_mem(self)->stack.end, (int32_t)rlen); \ + w.fr.spos = w.fr.spos > INT32_MAX ? 
0 : w.fr.spos; \ + w.fr.epos = dz_min2(w.fr.epos, query->blen); /* make sure epos and p-iteration index is no larger than blen */ \ + debug("start extension [%u, %u), inc(%d), max(%d), stack(%p, %p), rlen(%d)", w.fr.spos, w.fr.epos, w.inc, w.max, dz_mem(self)->stack.top, dz_mem(self)->stack.end, (int32_t)rlen); \ if(init_s != 0) { \ for(size_t i = 0; i < n_forefronts; i++) { \ adj[i] = w.max - (forefronts[i]->max - forefronts[i]->inc); /* base = max - inc */ \ @@ -1408,16 +1411,16 @@ unittest() { } \ } \ /* once; merge incoming vectors */ \ - struct dz_swgv_s *cdp = _begin_column_head(w.r.spos, w.r.epos, w.max, forefronts, n_forefronts); /* allocate memory for the first column */ \ - for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ + struct dz_swgv_s *cdp = _begin_column_head(w.fr.spos, w.fr.epos, w.max, forefronts, n_forefronts); /* allocate memory for the first column */ \ + for(uint64_t p = w.fr.spos; p < w.fr.epos; p++) { \ /* memset(cdp, 0xff, sizeof(struct dz_swgv_s) * (w.r.epos - w.r.spos)); */ \ __m128i const e = _mm_set1_epi16(INT16_MIN), f = e, s = e; _store_vector(&cdp[p]); \ } \ /* paste the last vectors */ \ for(size_t i = 0; i < n_forefronts; i++) { \ - struct dz_swgv_s const *tdp = (struct dz_swgv_s const *)forefronts[i] - forefronts[i]->r.epos; \ + struct dz_swgv_s const *tdp = dz_cswgv(forefronts[i]) - forefronts[i]->r.epos; \ __m128i const adjv = _mm_set1_epi16(init_s == 0 ? 0 : adj[i]); \ - for(uint64_t p = forefronts[i]->r.spos; p < forefronts[i]->r.epos; p++) { \ + for(uint64_t p = forefronts[i]->fr.spos; p < forefronts[i]->fr.epos; p++) { \ /* adjust offset */ \ __m128i e = _mm_subs_epi16(_mm_load_si128(&tdp[p].e), adjv); \ __m128i f = _mm_subs_epi16(_mm_load_si128(&tdp[p].f), adjv); \ @@ -1429,7 +1432,8 @@ unittest() { print_vector(s); \ } \ } \ - _end_column(cdp, w.r.spos, w.r.epos); \ + w.r = w.fr;\ + _end_column(cdp, w.fr.spos, w.fr.epos); \ cdp; \ }) #define _fill_column(w, pdp, query, rt, rrem, xt, init_s) ({ \ @@ -1440,43 +1444,43 @@ unittest() { __m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN), pts = _mm_set1_epi16(INT16_MIN); \ __m128i const xtv = _mm_set1_epi16(w.inc - xt); /* next offset == current max thus X-drop threshold is always -xt */ \ /* until the bottommost vertically placed band... 
*/ \ - uint64_t first_drop = w.r.epos;\ - uint64_t colspos = w.r.spos;\ - uint64_t colepos = w.r.epos;\ - for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \ + uint64_t first_drop = w.fr.epos;\ + /* we want to remember the backward facing range even if we x-drop the forward facing range */ \ + w.r = w.fr; \ + for(uint64_t p = w.fr.spos; p < w.fr.epos; p++) { \ _load_vector(&pdp[p]); \ _update_vector(p); \ if(dz_unlikely(_test_xdrop(s, xtv))) { /* mark _unlikely to move out of the core loop */ \ /* drop either leading or trailing vector, skip the forefront extension when the forefront is clipped */ \ pts = _mm_set1_epi16(INT16_MIN); \ - if(p == w.r.spos) { \ - w.r.spos++; \ + if(p == w.fr.spos) { \ + w.fr.spos++; \ } else { \ first_drop = dz_min2(p, first_drop); \ } \ } else { \ - first_drop = w.r.epos; /* reset the xdrop to indicate that any xdrops we found are not contiguous */ \ + first_drop = w.fr.epos; /* reset the xdrop to indicate that any xdrops we found are not contiguous */ \ } \ _store_vector(&cdp[p]); \ } \ - if (first_drop < w.r.epos) { \ + if (first_drop < w.fr.epos) { \ /* we xdropped a contiguous block of vectors at the end of this column */\ - w.r.epos = first_drop; \ + w.fr.epos = first_drop; \ goto dz_pp_cat(_forefront_, __LINE__); \ } \ /* if reached the forefront of the query sequence, finish the extension */ \ - if(w.r.epos < query->blen) { \ + if(w.fr.epos < query->blen) { \ /* forefront extension; clip the column length if too long */ \ - __m128i e = minv, s = minv; _update_vector(w.r.epos); \ + __m128i e = minv, s = minv; _update_vector(w.fr.epos); \ do { \ if(_test_xdrop(s, xtv)) { break; } \ - _store_vector(&cdp[w.r.epos]); w.r.epos++; colepos++; \ + _store_vector(&cdp[w.fr.epos]); w.fr.epos++; w.r.epos++; \ f = _mm_subs_epi16(f, gev8); s = _mm_subs_epi16(s, gev8); \ - } while(w.r.epos < query->blen); \ + } while(w.fr.epos < query->blen); \ } \ dz_pp_cat(_forefront_, __LINE__):; \ /* create cap object that contains [spos, epos) range (for use in the traceback routine) */ \ - struct dz_cap_s *cap = _end_column(cdp, colspos, colepos); \ + struct dz_cap_s *cap = _end_column(cdp, w.r.spos, w.r.epos); \ int32_t inc = _hmax_vector(maxv); \ if(dz_cmp_max(inc, w.inc)) { w.inc = inc; w.mcap = cap; }/* update max; the actual score (absolute score accumulated from the origin) is expressed as max + inc; 2 x cmov */ \ /* FIXME: rescue overflow */ \ @@ -1528,22 +1532,20 @@ struct dz_forefront_s const *dz_extend_intl( __m128i const gev4 = _mm_slli_epi16(gev1, 2); __m128i const gev8 = _mm_slli_epi16(gev1, 3); - struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, 0, 0, 0, 0, NULL, NULL }; /* uint32_t spos, epos, max, inc; struct dz_query_s const *query; struct dz_cap_s const *cap; */ + struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, { UINT32_MAX, 0 }, 0, 0, 0, 0, NULL, NULL }; /* uint32_t spos, epos, max, inc; struct dz_query_s const *query; struct dz_cap_s const *cap; */ w.rlen = rlen; w.rid = rid; w.query = query; - /* first iterate over the incoming edge objects to get the current max */ uint64_t adj[n_forefronts]; /* keep variable length array out of statement expression to avoid a bug of icc */ struct dz_swgv_s *pdp = _merge_column(w, adj, forefronts, n_forefronts, query, init_s); - /* fetch the first base */ int64_t rrem = rlen, dir = rlen < 0 ? 
1 : -1; uint8_t const *rt = (uint8_t const *)&ref[rrem - (rlen < 0)]; - while(rrem != 0 && w.r.spos < w.r.epos) { + while(rrem != 0 && w.fr.spos < w.fr.epos) { pdp = _fill_column(w, pdp, query, rt, rrem, xt, init_s); rrem += dir; - debug("rrem(%d), [%u, %u)", rrem, w.r.spos, w.r.epos); + debug("rrem(%d), [%u, %u)", rrem, w.fr.spos, w.fr.epos); } return(_end_matrix(pdp, &w, rrem)); /* update max vector (pdp always points at the last vector) */ } @@ -1849,7 +1851,9 @@ uint64_t dz_calc_max_pos(struct dz_forefront_s const *forefront) cap = pcap; pcap = _unwind_cap(cap); \ debug("idx(%lu), cap(%p), pcap(%p), [%u, %u)", idx, cap, pcap, pcap->r.spos, pcap->r.epos); \ /* escape if not head */ \ - if(dz_likely(!_is_head(pcap))) { break; } \ + if(dz_likely(!_is_head(pcap))) { \ + break; \ + } \ /* escape if root */ \ debug("head, n_forefronts(%u)", dz_chead(pcap)->n_forefronts); \ if(pcap->rrem == 0) { debug("reached root"); goto _trace_tail; } \ @@ -1890,7 +1894,6 @@ struct dz_alignment_s *dz_trace( { size_t const L = sizeof(__m128i) / sizeof(uint16_t); if(forefront->mcap == NULL) { debug("mcap is null"); return(NULL); } - /* detect pos */ uint64_t idx = dz_calc_max_qpos(forefront); /* vector index, cell index */ uint64_t ref_length = 0, query_length = idx; From 17e3896538058d64a3528e2684267007afeef32e Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 28 Feb 2022 18:32:47 -0500 Subject: [PATCH 22/43] use fast indexing for constant qual values --- dozeu.h | 71 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/dozeu.h b/dozeu.h index 216519a..fb06592 100644 --- a/dozeu.h +++ b/dozeu.h @@ -283,22 +283,22 @@ dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0); } #define print_vector8(v) { \ debug("%s (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", #v, \ - (int)_mm_extract_epi8(v, 15), \ - (int)_mm_extract_epi8(v, 14), \ - (int)_mm_extract_epi8(v, 13), \ - (int)_mm_extract_epi8(v, 12), \ - (int)_mm_extract_epi8(v, 11), \ - (int)_mm_extract_epi8(v, 10), \ - (int)_mm_extract_epi8(v, 9), \ - (int)_mm_extract_epi8(v, 8), \ - (int)_mm_extract_epi8(v, 7), \ - (int)_mm_extract_epi8(v, 6), \ - (int)_mm_extract_epi8(v, 5), \ - (int)_mm_extract_epi8(v, 4), \ - (int)_mm_extract_epi8(v, 3), \ - (int)_mm_extract_epi8(v, 2), \ - (int)_mm_extract_epi8(v, 1), \ - (int)_mm_extract_epi8(v, 0)); \ + (int)(int8_t)_mm_extract_epi8(v, 15), \ + (int)(int8_t)_mm_extract_epi8(v, 14), \ + (int)(int8_t)_mm_extract_epi8(v, 13), \ + (int)(int8_t)_mm_extract_epi8(v, 12), \ + (int)(int8_t)_mm_extract_epi8(v, 11), \ + (int)(int8_t)_mm_extract_epi8(v, 10), \ + (int)(int8_t)_mm_extract_epi8(v, 9), \ + (int)(int8_t)_mm_extract_epi8(v, 8), \ + (int)(int8_t)_mm_extract_epi8(v, 7), \ + (int)(int8_t)_mm_extract_epi8(v, 6), \ + (int)(int8_t)_mm_extract_epi8(v, 5), \ + (int)(int8_t)_mm_extract_epi8(v, 4), \ + (int)(int8_t)_mm_extract_epi8(v, 3), \ + (int)(int8_t)_mm_extract_epi8(v, 2), \ + (int)(int8_t)_mm_extract_epi8(v, 1), \ + (int)(int8_t)_mm_extract_epi8(v, 0)); \ } #else #define print_vector(v) ; @@ -577,20 +577,31 @@ unittest() { __m128i const rv = _mm_set1_epi8(rch); #define _calc_score_profile(_i) ({ \ - /* construct the within-matrix and between-matrix indexes and then combine them */ \ - __m128i iv = _mm_or_si128(_mm_slli_epi16(_mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i const *)&qarr[(_i) * L])), 5), \ - _mm_cvtepi8_epi16(_mm_and_si128(_mm_or_si128(rv, _mm_loadl_epi64((__m128i const *)&parr[(_i) * L])), _mm_set1_epi8(0x1f)))); \ 
- /* get the scores from the quality matrices (unvectorized, unfortunately) */ \ - int8_t const *qual_mat = dz_qual_matrix(self); \ - /* TODO: qual: is this the right order? */ \ - __m128i sc = _mm_setr_epi16(qual_mat[_mm_extract_epi16(iv, 0)], \ - qual_mat[_mm_extract_epi16(iv, 1)], \ - qual_mat[_mm_extract_epi16(iv, 2)], \ - qual_mat[_mm_extract_epi16(iv, 3)], \ - qual_mat[_mm_extract_epi16(iv, 4)], \ - qual_mat[_mm_extract_epi16(iv, 5)], \ - qual_mat[_mm_extract_epi16(iv, 6)], \ - qual_mat[_mm_extract_epi16(iv, 7)]); \ + /* load qual and query index values */ \ + __m128i qxv = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i const *)&qarr[(_i) * L])); \ + __m128i sxv = _mm_or_si128(rv, _mm_loadl_epi64((__m128i const *)&parr[(_i) * L])); /* note: left in 8-bit */\ + __m128i sc; \ + if (_mm_movemask_epi8(_mm_cmpeq_epi8(qxv, _mm_alignr_epi8(qxv, qxv, 2))) == 0xffff) { \ + /* we have uniform quality values, so we can pull from just one matrix */ \ + int8_t const *qual_mat = dz_qual_matrix(self) + 32 * (*((int16_t*)&qxv)); \ + sc = _mm_cvtepi8_epi16(_mm_shuffle_epi8(_mm_load_si128((__m128i const *)qual_mat), sxv)); \ + }\ + else { \ + /* combine the qual and sequence indexes */\ + __m128i xv = _mm_or_si128(_mm_slli_epi16(qxv, 5), \ + _mm_and_si128(_mm_cvtepi8_epi16(sxv), _mm_set1_epi16(0x1f)));\ + print_vector(xv);\ + /* get the scores from the quality matrices (unvectorized, unfortunately) */ \ + int8_t const *qual_mat = dz_qual_matrix(self); \ + sc = _mm_setr_epi16(qual_mat[_mm_extract_epi16(xv, 0)], \ + qual_mat[_mm_extract_epi16(xv, 1)], \ + qual_mat[_mm_extract_epi16(xv, 2)], \ + qual_mat[_mm_extract_epi16(xv, 3)], \ + qual_mat[_mm_extract_epi16(xv, 4)], \ + qual_mat[_mm_extract_epi16(xv, 5)], \ + qual_mat[_mm_extract_epi16(xv, 6)], \ + qual_mat[_mm_extract_epi16(xv, 7)]);\ + } \ print_vector(sc)\ sc; \ }) From b40572ba3a917c6f0835d22916269d05344aa954 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 7 Sep 2023 18:01:06 -0400 Subject: [PATCH 23/43] Start trying to run Dozeu tests; notice that root which they use has been removed --- dozeu.h | 18 ++++++++++-------- example.c | 6 +++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/dozeu.h b/dozeu.h index fb06592..d87949d 100644 --- a/dozeu.h +++ b/dozeu.h @@ -216,6 +216,8 @@ struct dz_forefront_s { struct dz_query_s const *query; struct dz_cap_s const *mcap; }; +// When initializing forefronts, we want to be able to fill in the pad without saying 0s everywhere. 
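/*
 * Editorial sketch (not part of this patch): a scalar model of the quality-adjusted
 * score profile computed by the new _calc_score_profile() above. The helper name and
 * the assumption that dz_qual_matrix() stores 32 scores per quality index (indexed as
 * qual_index * 32 + base_index) are inferred from the 32 * offset and the 0x1f mask in
 * the vector code; integer types come from <stdint.h>.
 */
static inline void calc_score_profile_scalar(
	int8_t const *qual_mat,   /* quality-adjusted score matrices, 32 scores per quality index */
	uint8_t const *qarr,      /* quality indices for this 8-cell slice */
	uint8_t const *parr,      /* packed query bases for this slice */
	uint8_t rch,              /* packed reference base for this column */
	int16_t sc[8])            /* output: one score per cell */
{
	int uniform = 1;
	for(int i = 1; i < 8; i++) { uniform &= (qarr[i] == qarr[0]); }
	if(uniform) {
		/* fast path: all eight cells share one quality, so a single 32-entry row serves the whole vector */
		int8_t const *row = &qual_mat[32 * qarr[0]];
		for(int i = 0; i < 8; i++) { sc[i] = row[(rch | parr[i]) & 0x1f]; }
	} else {
		/* slow path: combine the quality and base indexes separately for each cell */
		for(int i = 0; i < 8; i++) { sc[i] = qual_mat[((int)qarr[i] << 5) | ((rch | parr[i]) & 0x1f)]; }
	}
}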
+#define DZ_FOREFRONT_PAD {0, 0, 0, 0, 0, 0, 0, 0} struct dz_alignment_init_s { struct dz_forefront_s const *root; uint16_t xt; @@ -849,7 +851,7 @@ struct dz_alignment_init_s dz_align_init( size_t const L = sizeof(__m128i) / sizeof(uint16_t); size_t max_gap_vec_len = dz_roundup(max_gap_len, L) / L; - struct dz_forefront_s w = { { 0, (uint32_t) max_gap_vec_len}, 0, 0, { 0, (uint32_t) max_gap_vec_len}, 0, 0, 0, 0, NULL, NULL }; + struct dz_forefront_s w = { { 0, (uint32_t) max_gap_vec_len}, 0, 0, { 0, (uint32_t) max_gap_vec_len}, 0, 0, 0, 0, DZ_FOREFRONT_PAD, NULL, NULL }; struct dz_forefront_s *a = &w; /* malloc the first column */ @@ -882,7 +884,7 @@ struct dz_alignment_init_s dz_align_init( _end_column(dp, w.r.spos, w.r.epos); /* package forefront and xt, return */ - dz_alignment_init_s aln_init; + struct dz_alignment_init_s aln_init; aln_init.root = _end_matrix(dp, &w, 0); aln_init.xt = xt; return(aln_init); @@ -918,7 +920,7 @@ void dz_flush( #ifdef DZ_QUAL_ADJ bottom = (void *)(((int8_t *)bottom) + 2 * DZ_QUAL_MATRIX_SIZE); #endif - bottom = (void *)(((dz_cap_s *)bottom) + 1); + bottom = (void *)(((struct dz_cap_s *)bottom) + 1); dz_mem(self)->stack.top = (uint8_t *)bottom; return; @@ -1543,7 +1545,7 @@ struct dz_forefront_s const *dz_extend_intl( __m128i const gev4 = _mm_slli_epi16(gev1, 2); __m128i const gev8 = _mm_slli_epi16(gev1, 3); - struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, { UINT32_MAX, 0 }, 0, 0, 0, 0, NULL, NULL }; /* uint32_t spos, epos, max, inc; struct dz_query_s const *query; struct dz_cap_s const *cap; */ + struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, { UINT32_MAX, 0 }, 0, 0, 0, 0, DZ_FOREFRONT_PAD, NULL, NULL }; /* uint32_t spos, epos, max, inc; uint8_t _pad[8]; struct dz_query_s const *query; struct dz_cap_s const *cap; */ w.rlen = rlen; w.rid = rid; w.query = query; @@ -1642,7 +1644,7 @@ unittest( "extend.base" ) { ut_assert(forefront == NULL); forefront = dz_extend(dz, q, NULL, 0, "", 0, 2, 0); ut_assert(forefront == NULL); - dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + struct dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); forefront = dz_extend(dz, q, &aln_init.root, 1, "", 0, 3, aln_init.xt); ut_assert(forefront == aln_init.root); @@ -1702,7 +1704,7 @@ unittest( "extend.base.revcomp" ) { ut_assert(forefront == NULL); forefront = dz_extend(dz, q, NULL, 0, "", 0, 2, 0); ut_assert(forefront == NULL); - dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + struct dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); forefront = dz_extend(dz, q, &aln_init.root, 1, "", 0, 3, aln_init.xt); ut_assert(forefront == aln_init.root); @@ -1759,7 +1761,7 @@ unittest( "extend.small" ) { * \ / \ / * C CATT */ - dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + struct dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); forefronts[0] = dz_extend(dz, q, &aln_init.root, 1, dz_ut_sel("AG", "\x0\x2", "MA"), 2, 1, aln_init.xt); ut_assert(forefronts[0] != NULL && forefronts[0]->max == dz_ut_sel(4, 4, 9)); @@ -2042,7 +2044,7 @@ unittest( "trace" ) { struct dz_forefront_s const *forefront = NULL; struct dz_alignment_s *aln = NULL; - dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); + struct dz_alignment_init_s aln_init = dz_align_init(dz, DZ_UNITTEST_MAX_GAP_LEN); forefront = dz_extend(dz, q, &aln_init.root, 1, "A", 1, 1, aln_init.xt); aln = dz_trace(dz, forefront); diff --git a/example.c 
b/example.c index 355cb01..b6dc046 100644 --- a/example.c +++ b/example.c @@ -44,16 +44,16 @@ int main(int argc, char *argv[]) }; struct dz_s *dz = dz_init( score_matrix, - GI, GE, - xdrop_threshold, - full_length_bonus + GI, GE ); + // TODO: does xdrop_threshold go in anymore? /* pack query */ char const *query = "ACACTTCTAGACTTTACCACTA"; struct dz_query_s const *q = dz_pack_query_forward( dz, query, /* char const *seq: query sequence */ + full_length_bonus, strlen(query) /* length */ ); From 9e9c39f5e3ccca505465a0d87fda53b7c95edb4e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 10:34:46 -0400 Subject: [PATCH 24/43] Get Dozeu examples to use the new dz_alignment_init_s --- dozeu.h | 14 +++++++++++++- example.2bit.c | 39 +++++++++++++++++++++------------------ example.c | 37 ++++++++++++++++++++----------------- example.protein.c | 39 +++++++++++++++++++++------------------ 4 files changed, 75 insertions(+), 54 deletions(-) diff --git a/dozeu.h b/dozeu.h index d87949d..757b18f 100644 --- a/dozeu.h +++ b/dozeu.h @@ -106,6 +106,11 @@ enum dz_alphabet { # ifndef DZ_MAT_SIZE # define DZ_MAT_SIZE ( 32 ) # endif +#ifdef DZ_FULL_LENGTH_BONUS +#define dz_end_bonus(_self, _query, _i) ((_i) / 8 == (_query)->blen - 1 ? (_query)->bonus[8 + ((_i) & 7)] : 0) +#else +#define dz_end_bonus(_self, _query, _i) 0 +#endif #define dz_pair_score(_self, _q, _r, _i) ( (int8_t)((_q)->arr[(_r) * (_q)->blen * L + (_i)]) + dz_end_bonus(_self, _q, _i)) #define dz_pair_eq(_self, _q, _r, _i) ( (uint32_t)((_q)->q[(_i) - 1] & 0x1f) == (uint32_t)(_r) ) #endif @@ -216,8 +221,15 @@ struct dz_forefront_s { struct dz_query_s const *query; struct dz_cap_s const *mcap; }; -// When initializing forefronts, we want to be able to fill in the pad without saying 0s everywhere. +/* + * When initializing forefronts, we want to be able to fill in the pad without saying 0s everywhere. + */ #define DZ_FOREFRONT_PAD {0, 0, 0, 0, 0, 0, 0, 0} + +/* + * This holds the forefront for the alignment root, and the x-drop threshold, + * computed from the max gap length. 
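 * (Editorial note, not in the original patch: dz_align_init() derives the stored
 * threshold as xt = gap_open + gap_extend * max_gap_len, so the updated examples
 * below invert that to recover a gap length from their old fixed threshold, e.g.
 *
 *     int8_t max_gap_length = (xdrop_threshold - GI) / GE;   // (70 - 5) / 1 = 65
 *
 * which dz_align_init() then maps back to xt = 5 + 1 * 65 = 70.)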
+ */ struct dz_alignment_init_s { struct dz_forefront_s const *root; uint16_t xt; diff --git a/example.2bit.c b/example.2bit.c index 9d57231..941806c 100644 --- a/example.2bit.c +++ b/example.2bit.c @@ -36,6 +36,7 @@ int main(int argc, char *argv[]) /* init score matrix and memory arena */ int8_t const M = 2, X = -3, GI = 5, GE = 1; /* match, mismatch, gap open, and gap extend; g(k) = GI + k + GE for k-length gap */ int8_t const xdrop_threshold = 70, full_length_bonus = 10; + int8_t max_gap_length = (xdrop_threshold - GI) / GE; int8_t const score_matrix[16] = { /* ref-side */ /* A C G T */ @@ -47,17 +48,18 @@ int main(int argc, char *argv[]) }; struct dz_s *dz = dz_init( score_matrix, - GI, GE, - xdrop_threshold, - full_length_bonus + GI, GE ); + /* Initialize root of alignment */ + struct dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); /* pack query */ char const *query = "\x0\x1\x0\x1\x3\x3\x1\x3\x0\x2\x0\x1\x3\x3\x3\x0\x1\x1\x0\x1\x3\x0"; struct dz_query_s const *q = dz_pack_query_forward( - dz, - query, /* char const *seq: query sequence */ - 22 /* length */ + dz, + query, /* char const *seq: query sequence */ + full_length_bonus, + 22 /* length */ ); /* init node array */ @@ -65,25 +67,26 @@ int main(int argc, char *argv[]) /* fill root: node 0 */ ff[0] = dz_extend( - dz, q, - dz_root(dz), 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; dz_root(dz) for root node */ - "\x0\x1\x0\x1", 4, 0 /* reference-side sequence, its length, and node id */ + dz, q, + &aln_init.root, 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; dz_root(dz) for root node */ + "\x0\x1\x0\x1", 4, 0, /* reference-side sequence, its length, and node id */ + aln_init.xt /* x-drop threshold */ ); /* branching paths: 1, 2 -> 3 */ - ff[1] = dz_extend(dz, q, &ff[0], 1, "\x3\x3\x2\x3", 4, 1); - ff[2] = dz_extend(dz, q, &ff[0], 1, "\x0\x3\x1\x1", 4, 2); - ff[3] = dz_extend(dz, q, &ff[1], 2, "\x0\x2\x0\x1", 4, 3); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ + ff[1] = dz_extend(dz, q, &ff[0], 1, "\x3\x3\x2\x3", 4, 1, aln_init.xt); + ff[2] = dz_extend(dz, q, &ff[0], 1, "\x0\x3\x1\x1", 4, 2, aln_init.xt); + ff[3] = dz_extend(dz, q, &ff[1], 2, "\x0\x2\x0\x1", 4, 3, aln_init.xt); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ /* insertion: -, 4 -> 5 */ - ff[4] = dz_extend(dz, q, &ff[3], 1, "\x3", 1, 4); - ff[5] = dz_extend(dz, q, &ff[3], 2, "\x3\x3\x1\x3\x0", 5, 5); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ + ff[4] = dz_extend(dz, q, &ff[3], 1, "\x3", 1, 4, aln_init.xt); + ff[5] = dz_extend(dz, q, &ff[3], 2, "\x3\x3\x1\x3\x0", 5, 5, aln_init.xt); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ /* SNVs: 6, 7, 8 -> 9 */ - ff[6] = dz_extend(dz, q, &ff[5], 1, "\x0", 1, 6); - ff[7] = dz_extend(dz, q, &ff[5], 1, "\x1", 1, 7); - ff[8] = dz_extend(dz, q, &ff[5], 1, "\x2", 1, 8); - ff[9] = dz_extend(dz, q, &ff[6], 3, "\x1\x0\x1\x2\x2", 5, 9); + ff[6] = dz_extend(dz, q, &ff[5], 1, "\x0", 1, 6, aln_init.xt); + ff[7] = dz_extend(dz, q, &ff[5], 1, "\x1", 1, 7, aln_init.xt); + ff[8] = dz_extend(dz, q, &ff[5], 1, "\x2", 1, 8, aln_init.xt); + ff[9] = dz_extend(dz, q, &ff[6], 3, "\x1\x0\x1\x2\x2", 5, 9, aln_init.xt); /* detect max */ struct dz_forefront_s const *max = NULL; diff --git a/example.c b/example.c index b6dc046..800d631 100644 --- a/example.c +++ b/example.c @@ -34,6 +34,7 @@ int main(int argc, char *argv[]) /* init score matrix and memory arena */ int8_t const M = 2, X = 
-3, GI = 5, GE = 1; /* match, mismatch, gap open, and gap extend; g(k) = GI + k + GE for k-length gap */ int8_t const xdrop_threshold = 70, full_length_bonus = 10; + int8_t max_gap_length = (xdrop_threshold - GI) / GE; int8_t const score_matrix[16] = { /* ref-side */ /* A C G T */ @@ -46,15 +47,16 @@ int main(int argc, char *argv[]) score_matrix, GI, GE ); - // TODO: does xdrop_threshold go in anymore? + /* Initialize root of alignment */ + struct dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); /* pack query */ char const *query = "ACACTTCTAGACTTTACCACTA"; struct dz_query_s const *q = dz_pack_query_forward( - dz, - query, /* char const *seq: query sequence */ - full_length_bonus, - strlen(query) /* length */ + dz, + query, /* char const *seq: query sequence */ + full_length_bonus, + strlen(query) /* length */ ); /* init node array */ @@ -62,25 +64,26 @@ int main(int argc, char *argv[]) /* fill root: node 0 */ ff[0] = dz_extend( - dz, q, - dz_root(dz), 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; dz_root(dz) for root node */ - "ACAC", strlen("ACAC"), 0 /* reference-side sequence, its length, and node id */ + dz, q, + &aln_init.root, 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; */ + "ACAC", strlen("ACAC"), 0, /* reference-side sequence, its length, and node id */ + aln_init.xt /* x-drop threshold */ ); /* branching paths: 1, 2 -> 3 */ - ff[1] = dz_extend(dz, q, &ff[0], 1, "TTGT", strlen("TTGT"), 1); - ff[2] = dz_extend(dz, q, &ff[0], 1, "ATCC", strlen("ATCC"), 2); - ff[3] = dz_extend(dz, q, &ff[1], 2, "AGAC", strlen("AGAC"), 3); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ + ff[1] = dz_extend(dz, q, &ff[0], 1, "TTGT", strlen("TTGT"), 1, aln_init.xt); + ff[2] = dz_extend(dz, q, &ff[0], 1, "ATCC", strlen("ATCC"), 2, aln_init.xt); + ff[3] = dz_extend(dz, q, &ff[1], 2, "AGAC", strlen("AGAC"), 3, aln_init.xt); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ /* insertion: -, 4 -> 5 */ - ff[4] = dz_extend(dz, q, &ff[3], 1, "T", strlen("T"), 4); - ff[5] = dz_extend(dz, q, &ff[3], 2, "TTCTA", strlen("TTCTA"), 5); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ + ff[4] = dz_extend(dz, q, &ff[3], 1, "T", strlen("T"), 4, aln_init.xt); + ff[5] = dz_extend(dz, q, &ff[3], 2, "TTCTA", strlen("TTCTA"), 5, aln_init.xt); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ /* SNVs: 6, 7, 8 -> 9 */ - ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6); - ff[7] = dz_extend(dz, q, &ff[5], 1, "C", strlen("C"), 7); - ff[8] = dz_extend(dz, q, &ff[5], 1, "G", strlen("G"), 8); - ff[9] = dz_extend(dz, q, &ff[6], 3, "CACGG", strlen("CACGG"), 9); + ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6, aln_init.xt); + ff[7] = dz_extend(dz, q, &ff[5], 1, "C", strlen("C"), 7, aln_init.xt); + ff[8] = dz_extend(dz, q, &ff[5], 1, "G", strlen("G"), 8, aln_init.xt); + ff[9] = dz_extend(dz, q, &ff[6], 3, "CACGG", strlen("CACGG"), 9, aln_init.xt); /* detect max */ struct dz_forefront_s const *max = NULL; diff --git a/example.protein.c b/example.protein.c index 2bb78c2..6815453 100644 --- a/example.protein.c +++ b/example.protein.c @@ -36,6 +36,7 @@ int main(int argc, char *argv[]) /* init score matrix and memory arena */ int8_t const GI = 5, GE = 1; /* match, mismatch, gap open, and gap extend; g(k) = GI + k + GE for k-length gap */ int8_t const xdrop_threshold = 70, full_length_bonus = 10; + int8_t max_gap_length = (xdrop_threshold - GI) / GE; int8_t const 
raw_score_matrix[24 * 24] = { /* BLOSUM62 obtained from https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt */ /* ref-side */ @@ -80,17 +81,18 @@ int main(int argc, char *argv[]) /* create context */ struct dz_s *dz = dz_init( score_matrix, - GI, GE, - xdrop_threshold, - full_length_bonus + GI, GE ); + /* Initialize root of alignment */ + struct dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); /* pack query */ char const *query = "MSALLILALVGAAVAPLEDDDK"; struct dz_query_s const *q = dz_pack_query_forward( - dz, - query, /* char const *seq: query sequence */ - strlen(query) /* length */ + dz, + query, /* char const *seq: query sequence */ + full_length_bonus, + strlen(query) /* length */ ); /* init node array */ @@ -98,25 +100,26 @@ int main(int argc, char *argv[]) /* fill root: node 0 */ ff[0] = dz_extend( - dz, q, - dz_root(dz), 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; dz_root(dz) for root node */ - "MSAL", strlen("MSAL"), 0 /* reference-side sequence, its length, and node id */ + dz, q, + &aln_init.root, 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; dz_root(dz) for root node */ + "MSAL", strlen("MSAL"), 0, /* reference-side sequence, its length, and node id */ + aln_init.xt /* x-drop threshold */ ); /* branching paths: 1, 2 -> 3 */ - ff[1] = dz_extend(dz, q, &ff[0], 1, "LILA", strlen("LILA"), 1); - ff[2] = dz_extend(dz, q, &ff[0], 1, "KKLG", strlen("KKLG"), 2); - ff[3] = dz_extend(dz, q, &ff[1], 2, "LVGA", strlen("LVGA"), 3); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ + ff[1] = dz_extend(dz, q, &ff[0], 1, "LILA", strlen("LILA"), 1, aln_init.xt); + ff[2] = dz_extend(dz, q, &ff[0], 1, "KKLG", strlen("KKLG"), 2, aln_init.xt); + ff[3] = dz_extend(dz, q, &ff[1], 2, "LVGA", strlen("LVGA"), 3, aln_init.xt); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ /* insertion: -, 4 -> 5 */ - ff[4] = dz_extend(dz, q, &ff[3], 1, "A", strlen("A"), 4); - ff[5] = dz_extend(dz, q, &ff[3], 2, "AVAFP", strlen("AVAFP"), 5); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ + ff[4] = dz_extend(dz, q, &ff[3], 1, "A", strlen("A"), 4, aln_init.xt); + ff[5] = dz_extend(dz, q, &ff[3], 2, "AVAFP", strlen("AVAFP"), 5, aln_init.xt); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ /* SNVs: 6, 7, 8 -> 9 */ - ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6); - ff[7] = dz_extend(dz, q, &ff[5], 1, "E", strlen("E"), 7); - ff[8] = dz_extend(dz, q, &ff[5], 1, "M", strlen("M"), 8); - ff[9] = dz_extend(dz, q, &ff[6], 3, "EDDCL", strlen("EDDCL"), 9); + ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6, aln_init.xt); + ff[7] = dz_extend(dz, q, &ff[5], 1, "E", strlen("E"), 7, aln_init.xt); + ff[8] = dz_extend(dz, q, &ff[5], 1, "M", strlen("M"), 8, aln_init.xt); + ff[9] = dz_extend(dz, q, &ff[6], 3, "EDDCL", strlen("EDDCL"), 9, aln_init.xt); /* detect max */ struct dz_forefront_s const *max = NULL; From ead027d07e6e4c7273b823df30d6caf3708b34e9 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 11:03:29 -0400 Subject: [PATCH 25/43] Add allocator documentation and avoid possible free space overflow --- dozeu.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/dozeu.h b/dozeu.h index 757b18f..6a747de 100644 --- a/dozeu.h +++ b/dozeu.h @@ -263,9 +263,27 @@ struct dz_alignment_s { }; /* context (constants and working buffers) */ + +/* + * Header for a block of 
memory used in an arena. + * + * Exists at the beginning of the block. + */ struct dz_mem_block_s { struct dz_mem_block_s *next; size_t size; }; +/* + * Bookkeeping information for the arena allocator. + * + * Exists at the beginning of the first block, just after its block header. + */ struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t _pad[3]; }; +/* + * Represents a Dozeu memory allocation arena. + * + * Exists at the beginning of the arena's first block, and contains its block + * header and also its stack bookkeeping information. + */ struct dz_mem_s { struct dz_mem_block_s blk; struct dz_stack_s stack; }; +/* Compute the number of bytes available to allocate in the current active block of an arena. */ #define dz_mem_stack_rem(_mem) ( (size_t)((_mem)->stack.end - (_mem)->stack.top) ) struct dz_s { @@ -372,6 +390,13 @@ unittest() { * @fn dz_mem_init, dz_mem_destroy, dz_mem_add_stack, dz_mem_malloc, dz_mem_flush * @brief stack chain */ + +/* + * "Flush" a memory arena. + * + * Retain all blocks allocated from the system allocator, but internally mark + * all the space as free. + */ static __dz_force_inline void dz_mem_flush( struct dz_mem_s *mem) @@ -383,6 +408,15 @@ void dz_mem_flush( mem->stack.end = (uint8_t *)mem + mem->blk.size;// - DZ_MEM_MARGIN_SIZE; return; } + +/* + * Create a new memory arena. + * + * The arena will initially have space for internal allocation structures, and + * at least the given number of bytes of user data. + * + * If allocation fails, NULL is returned. + */ static __dz_force_inline struct dz_mem_s *dz_mem_init( size_t size) @@ -399,6 +433,12 @@ struct dz_mem_s *dz_mem_init( dz_mem_flush(mem); return(mem); } + +/* + * Free a memory arena created with dz_mem_init(). + * + * The arena must not be NULL. + */ static __dz_force_inline void dz_mem_destroy( struct dz_mem_s *mem) @@ -413,6 +453,18 @@ void dz_mem_destroy( dz_free(mem); return; } + +/* + * Advance a memory arena to a next block that can allocate the given number of bytes. + * + * Always advances to a new block; assumes sufficient space is definitely not + * available in the current block. + * + * Handles creating new blocks for the arena, or replacing old blocks that are + * not large enough. + * + * Returns 0 on success, and 1 if memory could not be allocated. + */ static __dz_force_inline uint64_t dz_mem_add_stack( struct dz_mem_s *mem, @@ -453,20 +505,38 @@ uint64_t dz_mem_add_stack( mem->stack.end = (uint8_t *)mem->stack.curr + mem->stack.curr->size - DZ_MEM_MARGIN_SIZE; return(0); } + +/* + * Allocate memory from an arena. + * + * Returns NULL if memory could not be allocated. + */ static __dz_force_inline void *dz_mem_malloc( struct dz_mem_s *mem, size_t size) { + /* Make sure to maintain memory alignment. + * + * We need to precompute the aligned size instead of just bumping top by + * the aligned size, because we need to make sure the alignment padding + * memory is actually available in the space between top and end. + * Otherwise, top could pass end, and dz_mem_stack_rem() could overflow. + */ + size = dz_roundup(size, sizeof(__m128i)); + // TODO: this doesn't seem to check to make sure the size allocated is big enough... // also, where does 4096 come from? // also, don't we need to provide the size to ensure that a large enough block is allocated? //if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } if(dz_mem_stack_rem(mem) < size) { - dz_mem_add_stack(mem, size); + if(dz_mem_add_stack(mem, size)) { + /* Report a failed allocation. 
*/ + return NULL; + } } void *ptr = (void *)mem->stack.top; - mem->stack.top += dz_roundup(size, sizeof(__m128i)); + mem->stack.top += size; return(ptr); } From 0115be5410216224ee09c86ada05fc63bebcd80c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 12:08:21 -0400 Subject: [PATCH 26/43] Start allocating a bit with runs --- dozeu.h | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/dozeu.h b/dozeu.h index 6a747de..ce9876a 100644 --- a/dozeu.h +++ b/dozeu.h @@ -525,10 +525,6 @@ void *dz_mem_malloc( */ size = dz_roundup(size, sizeof(__m128i)); - // TODO: this doesn't seem to check to make sure the size allocated is big enough... - // also, where does 4096 come from? - // also, don't we need to provide the size to ensure that a large enough block is allocated? - //if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); } if(dz_mem_stack_rem(mem) < size) { if(dz_mem_add_stack(mem, size)) { /* Report a failed allocation. */ @@ -539,6 +535,41 @@ void *dz_mem_malloc( mem->stack.top += size; return(ptr); } + +/* + * Prepare a contiguous run of memory from an arena. + * + * Returns 0 if successful, and 1 on failure. + */ +static __dz_force_inline +uint64_t dz_mem_run( + struct dz_mem_s *mem, + size_t size) +{ + /* We cheat and just use a whole block as a run. */ + if(dz_mem_stack_rem(mem) < size) { + return dz_mem_add_stack(mem, size); + } + return 0; +} + +/* + * Allocate a number of bytes from the current contiguous run. + * + * Returns a pointer to them, or NULL if they do not fit. + */ +static __dz_force_inline +void *dz_mem_from_run( + struct dz_mem_s *mem, + size_t size) +{ + if(dz_mem_stack_rem(mem) < size) { + return NULL; + } + void *ptr = (void *)mem->stack.top; + mem->stack.top += size; + return(ptr); +} #endif // DZ_INCLUDE_ONCE @@ -585,8 +616,8 @@ unittest() { #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ - /* allocate from heap */ \ - if(dz_mem_stack_rem(dz_mem(self)) < next_req) { dz_mem_add_stack(dz_mem(self), next_req); } /* 0); }*/ \ + /* reserve run from heap */ \ + dz_mem_run(dz_mem(self), next_req); \ /* push head-cap */ \ struct dz_cap_s *cap = _init_cap(_adj, 0xff, _forefronts, _n_forefronts); \ /* return array pointer */ \ From f2eab2c97bba262a64ed9013ec251a863daa2f7f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 12:09:54 -0400 Subject: [PATCH 27/43] Change dst semantics --- dozeu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dozeu.h b/dozeu.h index ce9876a..5692cf2 100644 --- a/dozeu.h +++ b/dozeu.h @@ -601,11 +601,11 @@ unittest() { #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \ /* push forefront pointers */ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \ - struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem(self)->stack.top + forefront_arr_size); \ + struct dz_forefront_s const **dst = ((struct dz_forefront_s const **)(dz_mem(self)->stack.top + forefront_arr_size)) - ((int64_t)(_n_forefronts)); \ struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \ - for(size_t i = 0; i < (_n_forefronts); i++) { dst[-((int64_t)(_n_forefronts)) + i] = src[i]; } \ + for(size_t i = 0; i < (_n_forefronts); i++) { dst[i] = src[i]; } \ /* push head-cap info */ \ - struct dz_head_s *_head = (struct dz_head_s *)dst; \ 
+ struct dz_head_s *_head = (struct dz_head_s *)(dst + ((int64_t)(_n_forefronts))); \ (_head)->r.spos = (_adj); /* save merging adjustment */ \ (_head)->r.epos = 0; /* head marked as zero */ \ (_head)->rch = (_rch); /* rch for the first column */ \ From 9e65cb4bfd6171f513af77c21b2f6c89be3ff4d8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 12:33:00 -0400 Subject: [PATCH 28/43] Explain more about caps --- dozeu.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/dozeu.h b/dozeu.h index 5692cf2..feca31b 100644 --- a/dozeu.h +++ b/dozeu.h @@ -200,14 +200,20 @@ struct dz_node_s { int32_t id, len; uint8_t const *ptr; }; dz_static_assert(sizeof(struct dz_node_s) % sizeof(__m128i) == 0); /* DP matrix structures */ -struct dz_swgv_s { __m128i e, f, s; }; /* followed by dz_cap_s */ -struct dz_range_s { uint32_t spos, epos; }; /* placed just after every score vector to indicate the length */ +/* Score vector item. Array of these will be followed by a dz_cap_s, which leads with a range. */ +struct dz_swgv_s { __m128i e, f, s; }; +/* placed just after every score vector (as part of a cap) to indicate the length */ +struct dz_range_s { uint32_t spos, epos; }; + +/* A special kind of initial cap. Identical in size to a cap. */ struct dz_head_s { struct dz_range_s r; uint32_t rch, n_forefronts; }; -struct dz_cap_s { /* followed by dz_forefront_s; spos and epos are shared to forefront_s */ + +/* followed by dz_forefront_s; range spos and epos are "shared to" (???) forefront_s */ +struct dz_cap_s { struct dz_range_s r; uint32_t rch; int32_t rrem; }; @@ -598,6 +604,9 @@ unittest() { /* debug("est_column_size(%lu), next_req(%lu)", est_column_size, next_req); */ \ next_req; \ }) +/* + * Save an array of forefront pointers to the stack, followed by a head cap that indicates the number of forefronts before it. + */ #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \ /* push forefront pointers */ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \ @@ -613,6 +622,8 @@ unittest() { debug("create head cap(%p), n_forefronts(%lu)", _head, (uint64_t)(_n_forefronts)); \ dz_cap(_head); \ }) +/* + * Reserve space for a column and put a cap */ #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ From 614e35ab3699923b774b989e53ef4a24d5007ea0 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 13:18:56 -0400 Subject: [PATCH 29/43] Explain about the above-the-stack range logic we use --- dozeu.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/dozeu.h b/dozeu.h index feca31b..1b4c632 100644 --- a/dozeu.h +++ b/dozeu.h @@ -597,6 +597,10 @@ unittest() { /** * vector update macros */ + +/* + * Determine how many bytes are needed to store forefront pointers, the cap, and a column vector. + */ #define _calc_next_size(_sp, _ep, _nt) ({ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \ size_t est_column_size = 2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s); \ @@ -623,7 +627,16 @@ unittest() { dz_cap(_head); \ }) /* - * Reserve space for a column and put a cap */ + * Reserve space for the forefront pointers, head cap, and column vector, in + * that order. + * + * Return the slice array address, for which the _spos to _epos range + * corresponds to the column vector. 
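 * (Editorial illustration, not in the original patch: the trailing "- (_spos)"
 * keeps indexing in absolute vector coordinates, so a caller can write, e.g.,
 *
 *     struct dz_swgv_s *dp = _begin_column_head(spos, epos, adj, forefronts, n_forefronts);
 *     for(uint64_t p = spos; p < epos; p++) { ... fill dp[p] ... }
 *
 * even though only the cells in [spos, epos) are actually backed by memory.)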
+ * + * The returned pointer must be passed to _end_column() before _begin_column() + * can be called. The final column's slice array address must be passed to + * _end_matrix() before the arena can be used again for something else. + */ #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ @@ -634,6 +647,19 @@ unittest() { /* return array pointer */ \ (struct dz_swgv_s *)(cap + 1) - (_spos); \ }) +/* + * Fill in a terminating cap for the most recent column passed to _end_column. + * Either immediately after it or in a new contiguous run of memory, reserve + * space for a new column. If the new column is not immediately after the old + * column, copy a (fake?) version of the old column's cap to be before it. + * + * Return the slice array address, for which the _spos to _epos range + * corresponds to the column vector. + * + * The returned pointer must be passed to _end_column() before another column + * can be begin, and the last column must be passed to _end_matrix() before the + * arena can be used again. + */ #define _begin_column(_w, _rch, _rlen) ({ \ /* push cap info */ \ struct dz_cap_s *cap = dz_cap(dz_mem(self)->stack.top); \ @@ -651,6 +677,18 @@ unittest() { /* return array pointer */ \ (struct dz_swgv_s *)(cap + 1) - (_w).fr.spos; \ }) +/* + * Given the slice array address for a column begun with _begin_column() or + * _begin_column_head(), finish the column, and allow the allocator to be used + * again. + * + * Note that we have actually used the _spos to _epos part of the column's + * vector, and save range data after that vector recording the range its slice + * covers. + * + * Returns the address of the filled-in range, which it leaves just above the + * allocator stack. + */ #define _end_column(_p, _spos, _epos) ({ \ /* write back the stack pointer and return a cap */ \ struct dz_range_s *r = dz_range(&dz_swgv(_p)[(_epos)]); \ @@ -659,6 +697,13 @@ unittest() { r->spos = (_spos); r->epos = (_epos); \ (struct dz_cap_s *)r; \ }) +/* + * Given the slice array address of the final column, which has been ended (so + * its filled-in range is above the allocator stack), turn that filled-in range + * into a full forefront on top of the allocator stack. + * + * After calling this, the arena can be used again. + */ #define _end_matrix(_p, _wp, _rrem) ({ \ /* create forefront object */ \ struct dz_forefront_s *forefront = dz_forefront(&(dz_swgv(_p))[(_wp)->r.epos]); \ From 18f98e7e4299d9b481efcfdcfebda670ae912917 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 13:25:37 -0400 Subject: [PATCH 30/43] Clarify the loop relationship --- dozeu.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dozeu.h b/dozeu.h index 1b4c632..f552f17 100644 --- a/dozeu.h +++ b/dozeu.h @@ -648,7 +648,8 @@ unittest() { (struct dz_swgv_s *)(cap + 1) - (_spos); \ }) /* - * Fill in a terminating cap for the most recent column passed to _end_column. + * Fill in a terminating cap for the most recent column passed to _end_column, + * using the range information already above the allocator stack. * Either immediately after it or in a new contiguous run of memory, reserve * space for a new column. If the new column is not immediately after the old * column, copy a (fake?) version of the old column's cap to be before it. 
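Taken together, the comments above describe a fixed calling protocol for these column macros. The sketch below is an editorial condensation, not code from the patch: argument lists are simplified, the fill loops are elided, and w, rch, and rrem stand for the working forefront, the current reference base, and the remaining reference length used elsewhere in dozeu.h. It mirrors what _merge_column(), _fill_column(), and dz_extend_intl() do:

	/* merge the incoming forefronts into a head column */
	struct dz_swgv_s *dp = _begin_column_head(spos, epos, adj, forefronts, n_forefronts);
	/* ... initialize dp[spos..epos) from the incoming forefronts ... */
	_end_column(dp, spos, epos);              /* leaves the used range just above the stack */

	while(rrem != 0 && w.fr.spos < w.fr.epos) {   /* one column per reference base */
		dp = _begin_column(w, rch, rrem);     /* turns that range into a cap and reserves the next slice */
		/* ... fill dp[w.fr.spos..w.fr.epos) and update the X-drop band ... */
		_end_column(dp, w.r.spos, w.r.epos);
	}

	/* turn the last pending range into a forefront; the arena can then be used for other work */
	struct dz_forefront_s const *forefront = _end_matrix(dp, &w, rrem);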
@@ -679,15 +680,15 @@ unittest() { }) /* * Given the slice array address for a column begun with _begin_column() or - * _begin_column_head(), finish the column, and allow the allocator to be used - * again. + * _begin_column_head(), finish the column. * * Note that we have actually used the _spos to _epos part of the column's * vector, and save range data after that vector recording the range its slice * covers. * * Returns the address of the filled-in range, which it leaves just above the - * allocator stack. + * allocator stack, to be made into a cap by _begin_column() or into a + * forefront by _end_matrix(). */ #define _end_column(_p, _spos, _epos) ({ \ /* write back the stack pointer and return a cap */ \ From 2fe9393a9af4e7e9efbe53b719040238cd19edef Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 13:49:02 -0400 Subject: [PATCH 31/43] Explain matrix structure more --- dozeu.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/dozeu.h b/dozeu.h index f552f17..410febc 100644 --- a/dozeu.h +++ b/dozeu.h @@ -201,6 +201,29 @@ dz_static_assert(sizeof(struct dz_node_s) % sizeof(__m128i) == 0); /* DP matrix structures */ +/* + * The Dozeu DP matrix for a node looks like: + * - A head column, containing + * - An array of pointers to forefronts for preceeding nodes + * - A head cap (same size as a normal cap) + * - 0 or more internal columns, which are: + * - An array slice of score vector items + * - A cap, containing: + * - The range of the array slice + * - Other data + * - A final column, which is: + * - An array slice of score vector items + * - A forefront (a special final cap), containing + * - The range of the array slice + * - Other data + * + * So, each slice has the cap for the preceeding slice before it, and its own + * range stored immediately after it. This is because we dynamically stop + * slices early, and we put the final range actually used once we know it. + * + * We are able to break this structure in memory! If the next column's slice and cap could be too big to fit right after the previous column in the contiguous memory available, we start a new head column with the previous column's cap as a forefront, and all the forefront data except the cap left uninitialized. + */ + /* Score vector item. Array of these will be followed by a dz_cap_s, which leads with a range. */ struct dz_swgv_s { __m128i e, f, s; }; /* placed just after every score vector (as part of a cap) to indicate the length */ @@ -217,6 +240,11 @@ struct dz_cap_s { struct dz_range_s r; uint32_t rch; int32_t rrem; }; +/* + * A forefront. Appears at the end of a matrix. + * + * Is also a special kind of cap. + */ struct dz_forefront_s { struct dz_range_s r; // the range that applies to the preceding column uint32_t rid; @@ -670,7 +698,8 @@ unittest() { cap->rch = (_rch); /* record rch for the next column */ \ cap->rrem = (_rlen); /* record rlen for use in traceback */ \ if(dz_likely(dz_mem_stack_rem(dz_mem(self)) < next_req)) { \ - /* TODO: editing to make sure we ask for enough memory */ \ + /* We start a new head column with the previous column's cap pointed to as if it were a forefront. 
*/ + /* TODO: editing to make sure we ask for enough memory */ \ dz_mem_add_stack(dz_mem(self), next_req); /* 0); }*/ \ cap = _init_cap(0, _rch, &cap, 1); \ } \ From 46e9820f059fd315057b054d8ddfe5c420314351 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 15:28:37 -0400 Subject: [PATCH 32/43] Refactor around a stream with reservations and break traceback --- dozeu.h | 183 +++++++++++++++++++++++++++++++++++++++++++----------- example.c | 1 + 2 files changed, 148 insertions(+), 36 deletions(-) diff --git a/dozeu.h b/dozeu.h index 410febc..85ef0f0 100644 --- a/dozeu.h +++ b/dozeu.h @@ -221,7 +221,11 @@ dz_static_assert(sizeof(struct dz_node_s) % sizeof(__m128i) == 0); * range stored immediately after it. This is because we dynamically stop * slices early, and we put the final range actually used once we know it. * - * We are able to break this structure in memory! If the next column's slice and cap could be too big to fit right after the previous column in the contiguous memory available, we start a new head column with the previous column's cap as a forefront, and all the forefront data except the cap left uninitialized. + * We are able to break this structure in memory! If the next column's slice + * and cap could be too big to fit right after the previous column in the + * contiguous memory available, we start a new head column with the previous + * column's cap as a forefront, and all the forefront data except the cap left + * uninitialized. We call this an "internal bridge". */ /* Score vector item. Array of these will be followed by a dz_cap_s, which leads with a range. */ @@ -309,7 +313,7 @@ struct dz_mem_block_s { struct dz_mem_block_s *next; size_t size; }; * * Exists at the beginning of the first block, just after its block header. */ -struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t _pad[3]; }; +struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t being_allocated; uint64_t _pad[2]; }; /* * Represents a Dozeu memory allocation arena. * @@ -440,6 +444,7 @@ void dz_mem_flush( // TODO: i think this margin is redundant, since dz_malloc adds the margin past the end // of the requested size mem->stack.end = (uint8_t *)mem + mem->blk.size;// - DZ_MEM_MARGIN_SIZE; + mem->stack.being_allocated = 0; return; } @@ -562,6 +567,7 @@ void *dz_mem_malloc( if(dz_mem_stack_rem(mem) < size) { if(dz_mem_add_stack(mem, size)) { /* Report a failed allocation. */ + debug("Allocation failed"); return NULL; } } @@ -576,34 +582,128 @@ void *dz_mem_malloc( * Returns 0 if successful, and 1 on failure. */ static __dz_force_inline -uint64_t dz_mem_run( +uint64_t dz_mem_stream_reserve( struct dz_mem_s *mem, size_t size) { + debug("Reserve %lu bytes", size); /* We cheat and just use a whole block as a run. */ - if(dz_mem_stack_rem(mem) < size) { + if(dz_likely(dz_mem_stack_rem(mem) < size)) { return dz_mem_add_stack(mem, size); } return 0; } /* - * Allocate a number of bytes from the current contiguous run. + * Get the amount of stream reservation remaining. + */ +static __dz_force_inline +uint64_t dz_mem_stream_remaining( + struct dz_mem_s *mem) +{ + return dz_mem_stack_rem(mem); +} + +/* + * Begin an allocation of up to the given number of bytes from the current + * stream reservation. * * Returns a pointer to them, or NULL if they do not fit. 
*/ static __dz_force_inline -void *dz_mem_from_run( +void *dz_mem_stream_alloc_begin( struct dz_mem_s *mem, size_t size) { - if(dz_mem_stack_rem(mem) < size) { + if(dz_mem_stack_rem(mem) < size || mem->stack.being_allocated != 0) { + debug("Not allowed"); return NULL; } void *ptr = (void *)mem->stack.top; - mem->stack.top += size; + debug("Begin allocating %lu bytes (%p)", size, ptr); + /* Reserve this area above the stack */ + mem->stack.being_allocated = size; return(ptr); } + +/* + * Get the start address of the active allocation. + * + * Returns NULL if no allocation is active. + */ +static __dz_force_inline +void *dz_mem_stream_alloc_current( + struct dz_mem_s *mem) +{ + if(mem->stack.being_allocated == 0) { + debug("Nothing currently being allocated"); + return NULL; + } + return (void *)mem->stack.top; +} + + +/* + * End an active allocation, having used the given number of bytes. + * + * Returns 0 on success, or 1 if too many bytes were used. + */ +static __dz_force_inline +uint64_t dz_mem_stream_alloc_end( + struct dz_mem_s *mem, + size_t size) +{ + debug("Finish allocating %lu bytes (%p)", size, mem->stack.top); + if(size > mem->stack.being_allocated) { + /* Too much memory was used vs. what we set aside for the stream. */ + debug("That was too many"); + return 1; + } + mem->stack.top += size; + mem->stack.being_allocated = 0; + return 0; +} + +/* + * Allocate the given number of bytes from the current stream reservation. + * + * Returns a pointer to them, or NULL if they do not fit. + */ +static __dz_force_inline +void *dz_mem_stream_alloc( + struct dz_mem_s *mem, + size_t size) +{ + /* TODO: Optimize instead of using the async primitives. */ + void *ptr = dz_mem_stream_alloc_begin(mem, size); + if(dz_mem_stream_alloc_end(mem, size)) { + ptr = NULL; + } + return ptr; +} + +/* + * Allocate the given number of items of the given size from the current + * reservation, right-justified, in a block aligned to the given alignemnt. + * + * Returns a pointer to them, or NULL if they do not fit. + */ +static __dz_force_inline +void *dz_mem_stream_right_alloc( + struct dz_mem_s *mem, + size_t items, + size_t size, + size_t alignment) +{ + size_t data_size = size * items; + size_t alloc_size = dz_roundup(data_size, alignment); + + void *ptr = dz_mem_stream_alloc(mem, alloc_size); + if(ptr == NULL) { + return ptr; + } + return (void*)((uint8_t*)ptr + (alloc_size - data_size)); +} #endif // DZ_INCLUDE_ONCE @@ -626,23 +726,27 @@ unittest() { * vector update macros */ +#define _calc_max_slice_size(_sp, _ep) (2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s)) /* * Determine how many bytes are needed to store forefront pointers, the cap, and a column vector. */ #define _calc_next_size(_sp, _ep, _nt) ({ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \ - size_t est_column_size = 2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s); \ + size_t est_column_size = _calc_max_slice_size(_sp, _ep); \ size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_cap_s); \ /* debug("est_column_size(%lu), next_req(%lu)", est_column_size, next_req); */ \ next_req; \ }) /* - * Save an array of forefront pointers to the stack, followed by a head cap that indicates the number of forefronts before it. + * Save an array of forefront pointers to the stack, followed by a head cap + * that indicates the number of forefronts before it. + * + * Space must have been reserved in the arena. 
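 * (Editorial illustration of the reservation protocol, not in the original patch;
 * the byte counts and variable names here are made up. A column is reserved as a
 * whole and then carved out of the reservation in address order, roughly:
 *
 *     dz_mem_stream_reserve(mem, total_bytes_for_this_column);
 *     fptrs = dz_mem_stream_right_alloc(mem, n_forefronts, sizeof(struct dz_forefront_s *), sizeof(__m128i));
 *     head  = dz_mem_stream_alloc(mem, sizeof(struct dz_head_s));
 *     slice = dz_mem_stream_alloc_begin(mem, max_slice_bytes);
 *     ... fill the slice ...
 *     dz_mem_stream_alloc_end(mem, bytes_actually_used);
 *
 * which is roughly how _begin_column_head() and _end_column() below drive the
 * stream allocator.)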
*/ #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \ /* push forefront pointers */ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \ - struct dz_forefront_s const **dst = ((struct dz_forefront_s const **)(dz_mem(self)->stack.top + forefront_arr_size)) - ((int64_t)(_n_forefronts)); \ + struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem_stream_right_alloc(dz_mem(self), (int64_t)(_n_forefronts), sizeof(struct dz_forefront_s *), sizeof(__m128i))); \ struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \ for(size_t i = 0; i < (_n_forefronts); i++) { dst[i] = src[i]; } \ /* push head-cap info */ \ @@ -668,16 +772,18 @@ unittest() { #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ - /* reserve run from heap */ \ - dz_mem_run(dz_mem(self), next_req); \ + /* reserve stream of memory to push onto */ \ + dz_mem_stream_reserve(dz_mem(self), next_req); \ /* push head-cap */ \ struct dz_cap_s *cap = _init_cap(_adj, 0xff, _forefronts, _n_forefronts); \ + /* start array slice allocation */ \ + struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size(_spos, _epos))); \ /* return array pointer */ \ - (struct dz_swgv_s *)(cap + 1) - (_spos); \ + slice_data - (_spos); \ }) /* * Fill in a terminating cap for the most recent column passed to _end_column, - * using the range information already above the allocator stack. + * using the range information in the current active allocation. * Either immediately after it or in a new contiguous run of memory, reserve * space for a new column. If the new column is not immediately after the old * column, copy a (fake?) version of the old column's cap to be before it. @@ -691,21 +797,24 @@ unittest() { */ #define _begin_column(_w, _rch, _rlen) ({ \ /* push cap info */ \ - struct dz_cap_s *cap = dz_cap(dz_mem(self)->stack.top); \ - /* calculate sizes */ \ - size_t next_req = _calc_next_size((_w).fr.spos, (_w).fr.epos, 0); \ - /* allocate from heap */ \ + struct dz_cap_s *cap = dz_cap(dz_mem_stream_alloc_current(dz_mem(self))); \ + dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_cap_s)); \ cap->rch = (_rch); /* record rch for the next column */ \ cap->rrem = (_rlen); /* record rlen for use in traceback */ \ - if(dz_likely(dz_mem_stack_rem(dz_mem(self)) < next_req)) { \ - /* We start a new head column with the previous column's cap pointed to as if it were a forefront. */ - /* TODO: editing to make sure we ask for enough memory */ \ - dz_mem_add_stack(dz_mem(self), next_req); /* 0); }*/ \ + /* calculate size of next column if it is here */ \ + size_t next_req_here = _calc_next_size((_w).fr.spos, (_w).fr.epos, 0); \ + if(dz_likely(dz_mem_stream_remaining(dz_mem(self)) < next_req_here)) { \ + /* Next column will not fit here. */ \ + size_t next_req_split = _calc_next_size((_w).fr.spos, (_w).fr.epos, 1); \ + /* We start a new head column with the previous column's cap pointed to as if it were a forefront. 
*/ \ + dz_mem_stream_reserve(dz_mem(self), next_req_split); \ cap = _init_cap(0, _rch, &cap, 1); \ } \ debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ + /* start array slice allocation */ \ + struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size((_w).fr.spos, (_w).fr.epos))); \ /* return array pointer */ \ - (struct dz_swgv_s *)(cap + 1) - (_w).fr.spos; \ + slice_data - (_w).fr.spos; \ }) /* * Given the slice array address for a column begun with _begin_column() or @@ -715,28 +824,32 @@ unittest() { * vector, and save range data after that vector recording the range its slice * covers. * - * Returns the address of the filled-in range, which it leaves just above the - * allocator stack, to be made into a cap by _begin_column() or into a - * forefront by _end_matrix(). + * Returns the address of the filled-in range, which it stores in a current + * allocation, to be made into a cap by _begin_column() or into a forefront by + * _end_matrix(). */ #define _end_column(_p, _spos, _epos) ({ \ - /* write back the stack pointer and return a cap */ \ - struct dz_range_s *r = dz_range(&dz_swgv(_p)[(_epos)]); \ + /* finish the slice data allocation with what was actually used */ \ + dz_mem_stream_alloc_end(dz_mem(self), ((_epos) - (_spos)) * sizeof(struct dz_swgv_s)); \ + /* allocate up to a forefront, as a range */ \ + struct dz_range_s *r = dz_range(dz_mem_stream_alloc_begin(dz_mem(self), sizeof(struct dz_forefront_s))); \ debug("create range(%p), [%u, %u)", r, (_spos), (_epos)); \ - dz_mem(self)->stack.top = (uint8_t *)r; \ r->spos = (_spos); r->epos = (_epos); \ + /* Return it as a cap */ \ (struct dz_cap_s *)r; \ }) /* * Given the slice array address of the final column, which has been ended (so - * its filled-in range is above the allocator stack), turn that filled-in range - * into a full forefront on top of the allocator stack. + * its filled-in range is in an active forefront-sized allocation), turn that + * filled-in range into a full forefront. * - * After calling this, the arena can be used again. + * After calling this, no allocation will be active and no memory adjacency + * relationships will need to be maintained. 
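Taken together, the column macros impose a begin/end discipline on that protocol. A sketch of how one matrix is filled under these contracts (local names such as col, w, spos, and rrem are hypothetical; the per-cell fill loops are elided):

    /* Sketch only: pairing of the column macros for one reference segment. */
    struct dz_swgv_s *col = _begin_column_head(spos, epos, adj, forefronts, n_forefronts);
    /* ... fill col[spos .. epos) for the first column ... */
    _end_column(col, spos, epos);

    col = _begin_column(w, rch, rlen);        /* may bridge into a fresh memory block */
    /* ... fill col[w.fr.spos .. w.fr.epos) ... */
    _end_column(col, w.fr.spos, w.fr.epos);

    struct dz_forefront_s const *ff = _end_matrix(col, &w, rrem);   /* last column only */
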
*/ #define _end_matrix(_p, _wp, _rrem) ({ \ /* create forefront object */ \ - struct dz_forefront_s *forefront = dz_forefront(&(dz_swgv(_p))[(_wp)->r.epos]); \ + struct dz_forefront_s *forefront = dz_forefront(dz_mem_stream_alloc_current(dz_mem(self))); \ + dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_forefront_s)); \ forefront->rid = (_wp)->rid; \ forefront->rlen = (_wp)->rlen; \ forefront->fr = (_wp)->fr; \ @@ -747,8 +860,6 @@ unittest() { forefront->query = (_wp)->query; \ forefront->mcap = (_wp)->mcap; \ debug("create forefront(%p), [%u, %u), [%u, %u), max(%d), inc(%d), rlen(%d), query(%p), rrem(%d)", forefront, (_wp)->r.spos, (_wp)->r.epos, (_wp)->fr.spos, (_wp)->fr.epos, (_wp)->max, (_wp)->inc, (int32_t)(_wp)->rlen, (_wp)->query, (int32_t)(_rrem)); \ - /* write back stack pointer */ \ - dz_mem(self)->stack.top = (uint8_t *)(forefront + 1); \ /* return the forefront pointer */ \ (struct dz_forefront_s const *)forefront; \ }) diff --git a/example.c b/example.c index 800d631..9b75fa7 100644 --- a/example.c +++ b/example.c @@ -23,6 +23,7 @@ #include #define UNITTEST 1 +#define DEBUG 1 #define DZ_FULL_LENGTH_BONUS /* use full-length bonus feature */ #define DZ_CIGAR_OP 0x44493d58 /* 'D', 'I', '=', 'X'; the default is 0x04030201 */ #include "dozeu.h" From 370dab377b11c1e6eaa5ba601e87dbf1c6a2b35c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 15:48:23 -0400 Subject: [PATCH 33/43] Add debugging and remember to actually allocate the head cap --- dozeu.h | 12 +++++++++--- example.c | 1 - 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dozeu.h b/dozeu.h index 85ef0f0..883e7c4 100644 --- a/dozeu.h +++ b/dozeu.h @@ -702,7 +702,9 @@ void *dz_mem_stream_right_alloc( if(ptr == NULL) { return ptr; } - return (void*)((uint8_t*)ptr + (alloc_size - data_size)); + void* right_aligned_ptr = (void*)((uint8_t*)ptr + (alloc_size - data_size)); + debug("Allocate %lu items of %lu bytes each padded to %lu: block %p and right-aligned array %p", items, size, alignment, ptr, right_aligned_ptr); + return right_aligned_ptr; } #endif // DZ_INCLUDE_ONCE @@ -750,7 +752,7 @@ unittest() { struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \ for(size_t i = 0; i < (_n_forefronts); i++) { dst[i] = src[i]; } \ /* push head-cap info */ \ - struct dz_head_s *_head = (struct dz_head_s *)(dst + ((int64_t)(_n_forefronts))); \ + struct dz_head_s *_head = (struct dz_head_s *)(dz_mem_stream_alloc(dz_mem(self), sizeof(struct dz_head_s))); \ (_head)->r.spos = (_adj); /* save merging adjustment */ \ (_head)->r.epos = 0; /* head marked as zero */ \ (_head)->rch = (_rch); /* rch for the first column */ \ @@ -781,6 +783,9 @@ unittest() { /* return array pointer */ \ slice_data - (_spos); \ }) + +#define _unwind_cap(_c) ( dz_ccap(dz_cswgv(_c) - (_c)->r.epos + (_c)->r.spos) - 1 ) + /* * Fill in a terminating cap for the most recent column passed to _end_column, * using the range information in the current active allocation. @@ -805,12 +810,13 @@ unittest() { size_t next_req_here = _calc_next_size((_w).fr.spos, (_w).fr.epos, 0); \ if(dz_likely(dz_mem_stream_remaining(dz_mem(self)) < next_req_here)) { \ /* Next column will not fit here. */ \ + debug("create internal bridge"); \ size_t next_req_split = _calc_next_size((_w).fr.spos, (_w).fr.epos, 1); \ /* We start a new head column with the previous column's cap pointed to as if it were a forefront. 
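The right-justified allocation used for the forefront-pointer array is easiest to see with a picture. dz_mem_stream_right_alloc() pads the block up to the requested alignment and returns a pointer to the end-aligned array, so the head cap allocated next sits flush against the last pointer while the stack stays 16-byte aligned. A hypothetical layout for three 8-byte pointers padded to sizeof(__m128i) (addresses are invented for illustration):

    /* Sketch only: layout produced by
     * dz_mem_stream_right_alloc(mem, 3, sizeof(struct dz_forefront_s *), sizeof(__m128i)).
     *
     *   0x1000  block start (alloc_size = dz_roundup(3 * 8, 16) = 32 bytes)
     *   0x1000  8 bytes of padding
     *   0x1008  ff[0], ff[1], ff[2]                <- returned pointer
     *   0x1020  next stream allocation (the dz_head_s), still 16-byte aligned
     */
    struct dz_forefront_s const **dst = (struct dz_forefront_s const **)
        dz_mem_stream_right_alloc(mem, 3, sizeof(struct dz_forefront_s *), sizeof(__m128i));
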
*/ \ dz_mem_stream_reserve(dz_mem(self), next_req_split); \ cap = _init_cap(0, _rch, &cap, 1); \ } \ - debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ + debug("create column(%p) pcap(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, _unwind_cap(cap), (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ /* start array slice allocation */ \ struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size((_w).fr.spos, (_w).fr.epos))); \ /* return array pointer */ \ diff --git a/example.c b/example.c index 9b75fa7..800d631 100644 --- a/example.c +++ b/example.c @@ -23,7 +23,6 @@ #include #define UNITTEST 1 -#define DEBUG 1 #define DZ_FULL_LENGTH_BONUS /* use full-length bonus feature */ #define DZ_CIGAR_OP 0x44493d58 /* 'D', 'I', '=', 'X'; the default is 0x04030201 */ #include "dozeu.h" From 9282fc78cb6ad9777e9fff8679f1a47661864876 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 8 Sep 2023 15:49:01 -0400 Subject: [PATCH 34/43] Stop early unwinding --- dozeu.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dozeu.h b/dozeu.h index 883e7c4..fae2ad6 100644 --- a/dozeu.h +++ b/dozeu.h @@ -784,8 +784,6 @@ unittest() { slice_data - (_spos); \ }) -#define _unwind_cap(_c) ( dz_ccap(dz_cswgv(_c) - (_c)->r.epos + (_c)->r.spos) - 1 ) - /* * Fill in a terminating cap for the most recent column passed to _end_column, * using the range information in the current active allocation. @@ -816,7 +814,7 @@ unittest() { dz_mem_stream_reserve(dz_mem(self), next_req_split); \ cap = _init_cap(0, _rch, &cap, 1); \ } \ - debug("create column(%p) pcap(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, _unwind_cap(cap), (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ + debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ /* start array slice allocation */ \ struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size((_w).fr.spos, (_w).fr.epos))); \ /* return array pointer */ \ From 1e0d445c39879e59d86caec37414161e1162c936 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 18 Sep 2023 12:16:45 -0700 Subject: [PATCH 35/43] Add more logging and rip out unsafe debugging stack arrays --- dozeu.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dozeu.h b/dozeu.h index fae2ad6..379e74a 100644 --- a/dozeu.h +++ b/dozeu.h @@ -677,6 +677,7 @@ void *dz_mem_stream_alloc( /* TODO: Optimize instead of using the async primitives. 
*/ void *ptr = dz_mem_stream_alloc_begin(mem, size); if(dz_mem_stream_alloc_end(mem, size)) { + debug("Stream alloc failed"); ptr = NULL; } return ptr; @@ -700,6 +701,7 @@ void *dz_mem_stream_right_alloc( void *ptr = dz_mem_stream_alloc(mem, alloc_size); if(ptr == NULL) { + debug("Stream right alloc failed"); return ptr; } void* right_aligned_ptr = (void*)((uint8_t*)ptr + (alloc_size - data_size)); @@ -2201,7 +2203,6 @@ uint64_t dz_calc_max_pos(struct dz_forefront_s const *forefront) } \ _push_span(dz_cff(pcap)->rid); /* push segment info */ \ _score = _s(_l, pcap, idx); \ - *drp++ = ':'; *dqp++ = ':'; \ } \ /* return the reference-side base */ \ ref_length++; pcap->rch; \ @@ -2240,9 +2241,6 @@ struct dz_alignment_s *dz_trace( struct dz_query_s const *query = forefront->query; int32_t score = _s(s, pcap, idx), cnt[4] = { 0 }; - uint8_t debug_ref[1024], debug_query[1024]; - uint8_t *drp = debug_ref, *dqp = debug_query; - #ifdef DZ_QUAL_ADJ # define _pair_score(_self, _q, _r, _i) ( dz_qual_adj_pair_score((_self), (_q), (_r), (_i)) ) #else From 05412de10988640c90ece590ce2b844d98e35f2a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Oct 2023 10:58:34 -0700 Subject: [PATCH 36/43] Account for columns possibly needing to end in forefronts, and stop if reservations are wrong --- Makefile | 2 +- dozeu.h | 90 ++++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index fd6ebfe..7b149da 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC=gcc OFLAGS=-O3 -CFLAGS=$(OFLAGS) -std=c99 -Wall -Wno-unused-variable -Wno-unused-function -march=native +CFLAGS=$(OFLAGS) -std=c99 -Wall -Wno-unused-variable -Wno-unused-function -march=native -g all: example example.2bit example.protein diff --git a/dozeu.h b/dozeu.h index 379e74a..b8a5f3f 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,6 +1,6 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c //#ifdef DZ_QUAL_ADJ -//#define DEBUG +#define DEBUG //#endif //#define DZ_PRINT_VECTOR /** @@ -143,6 +143,13 @@ unittest() { debug("hello"); } # define trap() ; #endif +#define die(...) { \ + die_impl(__VA_ARGS__, ""); \ +} +#define die_impl(fmt, ...) { \ + fprintf(stderr, "[%s: %s(%d)] " fmt "%s\n", __FILE__, __func__, __LINE__, __VA_ARGS__); \ + exit(1); \ +} #if defined(DZ_NUCL_ASCII) # define DZ_UNITTEST_INDEX 0 @@ -313,7 +320,7 @@ struct dz_mem_block_s { struct dz_mem_block_s *next; size_t size; }; * * Exists at the beginning of the first block, just after its block header. */ -struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t being_allocated; uint64_t _pad[2]; }; +struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t being_allocated; uint64_t reservation_remaining; uint64_t _pad[1]; }; /* * Represents a Dozeu memory allocation arena. * @@ -445,6 +452,7 @@ void dz_mem_flush( // of the requested size mem->stack.end = (uint8_t *)mem + mem->blk.size;// - DZ_MEM_MARGIN_SIZE; mem->stack.being_allocated = 0; + mem->stack.reservation_remaining = 0; return; } @@ -567,8 +575,7 @@ void *dz_mem_malloc( if(dz_mem_stack_rem(mem) < size) { if(dz_mem_add_stack(mem, size)) { /* Report a failed allocation. 
*/ - debug("Allocation failed"); - return NULL; + die("Allocation of %lu bytes failed", size); } } void *ptr = (void *)mem->stack.top; @@ -586,11 +593,12 @@ uint64_t dz_mem_stream_reserve( struct dz_mem_s *mem, size_t size) { - debug("Reserve %lu bytes", size); + debug("Reserve %lu bytes for stream", size); /* We cheat and just use a whole block as a run. */ if(dz_likely(dz_mem_stack_rem(mem) < size)) { return dz_mem_add_stack(mem, size); } + mem->stack.reservation_remaining = size; return 0; } @@ -601,7 +609,7 @@ static __dz_force_inline uint64_t dz_mem_stream_remaining( struct dz_mem_s *mem) { - return dz_mem_stack_rem(mem); + return mem->stack.reservation_remaining; } /* @@ -616,11 +624,17 @@ void *dz_mem_stream_alloc_begin( size_t size) { if(dz_mem_stack_rem(mem) < size || mem->stack.being_allocated != 0) { - debug("Not allowed"); - return NULL; + if (dz_mem_stack_rem(mem) < size) { + die("Asked to allocate %lu bytes, but only %lu bytes remain on stack", size, dz_mem_stack_rem(mem)); + } else if (mem->stack.being_allocated != 0) { + die("Asked to allocate %lu bytes, but another allocaton of %lu bytes is already in progress", size, mem->stack.being_allocated); + } } + if (mem->stack.reservation_remaining < size) { + die("Asked to allocate %lu bytes, but only %lu bytes remain in reservation", size, mem->stack.reservation_remaining); + } void *ptr = (void *)mem->stack.top; - debug("Begin allocating %lu bytes (%p)", size, ptr); + debug("Begin allocating %lu bytes (%p) from %lu in reservation", size, ptr, mem->stack.reservation_remaining); /* Reserve this area above the stack */ mem->stack.being_allocated = size; return(ptr); @@ -636,8 +650,7 @@ void *dz_mem_stream_alloc_current( struct dz_mem_s *mem) { if(mem->stack.being_allocated == 0) { - debug("Nothing currently being allocated"); - return NULL; + die("Nothing currently being allocated"); } return (void *)mem->stack.top; } @@ -661,6 +674,7 @@ uint64_t dz_mem_stream_alloc_end( } mem->stack.top += size; mem->stack.being_allocated = 0; + mem->stack.reservation_remaining -= size; return 0; } @@ -730,15 +744,24 @@ unittest() { * vector update macros */ -#define _calc_max_slice_size(_sp, _ep) (2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s)) +#define _calc_max_slice_size(_sp, _ep) ({ \ + size_t max_slice_size = (2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s)); \ + debug("sp(%lu), ep(%lu), max_slice_size(%lu)", (size_t) (_sp), (size_t) (_ep), max_slice_size); \ + max_slice_size; \ +}) /* - * Determine how many bytes are needed to store forefront pointers, the cap, and a column vector. + * Determine how many bytes are needed to store forefront pointers, the column + * vector, and the cap at the end from _end_column, which may need to be a + * (larger) forefront if this ends up being the last column. 
+ * _sp: start position in the column + * _ep: end position in the column + * _nt: number of forefronts that need pointers */ #define _calc_next_size(_sp, _ep, _nt) ({ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \ size_t est_column_size = _calc_max_slice_size(_sp, _ep); \ - size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_cap_s); \ - /* debug("est_column_size(%lu), next_req(%lu)", est_column_size, next_req); */ \ + size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_forefront_s); \ + debug("sp(%lu), ep(%lu), nt(%lu), est_column_size(%lu), next_req(%lu)", (size_t) (_sp), (size_t) (_ep), (size_t) (_nt), est_column_size, next_req); \ next_req; \ }) /* @@ -750,10 +773,12 @@ unittest() { #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \ /* push forefront pointers */ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \ - struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem_stream_right_alloc(dz_mem(self), (int64_t)(_n_forefronts), sizeof(struct dz_forefront_s *), sizeof(__m128i))); \ + debug("Allocate %lu forefronts", ((size_t) (_n_forefronts))); \ + struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem_stream_right_alloc(dz_mem(self), (int64_t)(_n_forefronts), sizeof(struct dz_forefront_s *), sizeof(__m128i))); \ struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \ for(size_t i = 0; i < (_n_forefronts); i++) { dst[i] = src[i]; } \ /* push head-cap info */ \ + debug("Allocate head"); \ struct dz_head_s *_head = (struct dz_head_s *)(dz_mem_stream_alloc(dz_mem(self), sizeof(struct dz_head_s))); \ (_head)->r.spos = (_adj); /* save merging adjustment */ \ (_head)->r.epos = 0; /* head marked as zero */ \ @@ -774,7 +799,8 @@ unittest() { * _end_matrix() before the arena can be used again for something else. */ #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ - /* calculate sizes */ \ + debug("Begin column head"); \ + /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ /* reserve stream of memory to push onto */ \ dz_mem_stream_reserve(dz_mem(self), next_req); \ @@ -797,10 +823,11 @@ unittest() { * corresponds to the column vector. * * The returned pointer must be passed to _end_column() before another column - * can be begin, and the last column must be passed to _end_matrix() before the + * can be begun, and the last column must be passed to _end_matrix() before the * arena can be used again. */ #define _begin_column(_w, _rch, _rlen) ({ \ + debug("Begin column"); \ /* push cap info */ \ struct dz_cap_s *cap = dz_cap(dz_mem_stream_alloc_current(dz_mem(self))); \ dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_cap_s)); \ @@ -819,6 +846,7 @@ unittest() { debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ /* start array slice allocation */ \ struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size((_w).fr.spos, (_w).fr.epos))); \ + debug("Column begun"); \ /* return array pointer */ \ slice_data - (_w).fr.spos; \ }) @@ -835,9 +863,10 @@ unittest() { * _end_matrix(). 
*/ #define _end_column(_p, _spos, _epos) ({ \ + debug("End column"); \ /* finish the slice data allocation with what was actually used */ \ dz_mem_stream_alloc_end(dz_mem(self), ((_epos) - (_spos)) * sizeof(struct dz_swgv_s)); \ - /* allocate up to a forefront, as a range */ \ + /* immediately next in memory, allocate up to a forefront, as a range */ \ struct dz_range_s *r = dz_range(dz_mem_stream_alloc_begin(dz_mem(self), sizeof(struct dz_forefront_s))); \ debug("create range(%p), [%u, %u)", r, (_spos), (_epos)); \ r->spos = (_spos); r->epos = (_epos); \ @@ -1030,14 +1059,24 @@ struct dz_s *dz_init_intl( debug("failed to malloc memory"); return(NULL); } + + // We need to work out the contiguous block size we need for the dz_s, its matrix data, and the terminal cap. + // Since they need to be adjacent (TODO: do they?) we use the streaming allocator. + size_t self_and_matrix_size = sizeof(struct dz_s); + #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) + #ifdef DZ_QUAL_ADJ + self_and_matrix_size += 2 * DZ_QUAL_MATRIX_SIZE); + #endif + #else + self_and_matrix_size += DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i); + #endif + size_t cap_size = _calc_next_size(0, 0, 0); + dz_mem_stream_reserve(mem, self_and_matrix_size + cap_size); + // TODO: what is the point of the latter 16 bytes all being zero? #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) /* allocate with or without space afterwards for the quality adjusted matrices */ - #ifdef DZ_QUAL_ADJ - struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + 2 * DZ_QUAL_MATRIX_SIZE); - #else - struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s)); - #endif + struct dz_s *self = (struct dz_s *)dz_mem_stream_alloc(mem, self_and_matrix_size); /* constants */ __m128i const tmat = _mm_loadu_si128((__m128i const *)score_matrix); @@ -1053,7 +1092,7 @@ struct dz_s *dz_init_intl( } #endif #else - struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i)); + struct dz_s *self = (struct dz_s *)dz_mem_stream_alloc(mem, self_and_matrix_size); /* clear the first matrix field for protein */ _mm_store_si128((__m128i *)&self->matrix[0], _mm_setzero_si128()); @@ -1747,6 +1786,7 @@ unittest() { } \ /* paste the last vectors */ \ for(size_t i = 0; i < n_forefronts; i++) { \ + debug("Add column %lu", i); \ struct dz_swgv_s const *tdp = dz_cswgv(forefronts[i]) - forefronts[i]->r.epos; \ __m128i const adjv = _mm_set1_epi16(init_s == 0 ? 
0 : adj[i]); \ for(uint64_t p = forefronts[i]->fr.spos; p < forefronts[i]->fr.epos; p++) { \ From d15473faf6bb45bf2ab3c3708a6f0757caed1954 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Oct 2023 11:00:01 -0700 Subject: [PATCH 37/43] Remove extra debugging --- dozeu.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/dozeu.h b/dozeu.h index b8a5f3f..1632c64 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1,6 +1,6 @@ // $(CC) -O3 -march=native -DMAIN -o dozeu dozeu.c //#ifdef DZ_QUAL_ADJ -#define DEBUG +//#define DEBUG //#endif //#define DZ_PRINT_VECTOR /** @@ -746,7 +746,7 @@ unittest() { #define _calc_max_slice_size(_sp, _ep) ({ \ size_t max_slice_size = (2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s)); \ - debug("sp(%lu), ep(%lu), max_slice_size(%lu)", (size_t) (_sp), (size_t) (_ep), max_slice_size); \ + /* debug("sp(%lu), ep(%lu), max_slice_size(%lu)", (size_t) (_sp), (size_t) (_ep), max_slice_size); */ \ max_slice_size; \ }) /* @@ -761,7 +761,7 @@ unittest() { size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \ size_t est_column_size = _calc_max_slice_size(_sp, _ep); \ size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_forefront_s); \ - debug("sp(%lu), ep(%lu), nt(%lu), est_column_size(%lu), next_req(%lu)", (size_t) (_sp), (size_t) (_ep), (size_t) (_nt), est_column_size, next_req); \ + /* debug("sp(%lu), ep(%lu), nt(%lu), est_column_size(%lu), next_req(%lu)", (size_t) (_sp), (size_t) (_ep), (size_t) (_nt), est_column_size, next_req); */ \ next_req; \ }) /* @@ -773,12 +773,10 @@ unittest() { #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \ /* push forefront pointers */ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \ - debug("Allocate %lu forefronts", ((size_t) (_n_forefronts))); \ struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem_stream_right_alloc(dz_mem(self), (int64_t)(_n_forefronts), sizeof(struct dz_forefront_s *), sizeof(__m128i))); \ struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \ for(size_t i = 0; i < (_n_forefronts); i++) { dst[i] = src[i]; } \ /* push head-cap info */ \ - debug("Allocate head"); \ struct dz_head_s *_head = (struct dz_head_s *)(dz_mem_stream_alloc(dz_mem(self), sizeof(struct dz_head_s))); \ (_head)->r.spos = (_adj); /* save merging adjustment */ \ (_head)->r.epos = 0; /* head marked as zero */ \ @@ -799,7 +797,6 @@ unittest() { * _end_matrix() before the arena can be used again for something else. */ #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ - debug("Begin column head"); \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ /* reserve stream of memory to push onto */ \ @@ -827,7 +824,6 @@ unittest() { * arena can be used again. 
*/ #define _begin_column(_w, _rch, _rlen) ({ \ - debug("Begin column"); \ /* push cap info */ \ struct dz_cap_s *cap = dz_cap(dz_mem_stream_alloc_current(dz_mem(self))); \ dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_cap_s)); \ @@ -846,7 +842,6 @@ unittest() { debug("create column(%p), [%u, %u), span(%u), rrem(%ld), max(%d), inc(%d)", cap, (_w).fr.spos, (_w).fr.epos, (_w).r.epos - (_w).r.spos, (_rlen), (_w).max, (_w).inc); \ /* start array slice allocation */ \ struct dz_swgv_s *slice_data = dz_swgv(dz_mem_stream_alloc_begin(dz_mem(self), _calc_max_slice_size((_w).fr.spos, (_w).fr.epos))); \ - debug("Column begun"); \ /* return array pointer */ \ slice_data - (_w).fr.spos; \ }) @@ -863,7 +858,6 @@ unittest() { * _end_matrix(). */ #define _end_column(_p, _spos, _epos) ({ \ - debug("End column"); \ /* finish the slice data allocation with what was actually used */ \ dz_mem_stream_alloc_end(dz_mem(self), ((_epos) - (_spos)) * sizeof(struct dz_swgv_s)); \ /* immediately next in memory, allocate up to a forefront, as a range */ \ From fc5703756d61855d1081ab00b397974b68295fa1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Oct 2023 11:10:20 -0700 Subject: [PATCH 38/43] Remove wayward parenthese --- dozeu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dozeu.h b/dozeu.h index 1632c64..fb0735c 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1059,7 +1059,7 @@ struct dz_s *dz_init_intl( size_t self_and_matrix_size = sizeof(struct dz_s); #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) #ifdef DZ_QUAL_ADJ - self_and_matrix_size += 2 * DZ_QUAL_MATRIX_SIZE); + self_and_matrix_size += 2 * DZ_QUAL_MATRIX_SIZE; #endif #else self_and_matrix_size += DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i); From 1a70aec5e25fd5bcf8a8cce1e886f31d1dcc488b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Oct 2023 12:06:48 -0700 Subject: [PATCH 39/43] Account for the dz_head_s when guessing column sizes --- dozeu.h | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/dozeu.h b/dozeu.h index fb0735c..5d27362 100644 --- a/dozeu.h +++ b/dozeu.h @@ -143,14 +143,16 @@ unittest() { debug("hello"); } # define trap() ; #endif -#define die(...) { \ - die_impl(__VA_ARGS__, ""); \ +#define dz_die(...) { \ + dz_die_impl(__VA_ARGS__, ""); \ } -#define die_impl(fmt, ...) { \ +#define dz_die_impl(fmt, ...) { \ fprintf(stderr, "[%s: %s(%d)] " fmt "%s\n", __FILE__, __func__, __LINE__, __VA_ARGS__); \ exit(1); \ } +#define dz_error dz_die + #if defined(DZ_NUCL_ASCII) # define DZ_UNITTEST_INDEX 0 #elif defined(DZ_NUCL_2BIT) @@ -575,7 +577,8 @@ void *dz_mem_malloc( if(dz_mem_stack_rem(mem) < size) { if(dz_mem_add_stack(mem, size)) { /* Report a failed allocation. */ - die("Allocation of %lu bytes failed", size); + dz_die("Allocation of %lu bytes failed", size); + return NULL; } } void *ptr = (void *)mem->stack.top; @@ -594,11 +597,15 @@ uint64_t dz_mem_stream_reserve( size_t size) { debug("Reserve %lu bytes for stream", size); + mem->stack.reservation_remaining = size; /* We cheat and just use a whole block as a run. 
*/ if(dz_likely(dz_mem_stack_rem(mem) < size)) { - return dz_mem_add_stack(mem, size); + if (dz_mem_add_stack(mem, size)) { + dz_error("Could not reserve %lu bytes", size); + return 1; + } + return 0; } - mem->stack.reservation_remaining = size; return 0; } @@ -625,13 +632,15 @@ void *dz_mem_stream_alloc_begin( { if(dz_mem_stack_rem(mem) < size || mem->stack.being_allocated != 0) { if (dz_mem_stack_rem(mem) < size) { - die("Asked to allocate %lu bytes, but only %lu bytes remain on stack", size, dz_mem_stack_rem(mem)); + dz_error("Asked to allocate %lu bytes, but only %lu bytes remain on stack", size, dz_mem_stack_rem(mem)); } else if (mem->stack.being_allocated != 0) { - die("Asked to allocate %lu bytes, but another allocaton of %lu bytes is already in progress", size, mem->stack.being_allocated); + dz_error("Asked to allocate %lu bytes, but another allocaton of %lu bytes is already in progress", size, mem->stack.being_allocated); } + return NULL; } if (mem->stack.reservation_remaining < size) { - die("Asked to allocate %lu bytes, but only %lu bytes remain in reservation", size, mem->stack.reservation_remaining); + dz_error("Asked to allocate %lu bytes, but only %lu bytes remain in reservation", size, mem->stack.reservation_remaining); + return NULL; } void *ptr = (void *)mem->stack.top; debug("Begin allocating %lu bytes (%p) from %lu in reservation", size, ptr, mem->stack.reservation_remaining); @@ -650,7 +659,8 @@ void *dz_mem_stream_alloc_current( struct dz_mem_s *mem) { if(mem->stack.being_allocated == 0) { - die("Nothing currently being allocated"); + dz_error("Nothing currently being allocated"); + return NULL; } return (void *)mem->stack.top; } @@ -719,7 +729,7 @@ void *dz_mem_stream_right_alloc( return ptr; } void* right_aligned_ptr = (void*)((uint8_t*)ptr + (alloc_size - data_size)); - debug("Allocate %lu items of %lu bytes each padded to %lu: block %p and right-aligned array %p", items, size, alignment, ptr, right_aligned_ptr); + debug("Allocated %lu items of %lu bytes each padded to %lu: block %p and right-aligned array %p", items, size, alignment, ptr, right_aligned_ptr); return right_aligned_ptr; } @@ -746,13 +756,13 @@ unittest() { #define _calc_max_slice_size(_sp, _ep) ({ \ size_t max_slice_size = (2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s)); \ - /* debug("sp(%lu), ep(%lu), max_slice_size(%lu)", (size_t) (_sp), (size_t) (_ep), max_slice_size); */ \ + debug("sp(%lu), ep(%lu), max_slice_size(%lu)", (size_t) (_sp), (size_t) (_ep), max_slice_size); \ max_slice_size; \ }) /* - * Determine how many bytes are needed to store forefront pointers, the column - * vector, and the cap at the end from _end_column, which may need to be a - * (larger) forefront if this ends up being the last column. + * Determine how many bytes are needed to store forefront pointers, the head + * cap, the column vector, and the cap at the end from _end_column, which may + * need to be a (larger) forefront if this ends up being the last column. 
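With this bookkeeping in place, every completed allocation is charged against reservation_remaining, so an under-estimated column now aborts loudly instead of silently spilling past the reserved run. A small worked sequence (sizes illustrative, arena pointer mem assumed, not taken from the patch):

    /* Sketch only: how reservation_remaining is consumed after this patch. */
    dz_mem_stream_reserve(mem, 160);        /* reservation_remaining = 160                  */
    dz_mem_stream_alloc(mem, 64);           /* 160 -> 96                                    */
    dz_mem_stream_alloc_begin(mem, 96);     /* worst-case slice size                        */
    dz_mem_stream_alloc_end(mem, 80);       /* only 80 bytes actually used: 96 -> 16        */
    dz_mem_stream_alloc(mem, 32);           /* fails: only 16 bytes remain in the reservation */
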
* _sp: start position in the column * _ep: end position in the column * _nt: number of forefronts that need pointers @@ -760,8 +770,10 @@ unittest() { #define _calc_next_size(_sp, _ep, _nt) ({ \ size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \ size_t est_column_size = _calc_max_slice_size(_sp, _ep); \ - size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_forefront_s); \ - /* debug("sp(%lu), ep(%lu), nt(%lu), est_column_size(%lu), next_req(%lu)", (size_t) (_sp), (size_t) (_ep), (size_t) (_nt), est_column_size, next_req); */ \ + size_t head_cap_size = sizeof(struct dz_head_s); \ + size_t end_cap_size = sizeof(struct dz_forefront_s); \ + size_t next_req = forefront_arr_size + head_cap_size + est_column_size + end_cap_size; \ + debug("sp(%lu), ep(%lu), nt(%lu), forefront_arr_size(%lu), head_cap_size(%lu), est_column_size(%lu), end_cap_size(%lu), next_req(%lu)", (size_t) (_sp), (size_t) (_ep), (size_t) (_nt), forefront_arr_size, head_cap_size, est_column_size, end_cap_size, next_req); \ next_req; \ }) /* @@ -797,6 +809,7 @@ unittest() { * _end_matrix() before the arena can be used again for something else. */ #define _begin_column_head(_spos, _epos, _adj, _forefronts, _n_forefronts) ({ \ + debug("Beginning column head"); \ /* calculate sizes */ \ size_t next_req = _calc_next_size(_spos, _epos, _n_forefronts); \ /* reserve stream of memory to push onto */ \ @@ -824,6 +837,7 @@ unittest() { * arena can be used again. */ #define _begin_column(_w, _rch, _rlen) ({ \ + debug("Beginning column"); \ /* push cap info */ \ struct dz_cap_s *cap = dz_cap(dz_mem_stream_alloc_current(dz_mem(self))); \ dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_cap_s)); \ @@ -858,6 +872,7 @@ unittest() { * _end_matrix(). */ #define _end_column(_p, _spos, _epos) ({ \ + debug("Ending column"); \ /* finish the slice data allocation with what was actually used */ \ dz_mem_stream_alloc_end(dz_mem(self), ((_epos) - (_spos)) * sizeof(struct dz_swgv_s)); \ /* immediately next in memory, allocate up to a forefront, as a range */ \ @@ -876,6 +891,7 @@ unittest() { * relationships will need to be maintained. */ #define _end_matrix(_p, _wp, _rrem) ({ \ + debug("Ending matrix"); \ /* create forefront object */ \ struct dz_forefront_s *forefront = dz_forefront(dz_mem_stream_alloc_current(dz_mem(self))); \ dz_mem_stream_alloc_end(dz_mem(self), sizeof(struct dz_forefront_s)); \ From 8d27629cdb63efc0aa4bff1b84f6445993f6a48c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 22 Nov 2023 10:04:05 -0800 Subject: [PATCH 40/43] Make dz_flush actually consistent with dz_init --- dozeu.h | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/dozeu.h b/dozeu.h index 5d27362..2f4472c 100644 --- a/dozeu.h +++ b/dozeu.h @@ -322,7 +322,7 @@ struct dz_mem_block_s { struct dz_mem_block_s *next; size_t size; }; * * Exists at the beginning of the first block, just after its block header. */ -struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t being_allocated; uint64_t reservation_remaining; uint64_t _pad[1]; }; +struct dz_stack_s { struct dz_mem_block_s *curr; uint8_t *top, *end; uint64_t being_allocated; uint64_t reservation_remaining; uint8_t *dz_flush_top; }; /* * Represents a Dozeu memory allocation arena. * @@ -441,7 +441,7 @@ unittest() { /* * "Flush" a memory arena. 
* - * Retain all blocks allocated from the system allcoator, but internally amrk + * Retain all blocks allocated from the system allcoator, but internally mark * all the space as free. */ static __dz_force_inline @@ -1148,6 +1148,11 @@ struct dz_s *dz_init_intl( // precompute the end-cap, which will always live next to the dz_s in this mem block struct dz_cap_s *cap = _init_cap(0, 0xff, NULL, 0); + + // Save the stack top so we can tell if we flushed correctly. + // This isn't quite sufficient to just reconstruct by restoring stored state; we also need curr to go to the right block. + // TODO: Just store curr and recreate top/end from it and the known allocation size? + dz_mem(self).stack.dz_flush_top = dz_mem(self).stack.top; return(self); } @@ -1280,14 +1285,40 @@ void dz_flush( // point the mem back at the initial block dz_mem_flush(dz_mem(self)); - // move stack pointers past the dz_s, maybe the qual matrices, and the dz_cap_s that follows - void *bottom = (void *)(self + 1); - #ifdef DZ_QUAL_ADJ - bottom = (void *)(((int8_t *)bottom) + 2 * DZ_QUAL_MATRIX_SIZE); + // Re-make the initial allocation, which will re-use any additional memory block it needed. + // TODO: Share code with dz_init_intl/dz_qual_adj_init_intl which must match this to be correct. + size_t self_and_matrix_size = sizeof(struct dz_s); + #if defined(DZ_NUCL_ASCII) || defined(DZ_NUCL_2BIT) + #ifdef DZ_QUAL_ADJ + self_and_matrix_size += 2 * DZ_QUAL_MATRIX_SIZE; + #endif + #else + self_and_matrix_size += DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i); #endif - bottom = (void *)(((struct dz_cap_s *)bottom) + 1); + size_t cap_size = _calc_next_size(0, 0, 0); + dz_mem_stream_reserve(mem, self_and_matrix_size + cap_size); + + + // Re-allocate self and the matrices + struct dz_s *new_self = (struct dz_s *)dz_mem_stream_alloc(dz_mem(self), self_and_matrix_size); + if (new_self != self) { + dz_error("Lost ourself when resetting memory arena! We should be at %p but actually re-allocated at %p!", self, new_self); + } + // Re-allocate the head cap (with no forefronts and no writing). + // TODO: Must match the behavior of _init_cap + if (dz_mem_stream_alloc(dz_mem(self), sizeof(struct dz_head_s)) == NULL) { + dz_error("Could not re-allocate cap after self and matrix!"); + } + + // Now all the stack fields should be right, in particular stack.curr and + // stack.end, which are now in the block we had to go to to fit all this, + // if DZ_MEM_INIT_SIZE was too small. - dz_mem(self)->stack.top = (uint8_t *)bottom; + if(dz_mem(self).stack.top != dz_mem(self).stack.dz_flush_top) { + // We didn't get the right stack top at the end of all that. + dz_error("Could not recreate post-init state when flushing! Stack top should be %p but is actually %p!", dz_mem(self).stack.dz_flush_top, dz_mem(self).stack.top); + } + return; } From b8edacf39baa367b89f1afe662963204d2eab3f9 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 22 Nov 2023 10:39:10 -0800 Subject: [PATCH 41/43] Make example demo using flush --- dozeu.h | 29 +++++++++++-- example.c | 123 +++++++++++++++++++++++++++++------------------------- 2 files changed, 90 insertions(+), 62 deletions(-) diff --git a/dozeu.h b/dozeu.h index 2f4472c..151d437 100644 --- a/dozeu.h +++ b/dozeu.h @@ -555,6 +555,21 @@ uint64_t dz_mem_add_stack( return(0); } +/* + * Dump stack information to standard error. 
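For reference, dz_mem_log_stack() prints one header line and then one tab-separated pointer/size pair per block on standard error; a run with two blocks would look roughly like this (addresses and sizes are hypothetical):

    Dozeu stack blocks:
    	0x55d1c0a002a0	65536
    	0x55d1c0a102b0	131072
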
+ */ +static __dz_force_inline +void dz_mem_log_stack( + struct dz_mem_s *mem) +{ + fprintf(stderr, "Dozeu stack blocks:\n"); + struct dz_mem_block_s *blk = &(mem->blk); + while (blk != NULL) { + fprintf(stderr, "\t%p\t%lu\n", blk, blk->size); + blk = blk->next; + } +} + /* * Allocate memory from an arena. * @@ -1152,7 +1167,7 @@ struct dz_s *dz_init_intl( // Save the stack top so we can tell if we flushed correctly. // This isn't quite sufficient to just reconstruct by restoring stored state; we also need curr to go to the right block. // TODO: Just store curr and recreate top/end from it and the known allocation size? - dz_mem(self).stack.dz_flush_top = dz_mem(self).stack.top; + dz_mem(self)->stack.dz_flush_top = dz_mem(self)->stack.top; return(self); } @@ -1282,6 +1297,9 @@ void dz_flush( #endif struct dz_s *self) { + fprintf(stderr, "Stack before flush:\n"); + dz_mem_log_stack(dz_mem(self)); + // point the mem back at the initial block dz_mem_flush(dz_mem(self)); @@ -1296,7 +1314,7 @@ void dz_flush( self_and_matrix_size += DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i); #endif size_t cap_size = _calc_next_size(0, 0, 0); - dz_mem_stream_reserve(mem, self_and_matrix_size + cap_size); + dz_mem_stream_reserve(dz_mem(self), self_and_matrix_size + cap_size); // Re-allocate self and the matrices @@ -1314,11 +1332,14 @@ void dz_flush( // stack.end, which are now in the block we had to go to to fit all this, // if DZ_MEM_INIT_SIZE was too small. - if(dz_mem(self).stack.top != dz_mem(self).stack.dz_flush_top) { + if(dz_mem(self)->stack.top != dz_mem(self)->stack.dz_flush_top) { // We didn't get the right stack top at the end of all that. - dz_error("Could not recreate post-init state when flushing! Stack top should be %p but is actually %p!", dz_mem(self).stack.dz_flush_top, dz_mem(self).stack.top); + dz_error("Could not recreate post-init state when flushing! 
Stack top should be %p but is actually %p!", dz_mem(self)->stack.dz_flush_top, dz_mem(self)->stack.top); } + fprintf(stderr, "Stack after flush:\n"); + dz_mem_log_stack(dz_mem(self)); + return; } diff --git a/example.c b/example.c index 800d631..9ff8006 100644 --- a/example.c +++ b/example.c @@ -47,65 +47,72 @@ int main(int argc, char *argv[]) score_matrix, GI, GE ); - /* Initialize root of alignment */ - struct dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); - - /* pack query */ - char const *query = "ACACTTCTAGACTTTACCACTA"; - struct dz_query_s const *q = dz_pack_query_forward( - dz, - query, /* char const *seq: query sequence */ - full_length_bonus, - strlen(query) /* length */ - ); - - /* init node array */ - struct dz_forefront_s const *ff[10] = { 0 }; - - /* fill root: node 0 */ - ff[0] = dz_extend( - dz, q, - &aln_init.root, 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; */ - "ACAC", strlen("ACAC"), 0, /* reference-side sequence, its length, and node id */ - aln_init.xt /* x-drop threshold */ - ); - - /* branching paths: 1, 2 -> 3 */ - ff[1] = dz_extend(dz, q, &ff[0], 1, "TTGT", strlen("TTGT"), 1, aln_init.xt); - ff[2] = dz_extend(dz, q, &ff[0], 1, "ATCC", strlen("ATCC"), 2, aln_init.xt); - ff[3] = dz_extend(dz, q, &ff[1], 2, "AGAC", strlen("AGAC"), 3, aln_init.xt); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ - - /* insertion: -, 4 -> 5 */ - ff[4] = dz_extend(dz, q, &ff[3], 1, "T", strlen("T"), 4, aln_init.xt); - ff[5] = dz_extend(dz, q, &ff[3], 2, "TTCTA", strlen("TTCTA"), 5, aln_init.xt); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ - - /* SNVs: 6, 7, 8 -> 9 */ - ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6, aln_init.xt); - ff[7] = dz_extend(dz, q, &ff[5], 1, "C", strlen("C"), 7, aln_init.xt); - ff[8] = dz_extend(dz, q, &ff[5], 1, "G", strlen("G"), 8, aln_init.xt); - ff[9] = dz_extend(dz, q, &ff[6], 3, "CACGG", strlen("CACGG"), 9, aln_init.xt); - - /* detect max */ - struct dz_forefront_s const *max = NULL; - for(size_t i = 0; i < 10; i++) { - if(max == NULL || ff[i]->max > max->max) { max = ff[i]; } - } - - /* traceback */ - struct dz_alignment_s const *aln = dz_trace( - dz, - max - ); - printf("ref_length(%u), query_length(%u), score(%d), path(%s)\n", aln->ref_length, aln->query_length, aln->score, aln->path); - for(size_t i = 0; i < aln->span_length; i++) { - struct dz_path_span_s const *s = &aln->span[i]; - printf("node_id(%u), subpath_length(%u), subpath(%.*s)\n", - s->id, - s[1].offset - s[0].offset, - s[1].offset - s[0].offset, &aln->path[s->offset] - ); - } + for (int repeat = 0; repeat < 3; ++repeat) { + + /* Initialize root of alignment */ + struct dz_alignment_init_s aln_init = dz_align_init(dz, max_gap_length); + + /* pack query */ + char const *query = "ACACTTCTAGACTTTACCACTA"; + struct dz_query_s const *q = dz_pack_query_forward( + dz, + query, /* char const *seq: query sequence */ + full_length_bonus, + strlen(query) /* length */ + ); + + /* init node array */ + struct dz_forefront_s const *ff[10] = { 0 }; + + /* fill root: node 0 */ + ff[0] = dz_extend( + dz, q, + &aln_init.root, 1, /* struct dz_forefront_s const *ff[], uint64_t n_ffs; incoming forefront and degree; */ + "ACAC", strlen("ACAC"), 0, /* reference-side sequence, its length, and node id */ + aln_init.xt /* x-drop threshold */ + ); + + /* branching paths: 1, 2 -> 3 */ + ff[1] = dz_extend(dz, q, &ff[0], 1, "TTGT", strlen("TTGT"), 1, aln_init.xt); + ff[2] = dz_extend(dz, q, &ff[0], 1, "ATCC", 
strlen("ATCC"), 2, aln_init.xt); + ff[3] = dz_extend(dz, q, &ff[1], 2, "AGAC", strlen("AGAC"), 3, aln_init.xt); /* "&ff[1], 2" indicates ff[1] and ff[2] are incoming nodes */ + + /* insertion: -, 4 -> 5 */ + ff[4] = dz_extend(dz, q, &ff[3], 1, "T", strlen("T"), 4, aln_init.xt); + ff[5] = dz_extend(dz, q, &ff[3], 2, "TTCTA", strlen("TTCTA"), 5, aln_init.xt); /* "&ff[3], 2" indicates ff[3] and ff[4] are incoming nodes */ + + /* SNVs: 6, 7, 8 -> 9 */ + ff[6] = dz_extend(dz, q, &ff[5], 1, "A", strlen("A"), 6, aln_init.xt); + ff[7] = dz_extend(dz, q, &ff[5], 1, "C", strlen("C"), 7, aln_init.xt); + ff[8] = dz_extend(dz, q, &ff[5], 1, "G", strlen("G"), 8, aln_init.xt); + ff[9] = dz_extend(dz, q, &ff[6], 3, "CACGG", strlen("CACGG"), 9, aln_init.xt); + + /* detect max */ + struct dz_forefront_s const *max = NULL; + for(size_t i = 0; i < 10; i++) { + if(max == NULL || ff[i]->max > max->max) { max = ff[i]; } + } + + /* traceback */ + struct dz_alignment_s const *aln = dz_trace( + dz, + max + ); + + printf("ref_length(%u), query_length(%u), score(%d), path(%s)\n", aln->ref_length, aln->query_length, aln->score, aln->path); + for(size_t i = 0; i < aln->span_length; i++) { + struct dz_path_span_s const *s = &aln->span[i]; + printf("node_id(%u), subpath_length(%u), subpath(%.*s)\n", + s->id, + s[1].offset - s[0].offset, + s[1].offset - s[0].offset, &aln->path[s->offset] + ); + } + + /* clean up for re-use */ + dz_flush(dz); + } /* clean up */ dz_destroy(dz); From 3625c9961ec12d4941db4969cd0df9c5f160354f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 22 Nov 2023 12:20:22 -0800 Subject: [PATCH 42/43] Add a way to limit retained memory in a dz_s without remaking it --- dozeu.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/dozeu.h b/dozeu.h index 151d437..76a17b0 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1278,6 +1278,9 @@ struct dz_alignment_init_s dz_align_init( #endif // DZ_INCLUDE_ONCE #ifndef DZ_INCLUDE_ONCE +/* + * Destroy a dz_s object when you are done with it. + */ static __dz_vectorize void dz_destroy( struct dz_s *self) @@ -1287,6 +1290,40 @@ void dz_destroy( dz_mem_destroy(dz_mem(self)); return; } + +/* + * Limit the bytes of memory retained by a dz_s object to under the given limit. + * Limit must be greater than the memory required for the initialized, empty state. + * Can only be called after dz_flush/dz_qual_adj_flush. + */ +static __dz_vectorize +void dz_trim( + struct dz_s *self, + size_t max_bytes) +{ + fprintf(stderr, "Stack before trim:\n"); + dz_mem_log_stack(dz_mem(self)); + + struct dz_mem_block_s *keep_block = &(dz_mem(self)->blk); + size_t bytes_found = 0; + while (keep_block != NULL) { + bytes_found += keep_block->size; + if (keep_block->next != NULL && bytes_found + keep_block->next->size > max_bytes) { + /* Cut the free list past here. 
*/ + struct dz_mem_block_s *drop_block = keep_block->next; + while (drop_block != NULL) { + struct dz_mem_block_s *next = drop_block->next; + dz_free(drop_block); + drop_block = next; + } + keep_block->next = NULL; + } + keep_block = keep_block->next; + } + + fprintf(stderr, "Stack after trim:\n"); + dz_mem_log_stack(dz_mem(self)); +} #endif // DZ_INCLUDE_ONCE static __dz_vectorize From c7dce486aadc1f085811939d035ced2562f6c005 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 22 Nov 2023 12:21:40 -0800 Subject: [PATCH 43/43] Stop logging stack --- dozeu.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/dozeu.h b/dozeu.h index 76a17b0..5ba9a28 100644 --- a/dozeu.h +++ b/dozeu.h @@ -1301,9 +1301,6 @@ void dz_trim( struct dz_s *self, size_t max_bytes) { - fprintf(stderr, "Stack before trim:\n"); - dz_mem_log_stack(dz_mem(self)); - struct dz_mem_block_s *keep_block = &(dz_mem(self)->blk); size_t bytes_found = 0; while (keep_block != NULL) { @@ -1321,8 +1318,7 @@ void dz_trim( keep_block = keep_block->next; } - fprintf(stderr, "Stack after trim:\n"); - dz_mem_log_stack(dz_mem(self)); + return; } #endif // DZ_INCLUDE_ONCE @@ -1334,9 +1330,6 @@ void dz_flush( #endif struct dz_s *self) { - fprintf(stderr, "Stack before flush:\n"); - dz_mem_log_stack(dz_mem(self)); - // point the mem back at the initial block dz_mem_flush(dz_mem(self)); @@ -1374,9 +1367,6 @@ void dz_flush( dz_error("Could not recreate post-init state when flushing! Stack top should be %p but is actually %p!", dz_mem(self)->stack.dz_flush_top, dz_mem(self)->stack.top); } - fprintf(stderr, "Stack after flush:\n"); - dz_mem_log_stack(dz_mem(self)); - return; }