Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment: Fast path for exact matches #461

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,43 @@ void align_or_map_single(
auto query_randstrobes = randstrobes_query(record.seq, index_parameters);
statistics.tot_construct_strobemers += strobe_timer.duration();

// Try an optimization if the first randstrobe is unique
if (query_randstrobes.size() > 0) {
auto qr = query_randstrobes.front();

size_t pos = index.find_full(qr.hash);
if (pos != index.end() && index.is_unique(pos)) {
size_t ref_pos = index.get_strobe1_position(pos);
if (ref_pos >= qr.start) {
size_t ref_start = ref_pos - qr.start;
size_t ref_id = index.reference_index(pos);
std::string_view refseq = references.sequences[ref_id];

if (refseq.substr(ref_start, record.seq.length()) == record.seq) {
bool is_rc = false; // TODO
std::vector<Nam> nams1{Nam::make_exact_match(ref_id, ref_start, record.seq.length(), is_rc)};
switch (map_param.output_format) {
case OutputFormat::Abundance:
abundances[ref_id] += float(record.seq.length());
break;
case OutputFormat::PAF:
output_hits_paf(outstring, nams1, record.name, references,
record.seq.length());
break;
case OutputFormat::SAM:
align_single(
aligner, sam, nams1, record, index_parameters.syncmer.k,
references, details, map_param.dropoff_threshold, map_param.max_tries,
map_param.max_secondary, random_engine
);
break;
}
return;
}
}
}
}

// Find NAMs
Timer nam_timer;
auto [nonrepetitive_fraction, n_hits, nams] = find_nams(query_randstrobes, index, map_param.use_mcs);
Expand Down
4 changes: 4 additions & 0 deletions src/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ struct StrobemerIndex {
return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift);
}

bool is_unique(bucket_index_t position) const {
return get_hash(position) != get_hash(position + 1);
}

unsigned int get_strobe1_position(bucket_index_t position) const {
return randstrobes[position].position;
}
Expand Down
16 changes: 16 additions & 0 deletions src/nam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@ struct Nam {
int projected_ref_start() const {
return std::max(0, ref_start - query_start);
}

static Nam make_exact_match(int ref_id, int ref_start, int length, bool is_rc) {
return Nam{
0, // nam_id
0, // query_start
length, // query_end
0, // query_prev_match_startpos
ref_start,
ref_start + length, // ref_end
0, // ref_prev_match_startpos
0, // n_matches
ref_id,
0, // score
is_rc
};
}
};

std::tuple<float, int, std::vector<Nam>> find_nams(
Expand Down