From fa8ee9e2a69f88d36c3902586aed8b18c9edd8a8 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 18 Nov 2021 20:28:54 -0500 Subject: [PATCH 1/8] start deletion filterer --- Makefile | 18 +- filter-paf-deletions.cpp | 412 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 427 insertions(+), 3 deletions(-) create mode 100644 filter-paf-deletions.cpp diff --git a/Makefile b/Makefile index 09bf3eb..93f17ad 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ rootPath = ./ include ./include.mk -all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip +all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed # from source and configured with "--enable-static --disable-shared", then have its @@ -38,12 +38,18 @@ ifeq ($(shell ldd halUnclip | grep "not a dynamic" | wc -l), $(shell ls halUncli else $(error ldd found dnymaic linked dependency in halUnclip) endif +ifeq ($(shell ldd filter-paf-deletions | grep "not a dynamic" | wc -l), $(shell ls filter-paf-deletions | wc -l)) + $(info ldd verified that filter-paf-deletions static) +else + $(error ldd found dnymaic linked dependency in filter-paf-deletions) +endif + cleanFast : - rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o clean : - rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o cd deps/sonLib && make clean cd deps/pinchesAndCacti && make clean cd deps/hal && make clean @@ -91,6 +97,12 @@ halUnclip.o : halUnclip.cpp subpaths.h ${basicLibsDependencies} halUnclip : halUnclip.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread halUnclip.o ${basicLibs} -o halUnclip +filter-paf-deletions.o : filter-paf-deletions.cpp subpaths.h ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -I . filter-paf-deletions.cpp -c + +filter-paf-deletions : filter-paf-deletions.o ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -fopenmp -pthread filter-paf-deletions.o ${basicLibs} -o filter-paf-deletions + test : make cd tests && prove -v t diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp new file mode 100644 index 0000000..48fa077 --- /dev/null +++ b/filter-paf-deletions.cpp @@ -0,0 +1,412 @@ +// Filter big deletions from PAF. These can sometimes arise from minigraph split alignments. They are rare, but can really +// mess up topology of graph if even one gets into cactus. + +// 1) Estimate anchors along reference path for every node in input graph +// 2) Scan every query in PAF in order, and look at target blocks +// 3) Use table from 1) in order to estimate the distances between target blocks +// 4) If two conesecutive target blocks span too big of a distance, delete the smaller block + +//#define debug + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include "bdsg/odgi.hpp" + +using namespace std; +using namespace handlegraph; +using namespace bdsg; + +struct Anchor { + path_handle_t path_handle; + int64_t max_offset; + int64_t min_offset; +}; + +static unique_ptr load_graph(istream& graph_stream); +static pair, unordered_map> load_trans(const string& trans_path); +static vector &split_delims(const string &s, const string& delims, vector &elems); +static unordered_map index_graph(const PathHandleGraph* graph, + const string& ref_prefix); +static void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, + const unordered_map& mg_to_vg, int64_t max_deletion); + +void help(char** argv) { + cerr << "usage: " << argv[0] << " [options] \n" << endl + << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl + << " : minigraph as obtained from vg convert -g graph.gfa" << endl + << " : node translation from vg convert -g -T" << endl + << " : paf alignment from cactus-graphmap" << endl + << " : prefix of reference path(s) in graph" + << endl + << "options: " << endl + << " -p, --progress Print progress" << endl + << " -t, --threads N number of threads to use (used only for computing snarls) [default: all available]" << endl + << endl; +} + +int main(int argc, char** argv) { + + bool progress = false; + int c; + optind = 1; + while (true) { + + static const struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"progress", no_argument, 0, 'p'}, + {"threads", required_argument, 0, 't'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long (argc, argv, "hpt:", + long_options, &option_index); + + // Detect the end of the options. + if (c == -1) + break; + + switch (c) + { + case 'p': + progress = true; + break; + case 't': + { + int num_threads = stoi(optarg); + if (num_threads <= 0) { + cerr << "[filter-paf-deletions] error: Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': + case '?': + /* getopt_long already printed an error message. */ + help(argv); + exit(1); + break; + default: + abort (); + } + } + + if (argc <= 5) { + cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; + help(argv); + return 1; + } + + // Parse the positional argument + if (optind >= argc) { + cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; + help(argv); + return 1; + } + + if (optind != argc - 5) { + cerr << "[filter-paf-deletions] error: too many arguments\n" << endl; + help(argv); + return 1; + } + + string graph_path = argv[optind++]; + string trans_path = argv[optind++]; + string paf_path = argv[optind++]; + string ref_prefix = argv[optind++]; + int64_t max_deletion = stol(argv[optind++]); + + // load the graph + ifstream graph_stream(graph_path); + if (!graph_stream) { + cerr << "[filter-paf-deletions] error: Unable to open input graph " << graph_path << endl; + return 1; + } + unique_ptr graph = load_graph(graph_stream); + graph_stream.close(); + if (progress) { + cerr << "[filter-paf-deletions]: Loaded graph" << endl; + } + + // load the minigraph <-> vg id translation table (because our PAF is expressed in terms of the minigraph + // ids but we lose them when converting to vg.) + unordered_map mg_to_vg; + unordered_map vg_to_mg; + std::tie(mg_to_vg, vg_to_mg) = load_trans(trans_path); + + // open the paf + ifstream paf_file(paf_path); + if (!paf_file) { + cerr << "[filter-paf-deletions] error: Unable to open PAF" << endl; + return 1; + } + + // index the minigraph + // this maps each node in the graph to a (maximal) reference interval + unordered_map ref_index = index_graph(graph.get(), ref_prefix); + +#ifdef debug + for (auto fam : ref_index) { + cerr << fam.first << " -> " << graph->get_path_name(fam.second.path_handle) << " " << fam.second.min_offset << " " << fam.second.max_offset << endl; + } +#endif + + // we have everything needed to filter the paf + filter_paf(graph.get(), paf_file, ref_index, mg_to_vg, max_deletion); + + return 0; +} + +static string strip_prefix(const string& name) { + if (name.compare(0, 3, "id=") == 0) { + size_t p = name.find('|', 3); + assert(p != string::npos); + return name.substr(p + 1); + } + return name; +} + +void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, + const unordered_map& mg_to_vg, int64_t max_deletion) { + + string buffer; + + unordered_set query_set; + string prev_query_name; + int64_t prev_query_start; + int64_t prev_query_end; + nid_t prev_target_id = 0; + int64_t prev_ref_start; + int64_t prev_ref_end; + + while (getline(paf_file, buffer)) { + vector toks; + split_delims(buffer, "\t\n", toks); + + // pretty silly not to have a paf parser in this repo + const string& query_name = toks[0]; + int64_t query_start = stol(toks[2]); + int64_t query_end = stol(toks[3]); + string target_name = strip_prefix(toks[5]); + int64_t target_length = stol(toks[6]); + int64_t target_start = stol(toks[7]); + int64_t target_end = stol(toks[8]); + + // map the target interval onto the reference path + if (!mg_to_vg.count(target_name)) { + cerr << "[filter-paf-deletions] error: target name from PAF not found in translation: " << target_name << endl; + exit(1); + } + nid_t target_id = mg_to_vg.at(target_name); + const Anchor& anchor = ref_index.at(target_id); + int64_t ref_start = anchor.min_offset + target_start; + int64_t ref_end = anchor.max_offset - (target_length - target_end); + +#ifdef debug + cerr << "anchor " << target_id << " = " << anchor.min_offset << " , " << anchor.max_offset << endl; + cerr << "ref start = " << anchor.min_offset << " + " << target_start << " ref_end = " << anchor.max_offset << " - (" + << target_length << " - " << target_end << ");" << endl; +#endif + + if (prev_query_name == query_name) { + int64_t query_delta = query_start - prev_query_end; + int64_t ref_delta = ref_start - prev_ref_end; + int64_t delta = ref_delta - query_delta; + + if (delta > max_deletion && false) { + cerr << "detected deletion of " << delta << " on following paf line" + << " derive prev query " << prev_query_start << "," << prev_query_end + << " prev target " << prev_ref_start << "," << prev_ref_end + << " query " << query_start << "," << query_end + << " target " << ref_start << "," << ref_end + + << " \n" << buffer << endl; + } + + cerr << ref_start << "\t" << ref_end << "\t" << delta << "\t" << buffer << endl; + } + + prev_query_name = query_name; + prev_query_start = query_start; + prev_query_end = query_end; + prev_target_id = target_id; + prev_ref_start = ref_start; + prev_ref_end = ref_end; + } + +} + +unordered_map index_graph(const PathHandleGraph* graph, + const string& ref_prefix) { + + // start by making a path position index + // minigraph assumption: no more than one path per handle! + unordered_map position_index; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (graph->get_path_name(path_handle).compare(0, ref_prefix.length(), ref_prefix) == 0) { + size_t offset = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + size_t len = graph->get_length(handle); + assert(len > 0); + assert(!position_index.count(handle)); + position_index[handle] = offset; + position_index[graph->flip(handle)] = offset + len - 1; + offset += len; + }); + } + }); + + if (position_index.empty()) { + cerr << "[filter-paf-deletions] error: no reference path found" << endl; + exit(0); + } + + vector> thread_results(get_thread_count()); + + // really slow brute-force relies on minigraph not having too many nodes + graph->for_each_handle([&](handle_t handle) { + unordered_map& ref_index = thread_results[omp_get_thread_num()]; + + // find all reference nodes that are connected via BFS + unordered_set context; + unordered_set ref_handles; + vector cur_handles = {handle}; + while (!cur_handles.empty()) { + vector next_handles; + for (auto& h : cur_handles) { + if (!context.count(h)) { + context.insert(h); + if (position_index.count(h)) { + // dead-end on reference + ref_handles.insert(h); + } else { + graph->follow_edges(h, false, [&](handle_t n) { + next_handles.push_back(n); + }); + graph->follow_edges(h, true, [&](handle_t p) { + next_handles.push_back(p); + }); + } + } + } + cur_handles = std::move(next_handles); + } + + // update the index with reference offsets + unordered_set ref_path_set; + int64_t min_ref_offset = numeric_limits::max(); + int64_t max_ref_offset = -1; + for (handle_t ref_handle : ref_handles) { + vector steps = graph->steps_of_handle(ref_handle); + assert(steps.size() == 1); + path_handle_t ref_path_handle = graph->get_path_handle_of_step(steps.back()); + ref_path_set.insert(ref_path_handle); + // assumption: only one reference path in component + // (fair for minigraph, but may need to do better than prefix for path selection) + assert(ref_path_set.size() == 1); + int64_t ref_offset = position_index.at(ref_handle); + int64_t ref_offset_rev = position_index.at(graph->flip(ref_handle)); + min_ref_offset = std::min(min_ref_offset, min(ref_offset, ref_offset_rev)); + max_ref_offset = std::max(max_ref_offset, max(ref_offset, ref_offset_rev)); + assert(max_ref_offset >= min_ref_offset); + } + if (!ref_path_set.empty()) { + assert(ref_path_set.size() == 1); + Anchor& anchor = ref_index[graph->get_id(handle)]; + anchor.path_handle = *ref_path_set.begin(); + anchor.min_offset = min_ref_offset; + anchor.max_offset = max_ref_offset; + assert(anchor.max_offset >= anchor.min_offset); + } + }, true); + + // merge up the indexes + for (size_t i = 1; i < thread_results.size(); ++i) { + for (const auto& id_anchor : thread_results[i]) { + thread_results[0][id_anchor.first] = id_anchor.second; + } + thread_results[i].clear(); + } + + return thread_results[0]; +} + + +pair, unordered_map> load_trans(const string& trans_path) { + ifstream trans_file(trans_path); + if (!trans_file) { + cerr << "[filter-paf-deletions] error: Unable to load trans file" << endl; + exit(1); + } + + unordered_map mg_to_vg; + unordered_map vg_to_mg; + + string buffer; + while (getline(trans_file, buffer)) { + vector toks; + split_delims(buffer, "\t\n", toks); + assert(toks.size() == 3 && toks[0] == "T"); + mg_to_vg[toks[1]] = stol(toks[2]); + vg_to_mg[stol(toks[2])] = toks[1]; + } + + return make_pair(mg_to_vg, vg_to_mg); +} + + +vector &split_delims(const string &s, const string& delims, vector &elems) { + size_t start = string::npos; + for (size_t i = 0; i < s.size(); ++i) { + if (delims.find(s[i]) != string::npos) { + if (start != string::npos && i > start) { + elems.push_back(s.substr(start, i - start)); + } + start = string::npos; + } else if (start == string::npos) { + start = i; + } + } + if (start != string::npos && start < s.size()) { + elems.push_back(s.substr(start, s.size() - start)); + } + return elems; +} + +unique_ptr load_graph(istream& graph_stream) { + + char magic_bytes[4]; + graph_stream.read(magic_bytes, 4); + uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); + graph_stream.clear(); + graph_stream.seekg(0, ios::beg); + + MutablePathMutableHandleGraph* graph; + if (magic_number == PackedGraph().get_magic_number()) { + graph = new PackedGraph(); + } else if (magic_number == HashGraph().get_magic_number()) { + graph = new HashGraph(); + } else if (magic_number == ODGI().get_magic_number()) { + graph = new ODGI(); + } else { + cerr << "Unable to parse input graph with magic number " << magic_number << endl; + exit(1); + } + dynamic_cast(graph)->deserialize(graph_stream); + + return unique_ptr(graph); +} + From 4ef5eace4b5846c6ae5115b2bf42817791644370 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 18 Nov 2021 21:42:39 -0500 Subject: [PATCH 2/8] implement filter and fix bugs --- filter-paf-deletions.cpp | 91 +++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 15 deletions(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index 48fa077..9b41e26 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -38,7 +38,7 @@ static vector &split_delims(const string &s, const string& delims, vecto static unordered_map index_graph(const PathHandleGraph* graph, const string& ref_prefix); static void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const unordered_map& mg_to_vg, int64_t max_deletion); + const unordered_map& mg_to_vg, int64_t max_deletion, bool progress); void help(char** argv) { cerr << "usage: " << argv[0] << " [options] \n" << endl @@ -146,6 +146,10 @@ int main(int argc, char** argv) { unordered_map vg_to_mg; std::tie(mg_to_vg, vg_to_mg) = load_trans(trans_path); + if (progress) { + cerr << "[filter-paf-deletions]: Loaded " << mg_to_vg.size() << " translations." << endl; + } + // open the paf ifstream paf_file(paf_path); if (!paf_file) { @@ -157,6 +161,10 @@ int main(int argc, char** argv) { // this maps each node in the graph to a (maximal) reference interval unordered_map ref_index = index_graph(graph.get(), ref_prefix); + if (progress) { + cerr << "[filter-paf-deletions]: Created reference path index" << endl; + } + #ifdef debug for (auto fam : ref_index) { cerr << fam.first << " -> " << graph->get_path_name(fam.second.path_handle) << " " << fam.second.min_offset << " " << fam.second.max_offset << endl; @@ -164,7 +172,7 @@ int main(int argc, char** argv) { #endif // we have everything needed to filter the paf - filter_paf(graph.get(), paf_file, ref_index, mg_to_vg, max_deletion); + filter_paf(graph.get(), paf_file, ref_index, mg_to_vg, max_deletion, progress); return 0; } @@ -179,7 +187,7 @@ static string strip_prefix(const string& name) { } void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const unordered_map& mg_to_vg, int64_t max_deletion) { + const unordered_map& mg_to_vg, int64_t max_deletion, bool progress) { string buffer; @@ -189,8 +197,14 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere int64_t prev_query_end; nid_t prev_target_id = 0; int64_t prev_ref_start; - int64_t prev_ref_end; - + int64_t prev_ref_end; + + // store matches in current block (run of mappings from same query that don't span a max_deletion) + vector line_to_block; // map paf line number to a block (this vector contains offsets to below vector) + vector blocks; + unordered_set cut_blocks; // offsets in above that mark block before max_deletion + + size_t line_no = 0; while (getline(paf_file, buffer)) { vector toks; split_delims(buffer, "\t\n", toks); @@ -203,6 +217,7 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere int64_t target_length = stol(toks[6]); int64_t target_start = stol(toks[7]); int64_t target_end = stol(toks[8]); + int64_t matches = stol(toks[9]); // map the target interval onto the reference path if (!mg_to_vg.count(target_name)) { @@ -219,24 +234,34 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere cerr << "ref start = " << anchor.min_offset << " + " << target_start << " ref_end = " << anchor.max_offset << " - (" << target_length << " - " << target_end << ");" << endl; #endif + bool continues_block = query_name == prev_query_name; + if (continues_block) { - if (prev_query_name == query_name) { + if (prev_query_start > query_start) { + cerr << "Input paf not in order. Sort with \"sort -k 1,1 -k 3,3n\" first. Offending line:\n" << buffer << endl; + exit(1); + } int64_t query_delta = query_start - prev_query_end; int64_t ref_delta = ref_start - prev_ref_end; int64_t delta = ref_delta - query_delta; - if (delta > max_deletion && false) { - cerr << "detected deletion of " << delta << " on following paf line" - << " derive prev query " << prev_query_start << "," << prev_query_end - << " prev target " << prev_ref_start << "," << prev_ref_end - << " query " << query_start << "," << query_end - << " target " << ref_start << "," << ref_end - - << " \n" << buffer << endl; + if (delta > max_deletion) { + if (progress) { + cerr << "[filter-paf-deletions]: detected deletion of size " << delta << " on following paf line:" + << " \n " << buffer << endl; + } + continues_block = false; + cut_blocks.insert(blocks.size() - 1); } + } - cerr << ref_start << "\t" << ref_end << "\t" << delta << "\t" << buffer << endl; + // update the current block may adding the matches (or assigng a new block if necessary) + if (!continues_block) { + blocks.push_back(0); } + assert(!blocks.empty()); + blocks.back() += matches; + line_to_block.push_back(blocks.size() - 1); prev_query_name = query_name; prev_query_start = query_start; @@ -244,8 +269,44 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere prev_target_id = target_id; prev_ref_start = ref_start; prev_ref_end = ref_end; + + ++line_no; } + + // second pass to do filtering now that we have block sizes + paf_file.clear(); + paf_file.seekg(0, ios::beg) ; + + size_t filtered_line_count = 0; + line_no = 0; + while (getline(paf_file, buffer)) { + + bool filter_out = false; + int64_t cur_block = line_to_block[line_no]; + if (cur_block > 0) { + int64_t prev_block = cur_block - 1; + if (cut_blocks.count(prev_block) && blocks[prev_block] >= blocks[cur_block]) { + filter_out = true; + } + } + if (!filter_out && cur_block < blocks.size() - 1) { + int64_t next_block = cur_block + 1; + if (cut_blocks.count(cur_block) && blocks[next_block] > blocks[cur_block]) { + filter_out =true; + } + } + + if (!filter_out) { + cout << buffer << "\n"; + } else { + ++filtered_line_count; + } + ++line_no; + } + if (progress) { + cerr << "[filter-paf-deletions]: Filtered " << filtered_line_count << " paf lines" << endl; + } } unordered_map index_graph(const PathHandleGraph* graph, From 61c98347b60d2626e572d76a7f69a401529a0ede Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 19 Nov 2021 09:01:12 -0500 Subject: [PATCH 3/8] add overlap check --- filter-paf-deletions.cpp | 69 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index 9b41e26..29005c3 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -22,6 +22,8 @@ #include "bdsg/hash_graph.hpp" #include "bdsg/odgi.hpp" +#include "IntervalTree.h" + using namespace std; using namespace handlegraph; using namespace bdsg; @@ -37,8 +39,11 @@ static pair, unordered_map> load_tra static vector &split_delims(const string &s, const string& delims, vector &elems); static unordered_map index_graph(const PathHandleGraph* graph, const string& ref_prefix); +static IntervalTree index_deletions(const PathHandleGraph* graph, const unordered_map& index); static void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const unordered_map& mg_to_vg, int64_t max_deletion, bool progress); + const IntervalTree& ref_deletions, + const unordered_map& mg_to_vg, int64_t max_deletion, + double overlap_threshold, bool progress); void help(char** argv) { cerr << "usage: " << argv[0] << " [options] \n" << endl @@ -57,6 +62,9 @@ void help(char** argv) { int main(int argc, char** argv) { bool progress = false; + // only filter deletions that don't overlap an existing deletion by at least this much + // (doesn't seem to a factor -- most big deletions not in minigraph) + double overlap_threshold = 0.5; int c; optind = 1; while (true) { @@ -160,11 +168,15 @@ int main(int argc, char** argv) { // index the minigraph // this maps each node in the graph to a (maximal) reference interval unordered_map ref_index = index_graph(graph.get(), ref_prefix); - if (progress) { cerr << "[filter-paf-deletions]: Created reference path index" << endl; } + IntervalTree ref_deletions = index_deletions(graph.get(), ref_index); + if (progress) { + cerr << "[filter-paf-deletions]: Created reference deletion index" << endl; + } + #ifdef debug for (auto fam : ref_index) { cerr << fam.first << " -> " << graph->get_path_name(fam.second.path_handle) << " " << fam.second.min_offset << " " << fam.second.max_offset << endl; @@ -172,7 +184,7 @@ int main(int argc, char** argv) { #endif // we have everything needed to filter the paf - filter_paf(graph.get(), paf_file, ref_index, mg_to_vg, max_deletion, progress); + filter_paf(graph.get(), paf_file, ref_index, ref_deletions, mg_to_vg, max_deletion, overlap_threshold, progress); return 0; } @@ -187,7 +199,9 @@ static string strip_prefix(const string& name) { } void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const unordered_map& mg_to_vg, int64_t max_deletion, bool progress) { + const IntervalTree& ref_deletions, + const unordered_map& mg_to_vg, int64_t max_deletion, + double overlap_threshold, bool progress) { string buffer; @@ -245,9 +259,20 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere int64_t ref_delta = ref_start - prev_ref_end; int64_t delta = ref_delta - query_delta; - if (delta > max_deletion) { + int64_t ref_overlap_size = 0; + if (delta > 1) { + vector> overlaps = ref_deletions.findOverlapping(prev_query_end, query_start); + for (const auto& overlap : overlaps) { + int64_t intersection_start = max(prev_query_end, overlap.start); + int64_t intersection_stop = min(query_start, overlap.stop); + ref_overlap_size = max(ref_overlap_size, intersection_stop - intersection_start); + } + } + + if (delta > max_deletion && ((double)ref_overlap_size / delta < overlap_threshold)) { if (progress) { - cerr << "[filter-paf-deletions]: detected deletion of size " << delta << " on following paf line:" + cerr << "[filter-paf-deletions]: detected deletion of size " << delta << " with overlap " << ref_overlap_size << + " on following paf line:" << " \n " << buffer << endl; } continues_block = false; @@ -405,6 +430,38 @@ unordered_map index_graph(const PathHandleGraph* graph, return thread_results[0]; } +IntervalTree index_deletions(const PathHandleGraph* graph, const unordered_map& index) { + + vector>> thread_deletions(get_thread_count()); + + // get approximate deletion intervals using the index + graph->for_each_edge([&](edge_t edge) { + const Anchor& a1 = index.at(graph->get_id(edge.first)); + const Anchor& a2 = index.at(graph->get_id(edge.second)); + Interval interval(0, 0, 0); + if (a1.min_offset < a2.min_offset) { + interval.start = a1.max_offset; + interval.stop = a2.min_offset; + } else { + interval.start = a2.max_offset; + interval.stop = a1.min_offset; + } + interval.value = interval.stop - interval.start; + if (interval.value > 1) { + thread_deletions[omp_get_thread_num()].push_back(interval); + } + }, true); + + for (size_t i = 1; i < thread_deletions.size(); ++i) { + for (const auto& interval : thread_deletions[i]) { + thread_deletions[0].push_back(interval); + } + thread_deletions[i].clear(); + } + IntervalTree tree(thread_deletions[0]); + return tree; +} + pair, unordered_map> load_trans(const string& trans_path) { ifstream trans_file(trans_path); From da884256719f225c38fc528203afb83222dbac80 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 19 Nov 2021 09:44:50 -0500 Subject: [PATCH 4/8] multi iteration --- filter-paf-deletions.cpp | 102 +++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index 29005c3..02b5e4c 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -40,28 +40,30 @@ static vector &split_delims(const string &s, const string& delims, vecto static unordered_map index_graph(const PathHandleGraph* graph, const string& ref_prefix); static IntervalTree index_deletions(const PathHandleGraph* graph, const unordered_map& index); -static void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const IntervalTree& ref_deletions, - const unordered_map& mg_to_vg, int64_t max_deletion, - double overlap_threshold, bool progress); +static int64_t filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, + const IntervalTree& ref_deletions, + const unordered_map& mg_to_vg, int64_t max_deletion, + double overlap_threshold, vector& filtered_lines, bool progress, bool verbose); void help(char** argv) { cerr << "usage: " << argv[0] << " [options] \n" << endl - << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl - << " : minigraph as obtained from vg convert -g graph.gfa" << endl - << " : node translation from vg convert -g -T" << endl - << " : paf alignment from cactus-graphmap" << endl - << " : prefix of reference path(s) in graph" - << endl - << "options: " << endl - << " -p, --progress Print progress" << endl - << " -t, --threads N number of threads to use (used only for computing snarls) [default: all available]" << endl + << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl + << " : minigraph as obtained from vg convert -g graph.gfa" << endl + << " : node translation from vg convert -g -T" << endl + << " : paf alignment from cactus-graphmap" << endl + << " : prefix of reference path(s) in graph" + << endl + << "options: " << endl + << " -p, --progress Print progress" << endl + << " -v, --verbose Print deletions" << endl + << " -t, --threads N number of threads to use (used only for computing snarls) [default: all available]" << endl << endl; } int main(int argc, char** argv) { bool progress = false; + bool verbose = false; // only filter deletions that don't overlap an existing deletion by at least this much // (doesn't seem to a factor -- most big deletions not in minigraph) double overlap_threshold = 0.5; @@ -72,13 +74,14 @@ int main(int argc, char** argv) { static const struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"progress", no_argument, 0, 'p'}, + {"verbose", no_argument, 0, 'v'}, {"threads", required_argument, 0, 't'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hpt:", + c = getopt_long (argc, argv, "hpvt:", long_options, &option_index); // Detect the end of the options. @@ -87,6 +90,9 @@ int main(int argc, char** argv) { switch (c) { + case 'v': + verbose = true; + break; case 'p': progress = true; break; @@ -184,7 +190,36 @@ int main(int argc, char** argv) { #endif // we have everything needed to filter the paf - filter_paf(graph.get(), paf_file, ref_index, ref_deletions, mg_to_vg, max_deletion, overlap_threshold, progress); + vector filtered_lines; + int64_t filtered_line_total = 0; + int64_t filtered_line_it = 0; + int64_t iteration = 0; + do { + filtered_line_it = filter_paf(graph.get(), paf_file, ref_index, ref_deletions, mg_to_vg, max_deletion, + overlap_threshold, filtered_lines, progress, verbose); + filtered_line_total += filtered_line_it; + if (progress) { + cerr << "[filter-paf-deletions]: Iteration " << iteration << ": Found " << filtered_line_it << " lines to filter" << endl; + } + ++iteration; + } while (filtered_line_it > 0); + + if (progress) { + cerr << "[filter-paf-deletions]: Filtering out " << filtered_line_total << " paf lines" << endl; + } + + // output the unfiltered lines + paf_file.clear(); + paf_file.seekg(0, ios::beg) ; + string buffer; + for (int64_t line_no = 0; line_no < filtered_lines.size(); ++line_no) { + const auto& ret = getline(paf_file, buffer); + assert(ret); + if (filtered_lines[line_no] == false) { + cout << buffer << "\n"; + } + } + cout << flush; return 0; } @@ -198,11 +233,13 @@ static string strip_prefix(const string& name) { return name; } -void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, - const IntervalTree& ref_deletions, - const unordered_map& mg_to_vg, int64_t max_deletion, - double overlap_threshold, bool progress) { +int64_t filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordered_map& ref_index, + const IntervalTree& ref_deletions, + const unordered_map& mg_to_vg, int64_t max_deletion, + double overlap_threshold, vector& filtered_lines, bool progress, bool verbose) { + paf_file.clear(); + paf_file.seekg(0, ios::beg) ; string buffer; unordered_set query_set; @@ -219,7 +256,15 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere unordered_set cut_blocks; // offsets in above that mark block before max_deletion size_t line_no = 0; + size_t unfiltered_line_no = 0; while (getline(paf_file, buffer)) { + if (unfiltered_line_no >= filtered_lines.size()) { + // first pass + filtered_lines.push_back(false); + } else if (filtered_lines[unfiltered_line_no]) { + ++unfiltered_line_no; + continue; + } vector toks; split_delims(buffer, "\t\n", toks); @@ -270,7 +315,7 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere } if (delta > max_deletion && ((double)ref_overlap_size / delta < overlap_threshold)) { - if (progress) { + if (verbose) { cerr << "[filter-paf-deletions]: detected deletion of size " << delta << " with overlap " << ref_overlap_size << " on following paf line:" << " \n " << buffer << endl; @@ -296,6 +341,7 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere prev_ref_end = ref_end; ++line_no; + ++unfiltered_line_no; } // second pass to do filtering now that we have block sizes @@ -305,7 +351,12 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere size_t filtered_line_count = 0; line_no = 0; + unfiltered_line_no = 0; while (getline(paf_file, buffer)) { + if (filtered_lines[unfiltered_line_no]) { + ++unfiltered_line_no; + continue; + } bool filter_out = false; int64_t cur_block = line_to_block[line_no]; @@ -322,16 +373,15 @@ void filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unordere } } - if (!filter_out) { - cout << buffer << "\n"; - } else { + if (filter_out) { + filtered_lines[unfiltered_line_no] = true; ++filtered_line_count; } + ++unfiltered_line_no; ++line_no; } - if (progress) { - cerr << "[filter-paf-deletions]: Filtered " << filtered_line_count << " paf lines" << endl; - } + + return filtered_line_count; } unordered_map index_graph(const PathHandleGraph* graph, From 35df71b064cc15871d7ffdacf5d710d46f7e538c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 19 Nov 2021 10:46:13 -0500 Subject: [PATCH 5/8] optional ref prefix --- filter-paf-deletions.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index 02b5e4c..3d1a83c 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -46,14 +46,15 @@ static int64_t filter_paf(const PathHandleGraph* graph, ifstream& paf_file, cons double overlap_threshold, vector& filtered_lines, bool progress, bool verbose); void help(char** argv) { - cerr << "usage: " << argv[0] << " [options] \n" << endl + cerr << "usage: " << argv[0] << " [options] \n" << endl << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl << " : minigraph as obtained from vg convert -g graph.gfa" << endl << " : node translation from vg convert -g -T" << endl << " : paf alignment from cactus-graphmap" << endl - << " : prefix of reference path(s) in graph" + << " : only remove deletions greater than this" << endl << endl << "options: " << endl + << " -r, --ref-prefix STR Only consider paths whose names start with STR" << endl << " -p, --progress Print progress" << endl << " -v, --verbose Print deletions" << endl << " -t, --threads N number of threads to use (used only for computing snarls) [default: all available]" << endl @@ -62,6 +63,7 @@ void help(char** argv) { int main(int argc, char** argv) { + string ref_prefix; bool progress = false; bool verbose = false; // only filter deletions that don't overlap an existing deletion by at least this much @@ -72,6 +74,7 @@ int main(int argc, char** argv) { while (true) { static const struct option long_options[] = { + {"ref-prefix", required_argument, 0, 'r'}, {"help", no_argument, 0, 'h'}, {"progress", no_argument, 0, 'p'}, {"verbose", no_argument, 0, 'v'}, @@ -90,6 +93,9 @@ int main(int argc, char** argv) { switch (c) { + case 'r': + ref_prefix = optarg; + break; case 'v': verbose = true; break; @@ -117,7 +123,7 @@ int main(int argc, char** argv) { } } - if (argc <= 5) { + if (argc <= 4) { cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; help(argv); return 1; @@ -130,7 +136,7 @@ int main(int argc, char** argv) { return 1; } - if (optind != argc - 5) { + if (optind != argc - 4) { cerr << "[filter-paf-deletions] error: too many arguments\n" << endl; help(argv); return 1; @@ -139,7 +145,6 @@ int main(int argc, char** argv) { string graph_path = argv[optind++]; string trans_path = argv[optind++]; string paf_path = argv[optind++]; - string ref_prefix = argv[optind++]; int64_t max_deletion = stol(argv[optind++]); // load the graph From 3bb36b606051b6b3150948c358177b746ecfff23 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 22 Nov 2021 11:12:05 -0500 Subject: [PATCH 6/8] allow negative strand gaps! --- filter-paf-deletions.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index 3d1a83c..cde2915 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -305,9 +305,12 @@ int64_t filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unord cerr << "Input paf not in order. Sort with \"sort -k 1,1 -k 3,3n\" first. Offending line:\n" << buffer << endl; exit(1); } - int64_t query_delta = query_start - prev_query_end; - int64_t ref_delta = ref_start - prev_ref_end; - int64_t delta = ref_delta - query_delta; + int64_t query_delta = abs(query_start - prev_query_end); + int64_t ref_delta = abs(ref_start - prev_ref_end); + int64_t delta = abs(ref_delta - query_delta); +#ifdef debug + cerr << "query delta " << query_delta << " ref delta = " << ref_delta << " delta = " << delta << endl; +#endif int64_t ref_overlap_size = 0; if (delta > 1) { From e08680001dd8d0bdfe9950ce491719889f883144 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 22 Nov 2021 16:18:55 -0500 Subject: [PATCH 7/8] fix --- filter-paf-deletions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-paf-deletions.cpp b/filter-paf-deletions.cpp index cde2915..b5cf524 100644 --- a/filter-paf-deletions.cpp +++ b/filter-paf-deletions.cpp @@ -314,7 +314,7 @@ int64_t filter_paf(const PathHandleGraph* graph, ifstream& paf_file, const unord int64_t ref_overlap_size = 0; if (delta > 1) { - vector> overlaps = ref_deletions.findOverlapping(prev_query_end, query_start); + vector> overlaps = ref_deletions.findOverlapping(min(prev_query_end, query_start), max(prev_query_end, query_start)); for (const auto& overlap : overlaps) { int64_t intersection_start = max(prev_query_end, overlap.start); int64_t intersection_stop = min(query_start, overlap.stop); From ce52e533e5a39489652dacde255201f5301b37d1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 Dec 2021 11:51:59 -0500 Subject: [PATCH 8/8] fix up self-loop handling in forwardizer --- clip-vg.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/clip-vg.cpp b/clip-vg.cpp index 9d461ce..771bfcb 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -768,11 +768,25 @@ void forwardize_paths(MutablePathMutableHandleGraph* graph, const string& ref_pr if (graph->get_is_reverse(handle)) { handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); graph->follow_edges(handle, true, [&](handle_t prev_handle) { - graph->create_edge(prev_handle, flipped_handle); + if (graph->get_id(prev_handle) != graph->get_id(handle)) { + graph->create_edge(prev_handle, flipped_handle); + } }); graph->follow_edges(handle, false, [&](handle_t next_handle) { - graph->create_edge(flipped_handle, next_handle); + if (graph->get_id(handle) != graph->get_id(next_handle)) { + graph->create_edge(flipped_handle, next_handle); + } }); + // self-loop cases we punted on above: + if (graph->has_edge(handle, handle)) { + graph->create_edge(flipped_handle, flipped_handle); + } + if (graph->has_edge(handle, graph->flip(handle))) { + graph->create_edge(flipped_handle, graph->flip(flipped_handle)); + } + if (graph->has_edge(graph->flip(handle), handle)) { + graph->create_edge(graph->flip(flipped_handle), flipped_handle); + } vector steps = graph->steps_of_handle(handle); size_t ref_count = 0; for (step_handle_t step : steps) {