diff --git a/clip-vg.cpp b/clip-vg.cpp index b4d94d1..046dab8 100644 --- a/clip-vg.cpp +++ b/clip-vg.cpp @@ -67,48 +67,19 @@ static unordered_map>> get_clipped_interva const unordered_map>>& input_intervals, const unordered_map>>& output_intervals); -// from path.cpp in vg -// Check if using subpath naming scheme. If it is return true, -// the root path name, and the offset (false otherwise) -// formats accepted: -// name[start] (0-based) -// name[start-end] (0-based, open-ended like BED, end defaults to 0 in above case) -static tupleparse_subpath_name(const string& path_name) { - size_t tag_offset = path_name.rfind("["); - if (tag_offset == string::npos || tag_offset + 2 >= path_name.length() || path_name.back() != ']') { - return make_tuple(false, "", 0, 0); - } else { - string offset_str = path_name.substr(tag_offset + 1, path_name.length() - tag_offset - 2); - size_t sep_offset = offset_str.find("-"); - string end_offset_str; - if (sep_offset != string::npos && !offset_str.empty() && sep_offset < offset_str.length() - 1) { - end_offset_str = offset_str.substr(sep_offset + 1, offset_str.length() - sep_offset - 1); - offset_str = offset_str.substr(0, sep_offset); - } - size_t offset_val; - size_t end_offset_val = 0; - try { - offset_val = std::stol(offset_str); - if (!end_offset_str.empty()) { - end_offset_val = std::stol(end_offset_str); - } - } catch(...) { - return make_tuple(false, "", 0, 0); - } - return make_tuple(true, path_name.substr(0, tag_offset), offset_val, end_offset_val); - } -} - // Create a subpath name (todo: make same function in vg consistent (it only includes start)) static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) { - tuple parsed_name = parse_subpath_name(path_name); - size_t cur_offset = 0; - const string* cur_name = &path_name; - if (get<0>(parsed_name)) { - cur_offset = get<2>(parsed_name); - cur_name = &get<1>(parsed_name); - } - return *cur_name + "[" + std::to_string(cur_offset + offset) + "-" + std::to_string(cur_offset + offset + length) + "]"; + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); + subrange.first = subrange != PathMetadata::NO_SUBRANGE ? subrange.first : 0; + subrange.first += offset; + subrange.second = subrange.first + length; + return PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); } int main(int argc, char** argv) { @@ -1029,15 +1000,12 @@ unordered_map>> get_path_intervals(const P unordered_map>> path_intervals; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); - tuple subpath_info = parse_subpath_name(path_name); size_t path_len = 0; graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { path_len += graph->get_length(graph->get_handle_of_step(step_handle)); }); - int64_t start = get<0>(subpath_info) ? get<2>(subpath_info) : 0; - if (get<0>(subpath_info)) { - path_name = get<1>(subpath_info); - } + subrange_t path_range = graph->get_subrange(path_handle); + int64_t start = path_range == PathMetadata::NO_SUBRANGE ? 0 : path_range.second; vector>& intervals = path_intervals[path_name]; intervals.push_back(make_pair(start, start + path_len)); }); diff --git a/deps/hal b/deps/hal index 908bcc1..375b84f 160000 --- a/deps/hal +++ b/deps/hal @@ -1 +1 @@ -Subproject commit 908bcc132e9e53fed30f678e9f2b6ee27e8589da +Subproject commit 375b84f51f9c20e0af85945988da7d0b0ff9876c diff --git a/deps/libbdsg-easy b/deps/libbdsg-easy index e37cf0e..2e5211b 160000 --- a/deps/libbdsg-easy +++ b/deps/libbdsg-easy @@ -1 +1 @@ -Subproject commit e37cf0e1981abb8bb8a93e879d11e8b3b8a07fdd +Subproject commit 2e5211bfa830399e011dddfaf5e827ddff5a4213 diff --git a/deps/sonLib b/deps/sonLib index ea0b939..a4d45c4 160000 --- a/deps/sonLib +++ b/deps/sonLib @@ -1 +1 @@ -Subproject commit ea0b939828ba24d998a7c1aa407ff5a016912f56 +Subproject commit a4d45c46c6b9ee580fbac89db10894bac8844fd9 diff --git a/hal2vg.cpp b/hal2vg.cpp index 56cf41a..0e9e623 100644 --- a/hal2vg.cpp +++ b/hal2vg.cpp @@ -30,6 +30,9 @@ using namespace handlegraph; static void initParser(CLParser* optionsParser) { optionsParser->addArgument("halFile", "input hal file"); + optionsParser->addOption("refGenomes", + "comma-separated (no spaces) genomes to treat as reference paths with all others as haplotype paths (default=all genomes)", + "\"\""); optionsParser->addOption("rootGenome", "process only genomes in clade with specified root" " (HAL root if empty)", @@ -44,11 +47,6 @@ static void initParser(CLParser* optionsParser) { optionsParser->addOption("ignoreGenomes", "comma-separated (no spaces) list of genomes to ignore", "\"\""); - optionsParser->addOptionFlag("onlySequenceNames", - "use only sequence names for output names. By " - "default, the UCSC convention of " - "Genome.Sequence is used", - false); optionsParser->addOption("outputFormat", "output graph format in {pg, hg, odgi} [default=pg]", "pg"); @@ -65,20 +63,17 @@ static void initParser(CLParser* optionsParser) { static void add_genome_threads(const Genome* genome, stPinchThreadSet* threads, vector& IDToName, - unordered_map& nameToID, - bool fullNames); + unordered_map& nameToID); static void pinch_genome(const Genome* genome, stPinchThreadSet* threads, unordered_map& nameToID, - bool fullNames, const vector& targetNames, unordered_map>& snp_cache); static void pinch_snp(const Genome* genome, stPinchThreadSet* threads, unordered_map& nameToID, - bool fullNames, const TopSegmentIteratorPtr& topIt, int64_t topOffset, ColumnIteratorPtr& colIt, @@ -92,32 +87,34 @@ static void pinch_to_handle(const Genome* genome, const unordered_map& nameToID, unordered_map& blockToNode, MutablePathMutableHandleGraph& graph, - bool fullNames); + const vector& refNames); static void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength); -static void resolve_subpath_naming(string& path_name); +static subrange_t resolve_subpath_naming(string& path_name); + +static size_t resolve_haplotype_naming(string& genome_name); int main(int argc, char** argv) { CLParser optionsParser; initParser(&optionsParser); string halPath; + string refGenomes; string rootGenomeName; string targetGenomes; bool noAncestors; string ignoreGenomes; - bool fullNames; string outputFormat; size_t maxNodeLength; bool progress; try { optionsParser.parseOptions(argc, argv); halPath = optionsParser.getArgument("halFile"); + refGenomes = optionsParser.getOption("refGenomes"); rootGenomeName = optionsParser.getOption("rootGenome"); targetGenomes = optionsParser.getOption("targetGenomes"); noAncestors = optionsParser.getFlag("noAncestors"); ignoreGenomes = optionsParser.getOption("ignoreGenomes"); - fullNames = !optionsParser.getFlag("onlySequenceNames"); outputFormat = optionsParser.getOption("outputFormat"); if (outputFormat != "pg" && outputFormat != "hg" && outputFormat != "odgi") { throw hal_exception("--outputFormat must be one of {pg, hg, odgi}"); @@ -141,6 +138,12 @@ int main(int argc, char** argv) { throw hal_exception("input hal alignmenet is empty"); } + vector refNames; + if (refGenomes != "\"\"") { + refNames = chopString(refGenomes, ","); + std::sort(refNames.begin(), refNames.end()); + } + // default to alignment root if none specified bool givenRoot = true; if (rootGenomeName == "\"\"") { @@ -261,7 +264,7 @@ int main(int argc, char** argv) { if (progress && !(!curParent.empty() && genomeName != rootGenomeName)) { cerr << "adding threads from " << genome->getName() << endl; } - add_genome_threads(genome, threadSet, IDToName, nameToID, fullNames); + add_genome_threads(genome, threadSet, IDToName, nameToID); } if (!ignoreGenome && !curParent.empty() && genomeName != rootGenomeName) { @@ -302,7 +305,7 @@ int main(int argc, char** argv) { if (progress) { cerr << "pinching " << pinchGenomes[i] << endl; } - pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, fullNames, targetNames, snp_cache); + pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, targetNames, snp_cache); } snp_cache.clear(); @@ -347,7 +350,7 @@ int main(int argc, char** argv) { cerr << "converting " << genomeName << " with " << genome->getNumSequences() << " sequences and total length " << genome->getSequenceLength() << endl; } - pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, fullNames); + pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, refNames); alignment->closeGenome(genome); } @@ -388,15 +391,14 @@ int main(int argc, char** argv) { // Add every sequence from the genome into the pinch graph void add_genome_threads(const Genome* genome, - stPinchThreadSet* threads, - vector& IDToName, - unordered_map& nameToID, - bool fullNames) { + stPinchThreadSet* threads, + vector& IDToName, + unordered_map& nameToID) { for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { const Sequence *sequence = seqIt->getSequence(); hal_size_t seqLen = sequence->getSequenceLength(); - string name = fullNames ? sequence->getFullName() : sequence->getName(); + string name = sequence->getFullName(); // update lookups to map hal sequence to numeric id int64_t seqID = IDToName.size(); nameToID[name] = seqID; @@ -413,7 +415,6 @@ void add_genome_threads(const Genome* genome, void pinch_genome(const Genome* genome, stPinchThreadSet* threads, unordered_map& nameToID, - bool fullNames, const vector& targetNames, unordered_map>& snp_cache) { @@ -451,8 +452,8 @@ void pinch_genome(const Genome* genome, botIt->toParent(topIt); // todo: lots of string lookups - int64_t topID = nameToID[fullNames ? topIt->tseg()->getSequence()->getFullName() : topIt->tseg()->getSequence()->getName()]; - int64_t botID = nameToID[fullNames ? botIt->bseg()->getSequence()->getFullName() : botIt->bseg()->getSequence()->getName()]; + int64_t topID = nameToID[topIt->tseg()->getSequence()->getFullName()]; + int64_t botID = nameToID[botIt->bseg()->getSequence()->getFullName()]; if (topIt->tseg()->getSequence() != topSeq) { topSeq = topIt->tseg()->getSequence(); @@ -483,7 +484,7 @@ void pinch_genome(const Genome* genome, } last_match = i; } else if (colIt.get() != NULL) { - pinch_snp(genome, threads, nameToID, fullNames, topIt, i, colIt, + pinch_snp(genome, threads, nameToID, topIt, i, colIt, std::toupper(topString[i]), topThread, snp_cache); } if (std::toupper(topString[i]) != std::toupper(botString[i]) || i == (int64_t)topString.length() - 1) { @@ -565,7 +566,6 @@ void pinch_genome(const Genome* genome, void pinch_snp(const Genome* genome, stPinchThreadSet* threads, unordered_map& nameToID, - bool fullNames, const TopSegmentIteratorPtr& topIt, int64_t topOffset, ColumnIteratorPtr& colIt, @@ -596,7 +596,7 @@ void pinch_snp(const Genome* genome, for (ColumnIterator::DNASet::const_iterator dsi = cmi->second->begin(); dsi != cmi->second->end(); ++dsi) { char botBase = std::toupper((*dsi)->getBase()); - int64_t otherID = nameToID[fullNames ? sequence->getFullName() : sequence->getName()]; + int64_t otherID = nameToID[sequence->getFullName()]; stPinchThread* otherThread = stPinchThreadSet_getThread(threads, otherID); hal_index_t otherStart = (*dsi)->getArrayIndex() - sequence->getStartPosition(); @@ -634,22 +634,36 @@ void pinch_to_handle(const Genome* genome, const unordered_map& nameToID, unordered_map& blockToNode, MutablePathMutableHandleGraph& graph, - bool fullNames) { + const vector& refNames) { // iterate over the sequences of the genome for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { const Sequence *sequence = seqIt->getSequence(); - string seqName = fullNames ? sequence->getFullName() : sequence->getName(); - int64_t seqID = nameToID.find(seqName)->second; + PathSense sense = PathSense::REFERENCE; + if (!refNames.empty() && !std::binary_search(refNames.begin(), refNames.end(), genome->getName())) { + sense = PathSense::HAPLOTYPE; + } + int64_t seqID = nameToID.find(sequence->getFullName())->second; stPinchThread* thread = stPinchThreadSet_getThread(threadSet, seqID); // cactus_graphmap_split can make paths like contig_sub_1_3. here we convert that // into a format vg can (sometimes) understand contig[1-3]. // (the reason we go through this is that assembly hubs can't handle any special characters apparently) - resolve_subpath_naming(seqName); - + string parsed_name = sequence->getName(); + subrange_t subpath = resolve_subpath_naming(parsed_name); + string parsed_genome_name = genome->getName(); + size_t haplotype = resolve_haplotype_naming(parsed_genome_name); + if (sense == PathSense::HAPLOTYPE && haplotype == PathMetadata::NO_HAPLOTYPE) { + haplotype = 0; + } // create the path - path_handle_t pathHandle = graph.create_path_handle(seqName); + path_handle_t pathHandle = graph.create_path(sense, + parsed_genome_name, + parsed_name, + haplotype, + sense == PathSense::HAPLOTYPE ? 0 : PathMetadata::NO_PHASE_BLOCK, + subpath, + false); string pathString; // iterate over the segments of the sequence @@ -731,7 +745,7 @@ void pinch_to_handle(const Genome* genome, string halPathString; sequence->getString(halPathString); if (pathString.length() != halPathString.length()) { - throw runtime_error("Incorrect length in coverted path for " + seqName + ": " + std::to_string(pathString.length()) + + throw runtime_error("Incorrect length in coverted path for " + sequence->getFullName() + ": " + std::to_string(pathString.length()) + ". Should be: " + std::to_string(halPathString.length())); } vector mismatches; @@ -742,7 +756,7 @@ void pinch_to_handle(const Genome* genome, } if (!mismatches.empty()) { stringstream msg; - msg << mismatches.size() << " mismatches found in converted path for " << seqName << ":\n"; + msg << mismatches.size() << " mismatches found in converted path for " << sequence->getFullName() << ":\n"; for (size_t i = 0; i < mismatches.size() && i < 10; ++i) { msg << " path[" << mismatches[i] << "]=" << pathString[mismatches[i]] << ". should be " << halPathString[mismatches[i]] << "\n"; } @@ -771,9 +785,10 @@ void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength) { } } -void resolve_subpath_naming(string& path_name) { +subrange_t resolve_subpath_naming(string& path_name) { size_t first_length = 0; size_t start_offset = 0; + bool found_subpath = false; while (true) { size_t sp = path_name.rfind("_sub_"); if (sp != string::npos) { @@ -781,12 +796,8 @@ void resolve_subpath_naming(string& path_name) { if (up != string::npos && up > sp + 1) { int64_t start; int64_t end; - try { - start = stol(path_name.substr(sp + 5, up - sp - 5)); - end = stol(path_name.substr(up + 1)); - } catch (...) { - return; - } + start = stol(path_name.substr(sp + 5, up - sp - 5)); + end = stol(path_name.substr(up + 1)); stringstream new_name; start_offset += start; // final offset is sum of all nested offsets if (first_length == 0) { @@ -797,12 +808,30 @@ void resolve_subpath_naming(string& path_name) { // be derived from the start, plus the length of the "top" path end = start_offset + first_length; } - new_name << path_name.substr(0, sp) << "[" << start_offset << "-" << end << "]"; + new_name << path_name.substr(0, sp); path_name = new_name.str(); + found_subpath = true; } } else { break; } } - return; + if (found_subpath) { + return make_pair(start_offset, start_offset + first_length); + } else { + return PathMetadata::NO_SUBRANGE; + } +} + +size_t resolve_haplotype_naming(string& genome_name) { + size_t haplotype = PathMetadata::NO_HAPLOTYPE; + size_t dp = genome_name.rfind("."); + if (dp != string::npos) { + try { + haplotype = stol(genome_name.substr(dp + 1)); + genome_name = genome_name.substr(0, dp); + } catch(...) { + } + } + return haplotype; } diff --git a/include.mk b/include.mk index 76666d1..409e5c4 100644 --- a/include.mk +++ b/include.mk @@ -13,7 +13,7 @@ libbdsgPath=${rootPath}/deps/libbdsg-easy include ${sonLibRootPath}/include.mk CFLAGS += -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -CXXFLAGS += -std=c++11 -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -I ${libbdsgPath}/include -UNDEBUG +CXXFLAGS += -std=c++14 -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -I ${libbdsgPath}/include -UNDEBUG basicLibs = ${halPath}/libHal.a ${sonLibPath}/stPinchesAndCacti.a ${sonLibPath}/sonLib.a ${sonLibPath}/cuTest.a ${libbdsgPath}/lib/libbdsg.a ${libbdsgPath}/lib/libhandlegraph.a ${libbdsgPath}/lib/libsdsl.a ${libbdsgPath}/lib/libdivsufsort.a ${libbdsgPath}/lib/libdivsufsort64.a basicLibsDependencies = ${basicLibs} diff --git a/tests/small/truth.json b/tests/small/truth.json index 8b2142f..52e8e81 100644 --- a/tests/small/truth.json +++ b/tests/small/truth.json @@ -173,7 +173,7 @@ "rank": "6" } ], - "name": "cat.3" + "name": "cat#3" }, { "mapping": [ @@ -264,7 +264,7 @@ "rank": "7" } ], - "name": "chimp.2" + "name": "chimp#2" }, { "mapping": [ @@ -368,7 +368,7 @@ "rank": "8" } ], - "name": "human.1" + "name": "human#1" } ] } diff --git a/tests/t/merge.t b/tests/t/merge.t index 19fda86..0ef503d 100644 --- a/tests/t/merge.t +++ b/tests/t/merge.t @@ -27,8 +27,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg vg view small.vg | sort > small.gfa vg view small2.vg | sort > small2.gfa -vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa -vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa +vg find -x merged1.vg -p cat#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/cat human chimp/human chimp cat/g" > merged1.comp1.gfa +vg find -x merged1.vg -p cow#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cow/cow human chimp/g" > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa @@ -60,8 +60,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg vg view small.vg | sort > small.gfa vg view small2.vg | sort > small2.gfa -vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa -vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa +vg find -x merged1.vg -p cat#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/cat human chimp/human chimp cat/g" > merged1.comp1.gfa +vg find -x merged1.vg -p cow#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cow/cow human chimp/g" > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa