From f4de368acaf021cee97eec212c217e31a6d82f3d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Mar 2021 15:48:19 -0400 Subject: [PATCH 1/6] add halMergeChroms --- Makefile | 18 +++- halMergeChroms.cpp | 232 +++++++++++++++++++++++++++++++++++++++++++++ halRemoveDupes.cpp | 3 - 3 files changed, 247 insertions(+), 6 deletions(-) create mode 100644 halMergeChroms.cpp diff --git a/Makefile b/Makefile index 5a8f5a5..bf0445d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ rootPath = ./ include ./include.mk -all : hal2vg clip-vg halRemoveDupes +all : hal2vg clip-vg halRemoveDupes halMergeChroms # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed # from source and configured with "--enable-static --disable-shared", then have its @@ -28,12 +28,18 @@ ifeq ($(shell ldd halRemoveDupes | grep "not a dynamic" | wc -l), $(shell ls hal else $(error ldd found dnymaic linked dependency in halRemoveDupes) endif +ifeq ($(shell ldd halMergeChroms | grep "not a dynamic" | wc -l), $(shell ls halMergeChroms | wc -l)) + $(info ldd verified that halMergeChroms static) +else + $(error ldd found dnymaic linked dependency in halMergeChroms) +endif + cleanFast : - rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o clean : - rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o + rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o cd deps/sonLib && make clean cd deps/pinchesAndCacti && make clean cd deps/hal && make clean @@ -69,6 +75,12 @@ halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies} halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies} ${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o ${basicLibs} -o halRemoveDupes +halMergeChroms.o : halMergeChroms.cpp ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -I . halMergeChroms.cpp -c + +halMergeChroms : halMergeChroms.o ${basicLibsDependencies} + ${cpp} ${CXXFLAGS} -fopenmp -pthread halMergeChroms.o ${basicLibs} -o halMergeChroms + test : make cd tests && prove -v t diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp new file mode 100644 index 0000000..08519de --- /dev/null +++ b/halMergeChroms.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) + * + * Released under the MIT license, see LICENSE.txt + */ + +// Merge chromosome HAL files into one big one. Only star trees supported, with same root name supported. + +//#define debug + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hal.h" + +using namespace std; +using namespace hal; + +static void initParser(CLParser* optionsParser) { + optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge"); + optionsParser->addArgument("outFile", "output HAL file"); + optionsParser->setDescription("Merge chromosome HALs into combined file. Ancestral sequences are renamed as needed to avoid conflicts" + ". Star trees only"); +} + +// get the dimensions from a file +static pair>> get_hal_dimensions(CLParser* optionsParser, const string& hal_path) { + + // open the hal file + AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS)); + + // our output map + unordered_map> dimensions; + + // for every genome + vector genome_names = alignment->getChildNames(alignment->getRootName()); + genome_names.push_back(alignment->getRootName()); + for (const string& genome_name : genome_names) { + const Genome* genome = alignment->openGenome(genome_name); + vector& genome_dimensions = dimensions[genome_name]; + // for every sequence + for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { + const Sequence *sequence = seqIt->getSequence(); + genome_dimensions.emplace_back(sequence->getName(), + sequence->getSequenceLength(), + sequence->getNumTopSegments(), + sequence->getNumBottomSegments()); + } + alignment->closeGenome(genome); + } + + return make_pair(alignment->getRootName(), dimensions); +} + +// append each input hal to the out_alignment, one after another. all arrays are copied over, +// but need to be adjsuted to reflect their new offsets. +static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector& in_paths) { + + // keep track of where we are in the output + vector top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0); + size_t bot_offset = 0; + + for (size_t i = 0; i < in_paths.size(); ++i) { + AlignmentConstPtr in_alignment(openHalAlignment(in_paths[i], optionsParser, READ_ACCESS)); + const Genome* in_root = in_alignment->openGenome(in_alignment->getRootName()); + Genome* out_root = out_alignment->openGenome(in_alignment->getRootName()); + assert(in_root->getName() == out_root->getName()); + size_t in_root_degree = in_root->getNumChildren(); + vector in_genomes = {in_root}; + + // make a child index map (in -> out) for the root genome + // assume : all genomes in in_genome present in out_genome + vector in_ci_to_out_ci(in_root->getNumChildren()); + for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { + in_ci_to_out_ci.at(in_root->getChildIndex(in_alignment->openGenome(in_child_name))) = + out_root->getChildIndex(out_alignment->openGenome(in_child_name)); + } + + // copy over the bottom segments of the root + BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0); + BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset); + for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) { + // set the segment in the root genome + assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset); + out_bi->bseg()->setTopParseIndex(NULL_INDEX); + // determine the sequence-relative coordinate in the input + const Sequence* in_sequence = in_bi->bseg()->getSequence(); + int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition(); + // set the sequence relative coordinate in the output + const Sequence* out_sequence = out_root->getSequence(in_sequence->getName()); + int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; + out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength()); + + // set the segment in the child genomes + for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) { + size_t out_ci = in_ci_to_out_ci[in_ci]; + out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); + out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); + } + } + + // for every child genome, copy over the top segments + for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { + const Genome* in_child = in_alignment->openGenome(in_child_name); + Genome* out_child = out_alignment->openGenome(in_child_name); + + size_t in_ci = in_root->getChildIndex(in_child); + size_t out_ci = in_ci_to_out_ci[in_ci]; + size_t top_offset = top_offsets[out_ci]; + TopSegmentIteratorPtr in_ti = in_child->getTopSegmentIterator(0); + TopSegmentIteratorPtr out_ti = out_child->getTopSegmentIterator(top_offsets[out_ci]); + + for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) { + // set the segment in the child genome + assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset); + out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset); + out_ti->tseg()->setBottomParseIndex(NULL_INDEX); + // determine the sequence-relative coordinate in the input + const Sequence* in_sequence = in_ti->tseg()->getSequence(); + int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition(); + // set the sequence relative coordinate in the output + const Sequence* out_sequence = out_root->getSequence(in_sequence->getName()); + int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; + out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength()); + // set the paralogy edge + if (in_ti->tseg()->hasNextParalogy()) { + out_ti->tseg()->setNextParalogyIndex(in_ti->tseg()->getNextParalogyIndex() + top_offset); + } else { + out_ti->tseg()->setNextParalogyIndex(NULL_INDEX); + } + } + in_genomes.push_back(in_child); + } + + // copy the dna sequence by sequence + for (const Genome* in_genome : in_genomes) { + Genome* out_genome = out_alignment->openGenome(in_genome->getName()); + for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { + const Sequence* in_sequence = in_si->getSequence(); + Sequence* out_sequence = out_genome->getSequence(in_sequence->getName()); + string dna; + in_sequence->getString(dna); + out_sequence->setString(dna); + } + } + + // update the offsets to move past the current alignment in all genomes + bot_offset += in_root->getNumBottomSegments(); + for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { + const Genome* in_child = in_alignment->openGenome(in_child_name); + size_t in_ci = in_root->getChildIndex(in_child); + size_t out_ci = in_ci_to_out_ci[in_ci]; + top_offsets[out_ci] += in_child->getNumTopSegments(); + } + } +} + +int main(int argc, char** argv) { + CLParser optionsParser(WRITE_ACCESS); + initParser(&optionsParser); + string in_hal_paths; + string out_hal_path; + try { + optionsParser.parseOptions(argc, argv); + in_hal_paths = optionsParser.getArgument("inFiles"); + out_hal_path = optionsParser.getArgument("outFile"); + } + catch(exception& e) { + cerr << e.what() << endl; + optionsParser.printUsage(cerr); + exit(1); + } + + vector in_paths = chopString(in_hal_paths, ","); + + // map genome -> dimensions for each input alignment + vector>> dimensions_list; + string root_name; + for (const string& in_path : in_paths) { + auto rd = get_hal_dimensions(&optionsParser, in_path); + dimensions_list.push_back(rd.second); + if (root_name.empty()) { + root_name = rd.first; + assert(in_path == in_paths[0]); + } else if (rd.first != root_name) { + throw hal_exception("Root mismatch: " + rd.first + " vs " + root_name); + } + } + + // check uniqueness and sum up + // todo: resolve ancestor name collisions + unordered_set sequence_names; + unordered_map> total_dimensions; + for (const unordered_map>& dimensions : dimensions_list) { + for (const auto& kv : dimensions) { + for (const Sequence::Info& sequence_info : kv.second) { + string name = kv.first + "." + sequence_info._name; + if (sequence_names.count(name)) { + throw hal_exception("Conflict: sequence name found in more than one file: " + name); + } + sequence_names.insert(name); + } + total_dimensions[kv.first].insert(total_dimensions[kv.first].begin(), kv.second.begin(), kv.second.end()); + } + } + + // create the new file + AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS)); + + // set up the size of each genome + Genome* root_genome = alignment->addRootGenome(root_name); + root_genome->setDimensions(total_dimensions.at(root_name)); + for (const auto& gd : total_dimensions) { + if (gd.first != root_name) { + Genome* leaf_genome = alignment->addLeafGenome(gd.first, root_name, 1); + leaf_genome->setDimensions(gd.second); + } + } + + // copy over over everything + merge_hals(&optionsParser, alignment, in_paths); + + + return 0; +} + diff --git a/halRemoveDupes.cpp b/halRemoveDupes.cpp index 3cd10cd..e0880d5 100644 --- a/halRemoveDupes.cpp +++ b/halRemoveDupes.cpp @@ -4,9 +4,6 @@ * Released under the MIT license, see LICENSE.txt */ -// This file was created by merging hal2sg.cpp and sg2vg.cpp with -// a small amount of glue for the interface. - //#define debug #include From af1fd0a2a7e62c19a94cba03839082f560458766 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Mar 2021 19:37:57 -0400 Subject: [PATCH 2/6] get halMergeChroms running --- halMergeChroms.cpp | 173 +++++++++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 76 deletions(-) diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp index 08519de..561c230 100644 --- a/halMergeChroms.cpp +++ b/halMergeChroms.cpp @@ -29,33 +29,62 @@ static void initParser(CLParser* optionsParser) { ". Star trees only"); } -// get the dimensions from a file -static pair>> get_hal_dimensions(CLParser* optionsParser, const string& hal_path) { - - // open the hal file - AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS)); +// we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding +// .i to them where i is the file's position in the input. +static string anc_seq_name(const string& seq_name, size_t idx) { + return seq_name + ".hmc" + to_string(idx); +} +// undo the above +static string orig_seq_name(const string& seq_name) { + return seq_name.substr(0, seq_name.rfind(".hmc")); +} - // our output map +// get the dimensions from all genomes in all input files +static pair>> get_hal_dimensions(CLParser* optionsParser, + const vector& hal_paths) { + // genome -> dimensions (covering all input) unordered_map> dimensions; + // to check uniqueness + unordered_set sequence_names; + + string root_name; + for (size_t i = 0; i < hal_paths.size(); ++i) { + const string& hal_path = hal_paths[i]; + + // open the hal file + AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS)); - // for every genome - vector genome_names = alignment->getChildNames(alignment->getRootName()); - genome_names.push_back(alignment->getRootName()); - for (const string& genome_name : genome_names) { - const Genome* genome = alignment->openGenome(genome_name); - vector& genome_dimensions = dimensions[genome_name]; - // for every sequence - for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { - const Sequence *sequence = seqIt->getSequence(); - genome_dimensions.emplace_back(sequence->getName(), - sequence->getSequenceLength(), - sequence->getNumTopSegments(), - sequence->getNumBottomSegments()); + // for every genome + vector genome_names = alignment->getChildNames(alignment->getRootName()); + genome_names.push_back(alignment->getRootName()); + if (root_name.empty()) { + root_name = alignment->getRootName(); + } else if (alignment->getRootName() != root_name) { + throw hal_exception("Root mismatch: " + root_name + " vs " + alignment->getRootName()); + } + for (const string& genome_name : genome_names) { + const Genome* genome = alignment->openGenome(genome_name); + vector& genome_dimensions = dimensions[genome_name]; + // for every sequence + for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { + const Sequence *sequence = seqIt->getSequence(); + // add a little suffix to make ancestral sequences unique + string seq_name = genome->getParent() ? sequence->getName() : anc_seq_name(sequence->getName(), i); + genome_dimensions.emplace_back(seq_name, + sequence->getSequenceLength(), + sequence->getNumTopSegments(), + sequence->getNumBottomSegments()); + string full_name = genome_name + "." + seq_name; + if (sequence_names.count(full_name)) { + throw hal_exception("Duplicate sequence name found: " + full_name); + } else { + sequence_names.insert(full_name); + } + } + alignment->closeGenome(genome); } - alignment->closeGenome(genome); } - - return make_pair(alignment->getRootName(), dimensions); + return make_pair(root_name, dimensions); } // append each input hal to the out_alignment, one after another. all arrays are copied over, @@ -73,6 +102,26 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons assert(in_root->getName() == out_root->getName()); size_t in_root_degree = in_root->getNumChildren(); vector in_genomes = {in_root}; + for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { + in_genomes.push_back(in_alignment->openGenome(in_child_name)); + } + + // copy the dna sequence by sequence + for (const Genome* in_genome : in_genomes) { + cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl; + Genome* out_genome = out_alignment->openGenome(in_genome->getName()); + for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { + const Sequence* in_sequence = in_si->getSequence(); + string out_seq_name = in_genome->getParent() ? in_sequence->getName() : anc_seq_name(in_sequence->getName(), i); + Sequence* out_sequence = out_genome->getSequence(out_seq_name); + DnaIteratorPtr in_di = in_sequence->getDnaIterator(0); + DnaIteratorPtr out_di = out_sequence->getDnaIterator(0); + assert(in_sequence->getSequenceLength() == out_sequence->getSequenceLength()); + string dna; + in_sequence->getString(dna); + out_sequence->setString(dna); + } + } // make a child index map (in -> out) for the root genome // assume : all genomes in in_genome present in out_genome @@ -83,30 +132,40 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons } // copy over the bottom segments of the root + cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i] + << " with bseg offset " << bot_offset << endl; BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0); BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset); for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) { // set the segment in the root genome assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset); + assert(out_bi->bseg()->getNumChildren() >= in_bi->bseg()->getNumChildren()); out_bi->bseg()->setTopParseIndex(NULL_INDEX); // determine the sequence-relative coordinate in the input const Sequence* in_sequence = in_bi->bseg()->getSequence(); int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition(); + assert(in_start_coord >= 0 && in_start_coord < in_sequence->getSequenceLength()); // set the sequence relative coordinate in the output - const Sequence* out_sequence = out_root->getSequence(in_sequence->getName()); + const Sequence* out_sequence = out_root->getSequence(anc_seq_name(in_sequence->getName(), i)); int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; + assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength()); out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength()); - // set the segment in the child genomes for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) { - size_t out_ci = in_ci_to_out_ci[in_ci]; - out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); - out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); + size_t out_ci = in_ci_to_out_ci.at(in_ci); + assert(out_ci < out_bi->bseg()->getNumChildren()); + if (in_bi->bseg()->hasChild(in_ci)) { + out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); + out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); + } else { + out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX); + } } } // for every child genome, copy over the top segments for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { + cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl; const Genome* in_child = in_alignment->openGenome(in_child_name); Genome* out_child = out_alignment->openGenome(in_child_name); @@ -125,7 +184,7 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons const Sequence* in_sequence = in_ti->tseg()->getSequence(); int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition(); // set the sequence relative coordinate in the output - const Sequence* out_sequence = out_root->getSequence(in_sequence->getName()); + const Sequence* out_sequence = out_child->getSequence(in_sequence->getName()); int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength()); // set the paralogy edge @@ -135,19 +194,6 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons out_ti->tseg()->setNextParalogyIndex(NULL_INDEX); } } - in_genomes.push_back(in_child); - } - - // copy the dna sequence by sequence - for (const Genome* in_genome : in_genomes) { - Genome* out_genome = out_alignment->openGenome(in_genome->getName()); - for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { - const Sequence* in_sequence = in_si->getSequence(); - Sequence* out_sequence = out_genome->getSequence(in_sequence->getName()); - string dna; - in_sequence->getString(dna); - out_sequence->setString(dna); - } } // update the offsets to move past the current alignment in all genomes @@ -180,48 +226,23 @@ int main(int argc, char** argv) { vector in_paths = chopString(in_hal_paths, ","); // map genome -> dimensions for each input alignment - vector>> dimensions_list; - string root_name; - for (const string& in_path : in_paths) { - auto rd = get_hal_dimensions(&optionsParser, in_path); - dimensions_list.push_back(rd.second); - if (root_name.empty()) { - root_name = rd.first; - assert(in_path == in_paths[0]); - } else if (rd.first != root_name) { - throw hal_exception("Root mismatch: " + rd.first + " vs " + root_name); - } - } - - // check uniqueness and sum up - // todo: resolve ancestor name collisions - unordered_set sequence_names; - unordered_map> total_dimensions; - for (const unordered_map>& dimensions : dimensions_list) { - for (const auto& kv : dimensions) { - for (const Sequence::Info& sequence_info : kv.second) { - string name = kv.first + "." + sequence_info._name; - if (sequence_names.count(name)) { - throw hal_exception("Conflict: sequence name found in more than one file: " + name); - } - sequence_names.insert(name); - } - total_dimensions[kv.first].insert(total_dimensions[kv.first].begin(), kv.second.begin(), kv.second.end()); - } - } + pair>> rd = get_hal_dimensions(&optionsParser, in_paths); + string& root_name = rd.first; + unordered_map>& dimensions = rd.second; // create the new file AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS)); - // set up the size of each genome + // set up the size of each genome, staring with the root Genome* root_genome = alignment->addRootGenome(root_name); - root_genome->setDimensions(total_dimensions.at(root_name)); - for (const auto& gd : total_dimensions) { - if (gd.first != root_name) { - Genome* leaf_genome = alignment->addLeafGenome(gd.first, root_name, 1); - leaf_genome->setDimensions(gd.second); + for (auto& kv : dimensions) { + if (kv.first != root_name) { + Genome* leaf_genome = alignment->addLeafGenome(kv.first, root_name, 1); + leaf_genome->setDimensions(kv.second); } } + // important to set root dimensions after adding leaves so bottom segments have right number of slots + root_genome->setDimensions(dimensions.at(root_name)); // copy over over everything merge_hals(&optionsParser, alignment, in_paths); From bd4aa1fcf22fe7028a01fb09f8467b2e8a84823e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Mar 2021 20:09:39 -0400 Subject: [PATCH 3/6] add tests --- halMergeChroms.cpp | 40 +++++++++++++++++++++++++++++++--------- tests/small/small2.maf | 15 +++++++++++++++ tests/t/merge.t | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 tests/small/small2.maf create mode 100644 tests/t/merge.t diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp index 561c230..74d817e 100644 --- a/halMergeChroms.cpp +++ b/halMergeChroms.cpp @@ -4,7 +4,7 @@ * Released under the MIT license, see LICENSE.txt */ -// Merge chromosome HAL files into one big one. Only star trees supported, with same root name supported. +// Merge chromosome HAL files into one big one. Only star trees with same root name supported (ie what comes out of cactus-align-batch). //#define debug @@ -25,6 +25,9 @@ using namespace hal; static void initParser(CLParser* optionsParser) { optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge"); optionsParser->addArgument("outFile", "output HAL file"); + optionsParser->addOptionFlag("progress", + "show progress", + false); optionsParser->setDescription("Merge chromosome HALs into combined file. Ancestral sequences are renamed as needed to avoid conflicts" ". Star trees only"); } @@ -89,7 +92,7 @@ static pair>> get_hal_dimen // append each input hal to the out_alignment, one after another. all arrays are copied over, // but need to be adjsuted to reflect their new offsets. -static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector& in_paths) { +static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector& in_paths, bool progress) { // keep track of where we are in the output vector top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0); @@ -101,6 +104,7 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons Genome* out_root = out_alignment->openGenome(in_alignment->getRootName()); assert(in_root->getName() == out_root->getName()); size_t in_root_degree = in_root->getNumChildren(); + size_t out_root_degree = out_root->getNumChildren(); vector in_genomes = {in_root}; for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { in_genomes.push_back(in_alignment->openGenome(in_child_name)); @@ -108,7 +112,9 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons // copy the dna sequence by sequence for (const Genome* in_genome : in_genomes) { - cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl; + if (progress) { + cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl; + } Genome* out_genome = out_alignment->openGenome(in_genome->getName()); for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { const Sequence* in_sequence = in_si->getSequence(); @@ -132,8 +138,10 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons } // copy over the bottom segments of the root - cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i] - << " with bseg offset " << bot_offset << endl; + if (progress) { + cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i] + << " with bseg offset " << bot_offset << endl; + } BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0); BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset); for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) { @@ -151,21 +159,24 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength()); out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength()); // set the segment in the child genomes + for (size_t out_ci = 0; out_ci < out_root_degree; ++out_ci) { + out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX); + } for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) { size_t out_ci = in_ci_to_out_ci.at(in_ci); assert(out_ci < out_bi->bseg()->getNumChildren()); if (in_bi->bseg()->hasChild(in_ci)) { out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); - } else { - out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX); } } } // for every child genome, copy over the top segments for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { - cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl; + if (progress) { + cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl; + } const Genome* in_child = in_alignment->openGenome(in_child_name); Genome* out_child = out_alignment->openGenome(in_child_name); @@ -212,10 +223,12 @@ int main(int argc, char** argv) { initParser(&optionsParser); string in_hal_paths; string out_hal_path; + bool progress; try { optionsParser.parseOptions(argc, argv); in_hal_paths = optionsParser.getArgument("inFiles"); out_hal_path = optionsParser.getArgument("outFile"); + progress = optionsParser.getFlag("progress"); } catch(exception& e) { cerr << e.what() << endl; @@ -226,11 +239,17 @@ int main(int argc, char** argv) { vector in_paths = chopString(in_hal_paths, ","); // map genome -> dimensions for each input alignment + if (progress) { + cerr << "[halMergeChroms]: Scanning dimensions of " << in_paths.size() << " input files." << endl; + } pair>> rd = get_hal_dimensions(&optionsParser, in_paths); string& root_name = rd.first; unordered_map>& dimensions = rd.second; // create the new file + if (progress) { + cerr << "[halMergeChroms]: Creating empty alignment: " << out_hal_path << endl; + } AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS)); // set up the size of each genome, staring with the root @@ -245,8 +264,11 @@ int main(int argc, char** argv) { root_genome->setDimensions(dimensions.at(root_name)); // copy over over everything - merge_hals(&optionsParser, alignment, in_paths); + merge_hals(&optionsParser, alignment, in_paths, progress); + if (progress) { + cerr << "[halMergeChroms]: Writing merged alignment" << endl; + } return 0; } diff --git a/tests/small/small2.maf b/tests/small/small2.maf new file mode 100644 index 0000000..4688386 --- /dev/null +++ b/tests/small/small2.maf @@ -0,0 +1,15 @@ +##maf version=1 + +# SNP +a score=0 mafExtractor_splicedBlock=true splice_id=1_0 +s human.1 0 3 + 10 GCA +s chimp.3 0 3 + 8 GCA + +# Indel and strand change +a score=0 mafExtractor_splicedBlock=true splice_id=1_0 +s human.1 3 7 + 10 GCAGAAT +s chimp.3 3 5 + 8 GCAG--T +s cow.3 0 4 - 7 --A-AAT + + + diff --git a/tests/t/merge.t b/tests/t/merge.t new file mode 100644 index 0000000..4573b51 --- /dev/null +++ b/tests/t/merge.t @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +BASH_TAP_ROOT=./bash-tap +. ${BASH_TAP_ROOT}/bash-tap-bootstrap + +PATH=../bin:$PATH +PATH=../deps/hal:$PATH + +plan tests 4 + +maf2hal small/small.maf small.hal +maf2hal small/small2.maf small2.hal +halMergeChroms small.hal,small2.hal merged1.hal +hal2vg small.hal --noAncestors | vg ids -s - > small.vg +hal2vg small2.hal --noAncestors | vg ids -s - > small2.vg +hal2vg merged1.hal --noAncestors | vg ids -s - > merged1.vg +vg view small.vg > small.gfa +vg view small2.vg > small2.gfa +vg find -x merged1.vg -p cat.3:1 -c 1000 | vg view - > merged1.comp1.gfa +vg find -x merged1.vg -p cow.3:1 -c 1000 | vg view - > merged1.comp2.gfa +diff small.gfa merged1.comp1.gfa +is $? 0 "First component of merged graph identical to first input graph" +diff small2.gfa merged1.comp2.gfa +is $? 0 "Second component of merged graph identical to second input graph" + +# repeat with merge in different order +halMergeChroms small2.hal,small.hal merged2.hal +hal2vg merged2.hal --noAncestors | vg ids -s - > merged2.vg +vg find -x merged2.vg -p cat.3:1 -c 1000 | vg view - > merged2.comp1.gfa +vg find -x merged2.vg -p cow.3:1 -c 1000 | vg view - > merged2.comp2.gfa +diff small.gfa merged1.comp1.gfa +is $? 0 "First component of merged graph2 identical to first input graph" +diff small2.gfa merged1.comp2.gfa +is $? 0 "Second component of merged graph2 identical to second input graph" + +rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa +rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa +rm -f merged2.hal merged2.vg merged2.comp1.gfa merged2.comp2.gfa From 92247aa255d83d8d99cd5c88f03b1314bd9c57fc Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Mar 2021 21:11:11 -0400 Subject: [PATCH 4/6] fix tests --- halMergeChroms.cpp | 7 ++++- tests/t/merge.t | 67 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp index 74d817e..3a02092 100644 --- a/halMergeChroms.cpp +++ b/halMergeChroms.cpp @@ -189,7 +189,12 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) { // set the segment in the child genome assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset); - out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset); + if (in_ti->tseg()->hasParent()) { + out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset); + out_ti->tseg()->setParentReversed(in_ti->tseg()->getParentReversed()); + } else { + out_ti->tseg()->setParentIndex(NULL_INDEX); + } out_ti->tseg()->setBottomParseIndex(NULL_INDEX); // determine the sequence-relative coordinate in the input const Sequence* in_sequence = in_ti->tseg()->getSequence(); diff --git a/tests/t/merge.t b/tests/t/merge.t index 4573b51..1426bab 100644 --- a/tests/t/merge.t +++ b/tests/t/merge.t @@ -6,33 +6,68 @@ BASH_TAP_ROOT=./bash-tap PATH=../bin:$PATH PATH=../deps/hal:$PATH -plan tests 4 +plan tests 10 maf2hal small/small.maf small.hal maf2hal small/small2.maf small2.hal halMergeChroms small.hal,small2.hal merged1.hal -hal2vg small.hal --noAncestors | vg ids -s - > small.vg -hal2vg small2.hal --noAncestors | vg ids -s - > small2.vg -hal2vg merged1.hal --noAncestors | vg ids -s - > merged1.vg -vg view small.vg > small.gfa -vg view small2.vg > small2.gfa -vg find -x merged1.vg -p cat.3:1 -c 1000 | vg view - > merged1.comp1.gfa -vg find -x merged1.vg -p cow.3:1 -c 1000 | vg view - > merged1.comp2.gfa +halValidate merged1.hal +is $? 0 "halMergeChroms produces valid hal" +hal2fasta small.hal chimp > chimp.fa +hal2fasta small2.hal chimp >> chimp.fa +hal2fasta merged1.hal chimp > chimp.merge.fa +diff chimp.fa chimp.merge.fa +is $? 0 "halMergeChroms preserves chimp sequence" +hal2fasta small.hal cat > cat.fa +hal2fasta merged1.hal cat > cat.merge.fa +diff cat.fa cat.merge.fa +is $? 0 "halMergeChroms preserves cat sequence" +hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg +hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg +hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg +vg view small.vg | sort > small.gfa +vg view small2.vg | sort > small2.gfa +vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa +vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa is $? 0 "Second component of merged graph identical to second input graph" -# repeat with merge in different order -halMergeChroms small2.hal,small.hal merged2.hal -hal2vg merged2.hal --noAncestors | vg ids -s - > merged2.vg -vg find -x merged2.vg -p cat.3:1 -c 1000 | vg view - > merged2.comp1.gfa -vg find -x merged2.vg -p cow.3:1 -c 1000 | vg view - > merged2.comp2.gfa +rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa +rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa +rm -f chimp.fa chimp.merge.fa +rm -f cat.fa cat.merge.fa + +### copy paste above but change order ### + +maf2hal small/small.maf small.hal +maf2hal small/small2.maf small2.hal +halMergeChroms small2.hal,small.hal merged1.hal +halValidate merged1.hal +is $? 0 "halMergeChroms produces valid hal" +hal2fasta small2.hal chimp > chimp.fa +hal2fasta small.hal chimp >> chimp.fa +hal2fasta merged1.hal chimp > chimp.merge.fa +diff chimp.fa chimp.merge.fa +is $? 0 "halMergeChroms preserves chimp sequence" +hal2fasta small.hal cat > cat.fa +hal2fasta merged1.hal cat > cat.merge.fa +diff cat.fa cat.merge.fa +is $? 0 "halMergeChroms preserves cat sequence" +hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg +hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg +hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg +vg view small.vg | sort > small.gfa +vg view small2.vg | sort > small2.gfa +vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa +vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa -is $? 0 "First component of merged graph2 identical to first input graph" +is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa -is $? 0 "Second component of merged graph2 identical to second input graph" +is $? 0 "Second component of merged graph identical to second input graph" rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa -rm -f merged2.hal merged2.vg merged2.comp1.gfa merged2.comp2.gfa +rm -f chimp.fa chimp.merge.fa +rm -f cat.fa cat.merge.fa From 057565ab8efe0e3440e93ad08a957480e3033917 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Mar 2021 21:12:07 -0400 Subject: [PATCH 5/6] add halMergeChroms to bin release --- build-tools/makeBinRelease | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease index 2dec00a..2dc3897 100755 --- a/build-tools/makeBinRelease +++ b/build-tools/makeBinRelease @@ -37,5 +37,5 @@ else make check-static fi -cp hal2vg clip-vg halRemoveDupes ${buildDir}/ +cp hal2vg clip-vg halRemoveDupes halMergeChroms ${buildDir}/ From 8a3a99c068b95da847c207384cddcda3539afdf4 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 30 Mar 2021 08:38:03 -0400 Subject: [PATCH 6/6] just use underscore for names --- halMergeChroms.cpp | 8 ++------ tests/t/merge.t | 8 ++++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp index 3a02092..e849662 100644 --- a/halMergeChroms.cpp +++ b/halMergeChroms.cpp @@ -33,13 +33,9 @@ static void initParser(CLParser* optionsParser) { } // we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding -// .i to them where i is the file's position in the input. +// _i to them where i is the file's position in the input. static string anc_seq_name(const string& seq_name, size_t idx) { - return seq_name + ".hmc" + to_string(idx); -} -// undo the above -static string orig_seq_name(const string& seq_name) { - return seq_name.substr(0, seq_name.rfind(".hmc")); + return seq_name + "_" + to_string(idx); } // get the dimensions from all genomes in all input files diff --git a/tests/t/merge.t b/tests/t/merge.t index 1426bab..19fda86 100644 --- a/tests/t/merge.t +++ b/tests/t/merge.t @@ -27,8 +27,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg vg view small.vg | sort > small.gfa vg view small2.vg | sort > small2.gfa -vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa -vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa +vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa +vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa @@ -60,8 +60,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg vg view small.vg | sort > small.gfa vg view small2.vg | sort > small2.gfa -vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa -vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa +vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa +vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa diff small.gfa merged1.comp1.gfa is $? 0 "First component of merged graph identical to first input graph" diff small2.gfa merged1.comp2.gfa