From f4de368acaf021cee97eec212c217e31a6d82f3d Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Mar 2021 15:48:19 -0400
Subject: [PATCH 1/6] add halMergeChroms

---
 Makefile           |  18 +++-
 halMergeChroms.cpp | 232 +++++++++++++++++++++++++++++++++++++++++++++
 halRemoveDupes.cpp |   3 -
 3 files changed, 247 insertions(+), 6 deletions(-)
 create mode 100644 halMergeChroms.cpp
diff --git a/Makefile b/Makefile
index 5a8f5a5..bf0445d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 rootPath = ./
 include ./include.mk
 
-all : hal2vg clip-vg halRemoveDupes
+all : hal2vg clip-vg halRemoveDupes halMergeChroms
 
 # Note: hdf5 from apt doesn't seem to work for static builds.  It should be installed
 # from source and configured with "--enable-static --disable-shared", then have its
@@ -28,12 +28,18 @@ ifeq ($(shell ldd halRemoveDupes | grep "not a dynamic" | wc -l), $(shell ls hal
 else
 	$(error ldd found dnymaic linked dependency in halRemoveDupes)
 endif
+ifeq ($(shell ldd halMergeChroms | grep "not a dynamic" | wc -l), $(shell ls halMergeChroms | wc -l))
+	$(info ldd verified that halMergeChroms static)
+else
+	$(error ldd found dnymaic linked dependency in halMergeChroms)
+endif
+
 
 cleanFast : 
-	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o
+	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o
 
 clean :
-	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o
+	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o
 	cd deps/sonLib && make clean
 	cd deps/pinchesAndCacti && make clean
 	cd deps/hal && make clean
@@ -69,6 +75,12 @@ halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies}
 halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies}
 	${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o  ${basicLibs}  -o halRemoveDupes
 
+halMergeChroms.o : halMergeChroms.cpp ${basicLibsDependencies}
+	${cpp} ${CXXFLAGS} -I . halMergeChroms.cpp -c
+
+halMergeChroms : halMergeChroms.o ${basicLibsDependencies}
+	${cpp} ${CXXFLAGS} -fopenmp -pthread halMergeChroms.o  ${basicLibs}  -o halMergeChroms
+
 test :
 	make
 	cd tests && prove -v t
diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp
new file mode 100644
index 0000000..08519de
--- /dev/null
+++ b/halMergeChroms.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu)
+ *
+ * Released under the MIT license, see LICENSE.txt
+ */
+
+// Merge chromosome HAL files into one big one.  Only star trees supported, with same root name supported. 
+
+//#define debug
+
+#include <cstdlib>
+#include <cstdlib>
+#include <iostream>
+#include <cassert>
+#include <fstream>
+#include <deque>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "hal.h"
+
+using namespace std;
+using namespace hal;
+
+static void initParser(CLParser* optionsParser) {
+    optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge");
+    optionsParser->addArgument("outFile", "output HAL file");
+    optionsParser->setDescription("Merge chromosome HALs into combined file.  Ancestral sequences are renamed as needed to avoid conflicts"
+        ". Star trees only");
+}
+
+// get the dimensions from a file
+static pair<string, unordered_map<string, vector<Sequence::Info>>> get_hal_dimensions(CLParser* optionsParser, const string& hal_path) {
+
+    // open the hal file
+    AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS));
+
+    // our output map
+    unordered_map<string, vector<Sequence::Info>> dimensions;
+    
+    // for every genome
+    vector<string> genome_names = alignment->getChildNames(alignment->getRootName());
+    genome_names.push_back(alignment->getRootName());
+    for (const string& genome_name : genome_names) {
+        const Genome* genome = alignment->openGenome(genome_name);
+        vector<Sequence::Info>& genome_dimensions = dimensions[genome_name];
+        // for every sequence
+        for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
+            const Sequence *sequence = seqIt->getSequence();
+            genome_dimensions.emplace_back(sequence->getName(),
+                                           sequence->getSequenceLength(),
+                                           sequence->getNumTopSegments(),
+                                           sequence->getNumBottomSegments());
+        }
+        alignment->closeGenome(genome);        
+    }
+
+    return make_pair(alignment->getRootName(), dimensions);
+}
+
+// append each input hal to the out_alignment, one after another.  all arrays are copied over,
+// but need to be adjsuted to reflect their new offsets.
+static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector<string>& in_paths) {
+
+    // keep track of where we are in the output    
+    vector<size_t> top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0);
+    size_t bot_offset = 0;
+    
+    for (size_t i = 0; i < in_paths.size(); ++i) {
+        AlignmentConstPtr in_alignment(openHalAlignment(in_paths[i], optionsParser, READ_ACCESS));
+        const Genome* in_root = in_alignment->openGenome(in_alignment->getRootName());
+        Genome* out_root = out_alignment->openGenome(in_alignment->getRootName());
+        assert(in_root->getName() == out_root->getName());
+        size_t in_root_degree = in_root->getNumChildren();
+        vector<const Genome*> in_genomes = {in_root};
+
+        // make a child index map (in -> out) for the root genome
+        // assume : all genomes in in_genome present in out_genome
+        vector<size_t> in_ci_to_out_ci(in_root->getNumChildren());
+        for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
+            in_ci_to_out_ci.at(in_root->getChildIndex(in_alignment->openGenome(in_child_name))) =
+                out_root->getChildIndex(out_alignment->openGenome(in_child_name));
+        }
+
+        // copy over the bottom segments of the root
+        BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0);
+        BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset);
+        for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) {
+            // set the segment in the root genome
+            assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset);
+            out_bi->bseg()->setTopParseIndex(NULL_INDEX);
+            // determine the sequence-relative coordinate in the input
+            const Sequence* in_sequence = in_bi->bseg()->getSequence();
+            int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition();
+            // set the sequence relative coordinate in the output
+            const Sequence* out_sequence = out_root->getSequence(in_sequence->getName());
+            int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
+            out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength());
+
+            // set the segment in the child genomes
+            for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) {
+                size_t out_ci = in_ci_to_out_ci[in_ci];
+                out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]);
+                out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci));
+            }
+        }
+
+        // for every child genome, copy over the top segments
+        for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
+            const Genome* in_child = in_alignment->openGenome(in_child_name);
+            Genome* out_child = out_alignment->openGenome(in_child_name);
+            
+            size_t in_ci = in_root->getChildIndex(in_child);
+            size_t out_ci = in_ci_to_out_ci[in_ci];
+            size_t top_offset = top_offsets[out_ci];
+            TopSegmentIteratorPtr in_ti = in_child->getTopSegmentIterator(0);
+            TopSegmentIteratorPtr out_ti = out_child->getTopSegmentIterator(top_offsets[out_ci]);
+
+            for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) {
+                // set the segment in the child genome
+                assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset);
+                out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset);
+                out_ti->tseg()->setBottomParseIndex(NULL_INDEX);
+                // determine the sequence-relative coordinate in the input
+                const Sequence* in_sequence = in_ti->tseg()->getSequence();
+                int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition();
+                // set the sequence relative coordinate in the output
+                const Sequence* out_sequence = out_root->getSequence(in_sequence->getName());
+                int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
+                out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength());
+                // set the paralogy edge
+                if (in_ti->tseg()->hasNextParalogy()) {
+                    out_ti->tseg()->setNextParalogyIndex(in_ti->tseg()->getNextParalogyIndex() + top_offset);
+                } else {
+                    out_ti->tseg()->setNextParalogyIndex(NULL_INDEX);
+                }
+            }
+            in_genomes.push_back(in_child);
+        }
+
+        // copy the dna sequence by sequence
+        for (const Genome* in_genome : in_genomes) {
+            Genome* out_genome = out_alignment->openGenome(in_genome->getName());
+            for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) {
+                const Sequence* in_sequence = in_si->getSequence();
+                Sequence* out_sequence = out_genome->getSequence(in_sequence->getName());
+                string dna;
+                in_sequence->getString(dna);
+                out_sequence->setString(dna);
+            }
+        }
+
+        // update the offsets to move past the current alignment in all genomes
+        bot_offset += in_root->getNumBottomSegments();
+        for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
+            const Genome* in_child = in_alignment->openGenome(in_child_name);
+            size_t in_ci = in_root->getChildIndex(in_child);
+            size_t out_ci = in_ci_to_out_ci[in_ci];
+            top_offsets[out_ci] += in_child->getNumTopSegments();
+        }
+    }
+}
+    
+int main(int argc, char** argv) {
+    CLParser optionsParser(WRITE_ACCESS);
+    initParser(&optionsParser);
+    string in_hal_paths;
+    string out_hal_path;
+    try {
+        optionsParser.parseOptions(argc, argv);
+        in_hal_paths = optionsParser.getArgument<string>("inFiles");
+        out_hal_path = optionsParser.getArgument<string>("outFile");
+    }
+    catch(exception& e) {
+        cerr << e.what() << endl;
+        optionsParser.printUsage(cerr);
+        exit(1);
+    }
+
+    vector<string> in_paths = chopString(in_hal_paths, ",");
+    
+    // map genome -> dimensions for each input alignment
+    vector<unordered_map<string, vector<Sequence::Info>>> dimensions_list;
+    string root_name;
+    for (const string& in_path : in_paths) {
+        auto rd = get_hal_dimensions(&optionsParser, in_path);
+        dimensions_list.push_back(rd.second);
+        if (root_name.empty()) {
+            root_name = rd.first;
+            assert(in_path == in_paths[0]);
+        } else if (rd.first != root_name) {
+            throw hal_exception("Root mismatch: " + rd.first + " vs " + root_name);
+        }
+    }
+
+    // check uniqueness and sum up
+    // todo: resolve ancestor name collisions    
+    unordered_set<string> sequence_names;
+    unordered_map<string, vector<Sequence::Info>> total_dimensions;    
+    for (const unordered_map<string, vector<Sequence::Info>>& dimensions : dimensions_list) {
+        for (const auto& kv : dimensions) {
+            for (const Sequence::Info& sequence_info : kv.second) {
+                string name = kv.first + "." + sequence_info._name;
+                if (sequence_names.count(name)) {
+                    throw hal_exception("Conflict: sequence name found in more than one file: " + name);
+                }
+                sequence_names.insert(name);
+            }
+            total_dimensions[kv.first].insert(total_dimensions[kv.first].begin(), kv.second.begin(), kv.second.end());
+        }
+    }
+
+    // create the new file
+    AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));
+
+    // set up the size of each genome
+    Genome* root_genome = alignment->addRootGenome(root_name);
+    root_genome->setDimensions(total_dimensions.at(root_name));    
+    for (const auto& gd : total_dimensions) {
+        if (gd.first != root_name) {
+            Genome* leaf_genome = alignment->addLeafGenome(gd.first, root_name, 1);
+            leaf_genome->setDimensions(gd.second);
+        }
+    }
+
+    // copy over over everything
+    merge_hals(&optionsParser, alignment, in_paths);
+
+     
+    return 0;
+}
+
diff --git a/halRemoveDupes.cpp b/halRemoveDupes.cpp
index 3cd10cd..e0880d5 100644
--- a/halRemoveDupes.cpp
+++ b/halRemoveDupes.cpp
@@ -4,9 +4,6 @@
  * Released under the MIT license, see LICENSE.txt
  */
 
-// This file was created by merging hal2sg.cpp and sg2vg.cpp with
-// a small amount of glue for the interface. 
-
 //#define debug
 
 #include <cstdlib>

From af1fd0a2a7e62c19a94cba03839082f560458766 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Mar 2021 19:37:57 -0400
Subject: [PATCH 2/6] get halMergeChroms running

---
 halMergeChroms.cpp | 173 +++++++++++++++++++++++++--------------------
 1 file changed, 97 insertions(+), 76 deletions(-)

diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp
index 08519de..561c230 100644
--- a/halMergeChroms.cpp
+++ b/halMergeChroms.cpp
@@ -29,33 +29,62 @@ static void initParser(CLParser* optionsParser) {
         ". Star trees only");
 }
 
-// get the dimensions from a file
-static pair<string, unordered_map<string, vector<Sequence::Info>>> get_hal_dimensions(CLParser* optionsParser, const string& hal_path) {
-
-    // open the hal file
-    AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS));
+// we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding
+// .i to them where i is the file's position in the input.
+static string anc_seq_name(const string& seq_name, size_t idx) {
+    return seq_name + ".hmc" + to_string(idx);
+}
+// undo the above
+static string orig_seq_name(const string& seq_name) {
+    return seq_name.substr(0, seq_name.rfind(".hmc"));
+}
 
-    // our output map
+// get the dimensions from all genomes in all input files
+static pair<string, unordered_map<string, vector<Sequence::Info>>> get_hal_dimensions(CLParser* optionsParser,
+                                                                                      const vector<string>& hal_paths) {
+    // genome -> dimensions (covering all input)
     unordered_map<string, vector<Sequence::Info>> dimensions;
+    // to check uniqueness
+    unordered_set<string> sequence_names;
+
+    string root_name;
+    for (size_t i = 0; i < hal_paths.size(); ++i) {
+        const string& hal_path = hal_paths[i];
+        
+        // open the hal file
+        AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS));
     
-    // for every genome
-    vector<string> genome_names = alignment->getChildNames(alignment->getRootName());
-    genome_names.push_back(alignment->getRootName());
-    for (const string& genome_name : genome_names) {
-        const Genome* genome = alignment->openGenome(genome_name);
-        vector<Sequence::Info>& genome_dimensions = dimensions[genome_name];
-        // for every sequence
-        for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
-            const Sequence *sequence = seqIt->getSequence();
-            genome_dimensions.emplace_back(sequence->getName(),
-                                           sequence->getSequenceLength(),
-                                           sequence->getNumTopSegments(),
-                                           sequence->getNumBottomSegments());
+        // for every genome
+        vector<string> genome_names = alignment->getChildNames(alignment->getRootName());
+        genome_names.push_back(alignment->getRootName());
+        if (root_name.empty()) {
+            root_name = alignment->getRootName();
+        } else if (alignment->getRootName() != root_name) {
+            throw hal_exception("Root mismatch: " + root_name + " vs " + alignment->getRootName());        
+        }
+        for (const string& genome_name : genome_names) {
+            const Genome* genome = alignment->openGenome(genome_name);
+            vector<Sequence::Info>& genome_dimensions = dimensions[genome_name];
+            // for every sequence
+            for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
+                const Sequence *sequence = seqIt->getSequence();
+                // add a little suffix to make ancestral sequences unique
+                string seq_name = genome->getParent() ? sequence->getName() : anc_seq_name(sequence->getName(), i);
+                genome_dimensions.emplace_back(seq_name,
+                                               sequence->getSequenceLength(),
+                                               sequence->getNumTopSegments(),
+                                               sequence->getNumBottomSegments());
+                string full_name = genome_name + "." + seq_name;
+                if (sequence_names.count(full_name)) {
+                    throw hal_exception("Duplicate sequence name found: " + full_name);
+                } else {
+                    sequence_names.insert(full_name);
+                }
+            }
+            alignment->closeGenome(genome);        
         }
-        alignment->closeGenome(genome);        
     }
-
-    return make_pair(alignment->getRootName(), dimensions);
+    return make_pair(root_name, dimensions);
 }
 
 // append each input hal to the out_alignment, one after another.  all arrays are copied over,
@@ -73,6 +102,26 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
         assert(in_root->getName() == out_root->getName());
         size_t in_root_degree = in_root->getNumChildren();
         vector<const Genome*> in_genomes = {in_root};
+        for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
+            in_genomes.push_back(in_alignment->openGenome(in_child_name));
+        }
+
+        // copy the dna sequence by sequence
+        for (const Genome* in_genome : in_genomes) {
+            cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl;
+            Genome* out_genome = out_alignment->openGenome(in_genome->getName());
+            for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) {
+                const Sequence* in_sequence = in_si->getSequence();
+                string out_seq_name = in_genome->getParent() ? in_sequence->getName() : anc_seq_name(in_sequence->getName(), i);
+                Sequence* out_sequence = out_genome->getSequence(out_seq_name);
+                DnaIteratorPtr in_di = in_sequence->getDnaIterator(0);
+                DnaIteratorPtr out_di = out_sequence->getDnaIterator(0);
+                assert(in_sequence->getSequenceLength() == out_sequence->getSequenceLength());
+                string dna;
+                in_sequence->getString(dna);
+                out_sequence->setString(dna);
+            }
+        }
 
         // make a child index map (in -> out) for the root genome
         // assume : all genomes in in_genome present in out_genome
@@ -83,30 +132,40 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
         }
 
         // copy over the bottom segments of the root
+        cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i]
+             << " with bseg offset " << bot_offset << endl;
         BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0);
         BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset);
         for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) {
             // set the segment in the root genome
             assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset);
+            assert(out_bi->bseg()->getNumChildren() >= in_bi->bseg()->getNumChildren());
             out_bi->bseg()->setTopParseIndex(NULL_INDEX);
             // determine the sequence-relative coordinate in the input
             const Sequence* in_sequence = in_bi->bseg()->getSequence();
             int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition();
+            assert(in_start_coord >= 0 && in_start_coord < in_sequence->getSequenceLength());
             // set the sequence relative coordinate in the output
-            const Sequence* out_sequence = out_root->getSequence(in_sequence->getName());
+            const Sequence* out_sequence = out_root->getSequence(anc_seq_name(in_sequence->getName(), i));
             int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
+            assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength());
             out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength());
-
             // set the segment in the child genomes
             for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) {
-                size_t out_ci = in_ci_to_out_ci[in_ci];
-                out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]);
-                out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci));
+                size_t out_ci = in_ci_to_out_ci.at(in_ci);
+                assert(out_ci < out_bi->bseg()->getNumChildren());
+                if (in_bi->bseg()->hasChild(in_ci)) {
+                    out_bi->bseg()->setChildIndex(out_ci,  in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]);
+                    out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci));
+                } else {
+                    out_bi->bseg()->setChildIndex(out_ci,  NULL_INDEX);
+                }
             }
         }
 
         // for every child genome, copy over the top segments
         for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
+            cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl;
             const Genome* in_child = in_alignment->openGenome(in_child_name);
             Genome* out_child = out_alignment->openGenome(in_child_name);
             
@@ -125,7 +184,7 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
                 const Sequence* in_sequence = in_ti->tseg()->getSequence();
                 int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition();
                 // set the sequence relative coordinate in the output
-                const Sequence* out_sequence = out_root->getSequence(in_sequence->getName());
+                const Sequence* out_sequence = out_child->getSequence(in_sequence->getName());
                 int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
                 out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength());
                 // set the paralogy edge
@@ -135,19 +194,6 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
                     out_ti->tseg()->setNextParalogyIndex(NULL_INDEX);
                 }
             }
-            in_genomes.push_back(in_child);
-        }
-
-        // copy the dna sequence by sequence
-        for (const Genome* in_genome : in_genomes) {
-            Genome* out_genome = out_alignment->openGenome(in_genome->getName());
-            for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) {
-                const Sequence* in_sequence = in_si->getSequence();
-                Sequence* out_sequence = out_genome->getSequence(in_sequence->getName());
-                string dna;
-                in_sequence->getString(dna);
-                out_sequence->setString(dna);
-            }
         }
 
         // update the offsets to move past the current alignment in all genomes
@@ -180,48 +226,23 @@ int main(int argc, char** argv) {
     vector<string> in_paths = chopString(in_hal_paths, ",");
     
     // map genome -> dimensions for each input alignment
-    vector<unordered_map<string, vector<Sequence::Info>>> dimensions_list;
-    string root_name;
-    for (const string& in_path : in_paths) {
-        auto rd = get_hal_dimensions(&optionsParser, in_path);
-        dimensions_list.push_back(rd.second);
-        if (root_name.empty()) {
-            root_name = rd.first;
-            assert(in_path == in_paths[0]);
-        } else if (rd.first != root_name) {
-            throw hal_exception("Root mismatch: " + rd.first + " vs " + root_name);
-        }
-    }
-
-    // check uniqueness and sum up
-    // todo: resolve ancestor name collisions    
-    unordered_set<string> sequence_names;
-    unordered_map<string, vector<Sequence::Info>> total_dimensions;    
-    for (const unordered_map<string, vector<Sequence::Info>>& dimensions : dimensions_list) {
-        for (const auto& kv : dimensions) {
-            for (const Sequence::Info& sequence_info : kv.second) {
-                string name = kv.first + "." + sequence_info._name;
-                if (sequence_names.count(name)) {
-                    throw hal_exception("Conflict: sequence name found in more than one file: " + name);
-                }
-                sequence_names.insert(name);
-            }
-            total_dimensions[kv.first].insert(total_dimensions[kv.first].begin(), kv.second.begin(), kv.second.end());
-        }
-    }
+    pair<string, unordered_map<string, vector<Sequence::Info>>> rd = get_hal_dimensions(&optionsParser, in_paths);
+    string& root_name = rd.first;
+    unordered_map<string, vector<Sequence::Info>>& dimensions = rd.second;
 
     // create the new file
     AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));
 
-    // set up the size of each genome
+    // set up the size of each genome, staring with the root
     Genome* root_genome = alignment->addRootGenome(root_name);
-    root_genome->setDimensions(total_dimensions.at(root_name));    
-    for (const auto& gd : total_dimensions) {
-        if (gd.first != root_name) {
-            Genome* leaf_genome = alignment->addLeafGenome(gd.first, root_name, 1);
-            leaf_genome->setDimensions(gd.second);
+    for (auto& kv : dimensions) {
+        if (kv.first != root_name) {
+            Genome* leaf_genome = alignment->addLeafGenome(kv.first, root_name, 1);
+            leaf_genome->setDimensions(kv.second);
         }
     }
+    // important to set root dimensions after adding leaves so bottom segments have right number of slots
+    root_genome->setDimensions(dimensions.at(root_name));
 
     // copy over over everything
     merge_hals(&optionsParser, alignment, in_paths);

From bd4aa1fcf22fe7028a01fb09f8467b2e8a84823e Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Mar 2021 20:09:39 -0400
Subject: [PATCH 3/6] add tests

---
 halMergeChroms.cpp     | 40 +++++++++++++++++++++++++++++++---------
 tests/small/small2.maf | 15 +++++++++++++++
 tests/t/merge.t        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 9 deletions(-)
 create mode 100644 tests/small/small2.maf
 create mode 100644 tests/t/merge.t

diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp
index 561c230..74d817e 100644
--- a/halMergeChroms.cpp
+++ b/halMergeChroms.cpp
@@ -4,7 +4,7 @@
  * Released under the MIT license, see LICENSE.txt
  */
 
-// Merge chromosome HAL files into one big one.  Only star trees supported, with same root name supported. 
+// Merge chromosome HAL files into one big one.  Only star trees with same root name supported (ie what comes out of cactus-align-batch). 
 
 //#define debug
 
@@ -25,6 +25,9 @@ using namespace hal;
 static void initParser(CLParser* optionsParser) {
     optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge");
     optionsParser->addArgument("outFile", "output HAL file");
+    optionsParser->addOptionFlag("progress",
+                                 "show progress",
+                                 false);
     optionsParser->setDescription("Merge chromosome HALs into combined file.  Ancestral sequences are renamed as needed to avoid conflicts"
         ". Star trees only");
 }
@@ -89,7 +92,7 @@ static pair<string, unordered_map<string, vector<Sequence::Info>>> get_hal_dimen
 
 // append each input hal to the out_alignment, one after another.  all arrays are copied over,
 // but need to be adjsuted to reflect their new offsets.
-static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector<string>& in_paths) {
+static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector<string>& in_paths, bool progress) {
 
     // keep track of where we are in the output    
     vector<size_t> top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0);
@@ -101,6 +104,7 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
         Genome* out_root = out_alignment->openGenome(in_alignment->getRootName());
         assert(in_root->getName() == out_root->getName());
         size_t in_root_degree = in_root->getNumChildren();
+        size_t out_root_degree = out_root->getNumChildren();
         vector<const Genome*> in_genomes = {in_root};
         for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
             in_genomes.push_back(in_alignment->openGenome(in_child_name));
@@ -108,7 +112,9 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
 
         // copy the dna sequence by sequence
         for (const Genome* in_genome : in_genomes) {
-            cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl;
+            if (progress) {
+                cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl;
+            }
             Genome* out_genome = out_alignment->openGenome(in_genome->getName());
             for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) {
                 const Sequence* in_sequence = in_si->getSequence();
@@ -132,8 +138,10 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
         }
 
         // copy over the bottom segments of the root
-        cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i]
-             << " with bseg offset " << bot_offset << endl;
+        if (progress) {
+            cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i]
+                 << " with bseg offset " << bot_offset << endl;
+        }
         BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0);
         BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset);
         for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) {
@@ -151,21 +159,24 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
             assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength());
             out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength());
             // set the segment in the child genomes
+            for (size_t out_ci = 0; out_ci < out_root_degree; ++out_ci) {
+                out_bi->bseg()->setChildIndex(out_ci,  NULL_INDEX);
+            }
             for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) {
                 size_t out_ci = in_ci_to_out_ci.at(in_ci);
                 assert(out_ci < out_bi->bseg()->getNumChildren());
                 if (in_bi->bseg()->hasChild(in_ci)) {
                     out_bi->bseg()->setChildIndex(out_ci,  in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]);
                     out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci));
-                } else {
-                    out_bi->bseg()->setChildIndex(out_ci,  NULL_INDEX);
                 }
             }
         }
 
         // for every child genome, copy over the top segments
         for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
-            cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl;
+            if (progress) { 
+                cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl;
+            }
             const Genome* in_child = in_alignment->openGenome(in_child_name);
             Genome* out_child = out_alignment->openGenome(in_child_name);
             
@@ -212,10 +223,12 @@ int main(int argc, char** argv) {
     initParser(&optionsParser);
     string in_hal_paths;
     string out_hal_path;
+    bool progress;
     try {
         optionsParser.parseOptions(argc, argv);
         in_hal_paths = optionsParser.getArgument<string>("inFiles");
         out_hal_path = optionsParser.getArgument<string>("outFile");
+        progress = optionsParser.getFlag("progress");
     }
     catch(exception& e) {
         cerr << e.what() << endl;
@@ -226,11 +239,17 @@ int main(int argc, char** argv) {
     vector<string> in_paths = chopString(in_hal_paths, ",");
     
     // map genome -> dimensions for each input alignment
+    if (progress) {
+        cerr << "[halMergeChroms]: Scanning dimensions of " << in_paths.size() << " input files." << endl;
+    }
     pair<string, unordered_map<string, vector<Sequence::Info>>> rd = get_hal_dimensions(&optionsParser, in_paths);
     string& root_name = rd.first;
     unordered_map<string, vector<Sequence::Info>>& dimensions = rd.second;
 
     // create the new file
+    if (progress) {
+        cerr << "[halMergeChroms]: Creating empty alignment: " << out_hal_path << endl;
+    }
     AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));
 
     // set up the size of each genome, staring with the root
@@ -245,8 +264,11 @@ int main(int argc, char** argv) {
     root_genome->setDimensions(dimensions.at(root_name));
 
     // copy over over everything
-    merge_hals(&optionsParser, alignment, in_paths);
+    merge_hals(&optionsParser, alignment, in_paths, progress);
 
+    if (progress) {
+      cerr << "[halMergeChroms]: Writing merged alignment" << endl;
+    }
      
     return 0;
 }
diff --git a/tests/small/small2.maf b/tests/small/small2.maf
new file mode 100644
index 0000000..4688386
--- /dev/null
+++ b/tests/small/small2.maf
@@ -0,0 +1,15 @@
+##maf version=1
+
+# SNP
+a score=0 mafExtractor_splicedBlock=true splice_id=1_0
+s human.1           0  3 +  10 GCA
+s chimp.3       0  3 +  8 GCA
+
+# Indel and strand change
+a score=0 mafExtractor_splicedBlock=true splice_id=1_0
+s human.1           3  7 +  10 GCAGAAT
+s chimp.3       3  5 +  8 GCAG--T
+s cow.3    0  4 -  7 --A-AAT
+
+
+
diff --git a/tests/t/merge.t b/tests/t/merge.t
new file mode 100644
index 0000000..4573b51
--- /dev/null
+++ b/tests/t/merge.t
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+BASH_TAP_ROOT=./bash-tap
+. ${BASH_TAP_ROOT}/bash-tap-bootstrap
+
+PATH=../bin:$PATH
+PATH=../deps/hal:$PATH
+
+plan tests 4
+
+maf2hal small/small.maf small.hal
+maf2hal small/small2.maf small2.hal
+halMergeChroms small.hal,small2.hal merged1.hal
+hal2vg small.hal --noAncestors | vg ids -s - > small.vg
+hal2vg small2.hal --noAncestors | vg ids -s - > small2.vg
+hal2vg merged1.hal --noAncestors | vg ids -s - > merged1.vg
+vg view small.vg  > small.gfa
+vg view small2.vg > small2.gfa
+vg find -x merged1.vg -p cat.3:1 -c 1000 | vg view - > merged1.comp1.gfa
+vg find -x merged1.vg -p cow.3:1 -c 1000 | vg view - > merged1.comp2.gfa
+diff small.gfa merged1.comp1.gfa
+is $? 0 "First component of merged graph identical to first input graph"
+diff small2.gfa merged1.comp2.gfa
+is $? 0 "Second component of merged graph identical to second input graph"
+
+# repeat with merge in different order
+halMergeChroms small2.hal,small.hal merged2.hal
+hal2vg merged2.hal --noAncestors | vg ids -s - > merged2.vg
+vg find -x merged2.vg -p cat.3:1 -c 1000 | vg view - > merged2.comp1.gfa
+vg find -x merged2.vg -p cow.3:1 -c 1000 | vg view - > merged2.comp2.gfa
+diff small.gfa merged1.comp1.gfa
+is $? 0 "First component of merged graph2 identical to first input graph"
+diff small2.gfa merged1.comp2.gfa
+is $? 0 "Second component of merged graph2 identical to second input graph"
+
+rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa
+rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa
+rm -f merged2.hal merged2.vg merged2.comp1.gfa merged2.comp2.gfa

From 92247aa255d83d8d99cd5c88f03b1314bd9c57fc Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Mar 2021 21:11:11 -0400
Subject: [PATCH 4/6] fix tests

---
 halMergeChroms.cpp |  7 ++++-
 tests/t/merge.t    | 67 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp
index 74d817e..3a02092 100644
--- a/halMergeChroms.cpp
+++ b/halMergeChroms.cpp
@@ -189,7 +189,12 @@ static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, cons
             for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) {
                 // set the segment in the child genome
                 assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset);
-                out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset);
+                if (in_ti->tseg()->hasParent()) {
+                    out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset);
+                    out_ti->tseg()->setParentReversed(in_ti->tseg()->getParentReversed());
+                } else {
+                    out_ti->tseg()->setParentIndex(NULL_INDEX);
+                }
                 out_ti->tseg()->setBottomParseIndex(NULL_INDEX);
                 // determine the sequence-relative coordinate in the input
                 const Sequence* in_sequence = in_ti->tseg()->getSequence();
diff --git a/tests/t/merge.t b/tests/t/merge.t
index 4573b51..1426bab 100644
--- a/tests/t/merge.t
+++ b/tests/t/merge.t
@@ -6,33 +6,68 @@ BASH_TAP_ROOT=./bash-tap
 PATH=../bin:$PATH
 PATH=../deps/hal:$PATH
 
-plan tests 4
+plan tests 10
 
 maf2hal small/small.maf small.hal
 maf2hal small/small2.maf small2.hal
 halMergeChroms small.hal,small2.hal merged1.hal
-hal2vg small.hal --noAncestors | vg ids -s - > small.vg
-hal2vg small2.hal --noAncestors | vg ids -s - > small2.vg
-hal2vg merged1.hal --noAncestors | vg ids -s - > merged1.vg
-vg view small.vg  > small.gfa
-vg view small2.vg > small2.gfa
-vg find -x merged1.vg -p cat.3:1 -c 1000 | vg view - > merged1.comp1.gfa
-vg find -x merged1.vg -p cow.3:1 -c 1000 | vg view - > merged1.comp2.gfa
+halValidate merged1.hal
+is $? 0 "halMergeChroms produces valid hal"
+hal2fasta small.hal chimp > chimp.fa
+hal2fasta small2.hal chimp >> chimp.fa
+hal2fasta merged1.hal chimp > chimp.merge.fa
+diff chimp.fa chimp.merge.fa
+is $? 0 "halMergeChroms preserves chimp sequence"
+hal2fasta small.hal cat > cat.fa
+hal2fasta merged1.hal cat > cat.merge.fa
+diff cat.fa cat.merge.fa
+is $? 0 "halMergeChroms preserves cat sequence"
+hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg
+hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg
+hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg
+vg view small.vg | sort > small.gfa
+vg view small2.vg | sort > small2.gfa
+vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa
+vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa
 diff small.gfa merged1.comp1.gfa
 is $? 0 "First component of merged graph identical to first input graph"
 diff small2.gfa merged1.comp2.gfa
 is $? 0 "Second component of merged graph identical to second input graph"
 
-# repeat with merge in different order
-halMergeChroms small2.hal,small.hal merged2.hal
-hal2vg merged2.hal --noAncestors | vg ids -s - > merged2.vg
-vg find -x merged2.vg -p cat.3:1 -c 1000 | vg view - > merged2.comp1.gfa
-vg find -x merged2.vg -p cow.3:1 -c 1000 | vg view - > merged2.comp2.gfa
+rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa
+rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa
+rm -f chimp.fa chimp.merge.fa
+rm -f cat.fa cat.merge.fa
+
+### copy paste above but change order ###
+
+maf2hal small/small.maf small.hal
+maf2hal small/small2.maf small2.hal
+halMergeChroms small2.hal,small.hal merged1.hal
+halValidate merged1.hal
+is $? 0 "halMergeChroms produces valid hal"
+hal2fasta small2.hal chimp > chimp.fa
+hal2fasta small.hal chimp >> chimp.fa
+hal2fasta merged1.hal chimp > chimp.merge.fa
+diff chimp.fa chimp.merge.fa
+is $? 0 "halMergeChroms preserves chimp sequence"
+hal2fasta small.hal cat > cat.fa
+hal2fasta merged1.hal cat > cat.merge.fa
+diff cat.fa cat.merge.fa
+is $? 0 "halMergeChroms preserves cat sequence"
+hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg
+hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg
+hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg
+vg view small.vg | sort > small.gfa
+vg view small2.vg | sort > small2.gfa
+vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa
+vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa
 diff small.gfa merged1.comp1.gfa
-is $? 0 "First component of merged graph2 identical to first input graph"
+is $? 0 "First component of merged graph identical to first input graph"
 diff small2.gfa merged1.comp2.gfa
-is $? 0 "Second component of merged graph2 identical to second input graph"
+is $? 0 "Second component of merged graph identical to second input graph"
 
 rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa
 rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa
-rm -f merged2.hal merged2.vg merged2.comp1.gfa merged2.comp2.gfa
+rm -f chimp.fa chimp.merge.fa
+rm -f cat.fa cat.merge.fa

From 057565ab8efe0e3440e93ad08a957480e3033917 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Mon, 29 Mar 2021 21:12:07 -0400
Subject: [PATCH 5/6] add halMergeChroms to bin release

---
 build-tools/makeBinRelease | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease
index 2dec00a..2dc3897 100755
--- a/build-tools/makeBinRelease
+++ b/build-tools/makeBinRelease
@@ -37,5 +37,5 @@ else
 	 make check-static
 fi
 
-cp hal2vg clip-vg halRemoveDupes ${buildDir}/
+cp hal2vg clip-vg halRemoveDupes halMergeChroms ${buildDir}/
 

From 8a3a99c068b95da847c207384cddcda3539afdf4 Mon Sep 17 00:00:00 2001
From: Glenn Hickey <glenn.hickey@gmail.com>
Date: Tue, 30 Mar 2021 08:38:03 -0400
Subject: [PATCH 6/6] just use underscore for names

---
 halMergeChroms.cpp | 8 ++------
 tests/t/merge.t    | 8 ++++----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/halMergeChroms.cpp b/halMergeChroms.cpp
index 3a02092..e849662 100644
--- a/halMergeChroms.cpp
+++ b/halMergeChroms.cpp
@@ -33,13 +33,9 @@ static void initParser(CLParser* optionsParser) {
 }
 
 // we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding
-// .i to them where i is the file's position in the input.
+// _i to them where i is the file's position in the input.
 static string anc_seq_name(const string& seq_name, size_t idx) {
-    return seq_name + ".hmc" + to_string(idx);
-}
-// undo the above
-static string orig_seq_name(const string& seq_name) {
-    return seq_name.substr(0, seq_name.rfind(".hmc"));
+    return seq_name + "_" + to_string(idx);
 }
 
 // get the dimensions from all genomes in all input files
diff --git a/tests/t/merge.t b/tests/t/merge.t
index 1426bab..19fda86 100644
--- a/tests/t/merge.t
+++ b/tests/t/merge.t
@@ -27,8 +27,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg
 hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg
 vg view small.vg | sort > small.gfa
 vg view small2.vg | sort > small2.gfa
-vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa
-vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa
+vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa
+vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa
 diff small.gfa merged1.comp1.gfa
 is $? 0 "First component of merged graph identical to first input graph"
 diff small2.gfa merged1.comp2.gfa
@@ -60,8 +60,8 @@ hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg
 hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg
 vg view small.vg | sort > small.gfa
 vg view small2.vg | sort > small2.gfa
-vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp1.gfa
-vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/.hmc0//g' | sed -e 's/.hmc1//g' > merged1.comp2.gfa
+vg find -x merged1.vg -p cat.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp1.gfa
+vg find -x merged1.vg -p cow.3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' > merged1.comp2.gfa
 diff small.gfa merged1.comp1.gfa
 is $? 0 "First component of merged graph identical to first input graph"
 diff small2.gfa merged1.comp2.gfa