Skip to content

Commit

Permalink
Merge pull request #43 from ComparativeGenomicsToolkit/chop
Browse files Browse the repository at this point in the history
add halMergeChroms tool
  • Loading branch information
glennhickey authored Mar 30, 2021
2 parents a03606a + 8a3a99c commit 1244493
Show file tree
Hide file tree
Showing 6 changed files with 380 additions and 7 deletions.
18 changes: 15 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
rootPath = ./
include ./include.mk

all : hal2vg clip-vg halRemoveDupes
all : hal2vg clip-vg halRemoveDupes halMergeChroms

# Note: hdf5 from apt doesn't seem to work for static builds. It should be installed
# from source and configured with "--enable-static --disable-shared", then have its
Expand All @@ -28,12 +28,18 @@ ifeq ($(shell ldd halRemoveDupes | grep "not a dynamic" | wc -l), $(shell ls hal
else
$(error ldd found dnymaic linked dependency in halRemoveDupes)
endif
ifeq ($(shell ldd halMergeChroms | grep "not a dynamic" | wc -l), $(shell ls halMergeChroms | wc -l))
$(info ldd verified that halMergeChroms static)
else
$(error ldd found dnymaic linked dependency in halMergeChroms)
endif


cleanFast :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o

clean :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o
cd deps/sonLib && make clean
cd deps/pinchesAndCacti && make clean
cd deps/hal && make clean
Expand Down Expand Up @@ -69,6 +75,12 @@ halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies}
halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o ${basicLibs} -o halRemoveDupes

halMergeChroms.o : halMergeChroms.cpp ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -I . halMergeChroms.cpp -c

halMergeChroms : halMergeChroms.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread halMergeChroms.o ${basicLibs} -o halMergeChroms

test :
make
cd tests && prove -v t
2 changes: 1 addition & 1 deletion build-tools/makeBinRelease
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ else
make check-static
fi

cp hal2vg clip-vg halRemoveDupes ${buildDir}/
cp hal2vg clip-vg halRemoveDupes halMergeChroms ${buildDir}/

276 changes: 276 additions & 0 deletions halMergeChroms.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
/*
* Copyright (C) 2016 by Glenn Hickey ([email protected])
*
* Released under the MIT license, see LICENSE.txt
*/

// Merge chromosome HAL files into one big one. Only star trees with same root name supported (ie what comes out of cactus-align-batch).

//#define debug

#include <cstdlib>
#include <cstdlib>
#include <iostream>
#include <cassert>
#include <fstream>
#include <deque>
#include <unordered_map>
#include <unordered_set>

#include "hal.h"

using namespace std;
using namespace hal;

static void initParser(CLParser* optionsParser) {
optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge");
optionsParser->addArgument("outFile", "output HAL file");
optionsParser->addOptionFlag("progress",
"show progress",
false);
optionsParser->setDescription("Merge chromosome HALs into combined file. Ancestral sequences are renamed as needed to avoid conflicts"
". Star trees only");
}

// we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding
// _i to them where i is the file's position in the input.
static string anc_seq_name(const string& seq_name, size_t idx) {
return seq_name + "_" + to_string(idx);
}

// get the dimensions from all genomes in all input files
static pair<string, unordered_map<string, vector<Sequence::Info>>> get_hal_dimensions(CLParser* optionsParser,
const vector<string>& hal_paths) {
// genome -> dimensions (covering all input)
unordered_map<string, vector<Sequence::Info>> dimensions;
// to check uniqueness
unordered_set<string> sequence_names;

string root_name;
for (size_t i = 0; i < hal_paths.size(); ++i) {
const string& hal_path = hal_paths[i];

// open the hal file
AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS));

// for every genome
vector<string> genome_names = alignment->getChildNames(alignment->getRootName());
genome_names.push_back(alignment->getRootName());
if (root_name.empty()) {
root_name = alignment->getRootName();
} else if (alignment->getRootName() != root_name) {
throw hal_exception("Root mismatch: " + root_name + " vs " + alignment->getRootName());
}
for (const string& genome_name : genome_names) {
const Genome* genome = alignment->openGenome(genome_name);
vector<Sequence::Info>& genome_dimensions = dimensions[genome_name];
// for every sequence
for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
const Sequence *sequence = seqIt->getSequence();
// add a little suffix to make ancestral sequences unique
string seq_name = genome->getParent() ? sequence->getName() : anc_seq_name(sequence->getName(), i);
genome_dimensions.emplace_back(seq_name,
sequence->getSequenceLength(),
sequence->getNumTopSegments(),
sequence->getNumBottomSegments());
string full_name = genome_name + "." + seq_name;
if (sequence_names.count(full_name)) {
throw hal_exception("Duplicate sequence name found: " + full_name);
} else {
sequence_names.insert(full_name);
}
}
alignment->closeGenome(genome);
}
}
return make_pair(root_name, dimensions);
}

// append each input hal to the out_alignment, one after another. all arrays are copied over,
// but need to be adjsuted to reflect their new offsets.
static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector<string>& in_paths, bool progress) {

// keep track of where we are in the output
vector<size_t> top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0);
size_t bot_offset = 0;

for (size_t i = 0; i < in_paths.size(); ++i) {
AlignmentConstPtr in_alignment(openHalAlignment(in_paths[i], optionsParser, READ_ACCESS));
const Genome* in_root = in_alignment->openGenome(in_alignment->getRootName());
Genome* out_root = out_alignment->openGenome(in_alignment->getRootName());
assert(in_root->getName() == out_root->getName());
size_t in_root_degree = in_root->getNumChildren();
size_t out_root_degree = out_root->getNumChildren();
vector<const Genome*> in_genomes = {in_root};
for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
in_genomes.push_back(in_alignment->openGenome(in_child_name));
}

// copy the dna sequence by sequence
for (const Genome* in_genome : in_genomes) {
if (progress) {
cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl;
}
Genome* out_genome = out_alignment->openGenome(in_genome->getName());
for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) {
const Sequence* in_sequence = in_si->getSequence();
string out_seq_name = in_genome->getParent() ? in_sequence->getName() : anc_seq_name(in_sequence->getName(), i);
Sequence* out_sequence = out_genome->getSequence(out_seq_name);
DnaIteratorPtr in_di = in_sequence->getDnaIterator(0);
DnaIteratorPtr out_di = out_sequence->getDnaIterator(0);
assert(in_sequence->getSequenceLength() == out_sequence->getSequenceLength());
string dna;
in_sequence->getString(dna);
out_sequence->setString(dna);
}
}

// make a child index map (in -> out) for the root genome
// assume : all genomes in in_genome present in out_genome
vector<size_t> in_ci_to_out_ci(in_root->getNumChildren());
for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
in_ci_to_out_ci.at(in_root->getChildIndex(in_alignment->openGenome(in_child_name))) =
out_root->getChildIndex(out_alignment->openGenome(in_child_name));
}

// copy over the bottom segments of the root
if (progress) {
cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i]
<< " with bseg offset " << bot_offset << endl;
}
BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0);
BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset);
for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) {
// set the segment in the root genome
assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset);
assert(out_bi->bseg()->getNumChildren() >= in_bi->bseg()->getNumChildren());
out_bi->bseg()->setTopParseIndex(NULL_INDEX);
// determine the sequence-relative coordinate in the input
const Sequence* in_sequence = in_bi->bseg()->getSequence();
int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition();
assert(in_start_coord >= 0 && in_start_coord < in_sequence->getSequenceLength());
// set the sequence relative coordinate in the output
const Sequence* out_sequence = out_root->getSequence(anc_seq_name(in_sequence->getName(), i));
int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength());
out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength());
// set the segment in the child genomes
for (size_t out_ci = 0; out_ci < out_root_degree; ++out_ci) {
out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX);
}
for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) {
size_t out_ci = in_ci_to_out_ci.at(in_ci);
assert(out_ci < out_bi->bseg()->getNumChildren());
if (in_bi->bseg()->hasChild(in_ci)) {
out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]);
out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci));
}
}
}

// for every child genome, copy over the top segments
for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
if (progress) {
cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl;
}
const Genome* in_child = in_alignment->openGenome(in_child_name);
Genome* out_child = out_alignment->openGenome(in_child_name);

size_t in_ci = in_root->getChildIndex(in_child);
size_t out_ci = in_ci_to_out_ci[in_ci];
size_t top_offset = top_offsets[out_ci];
TopSegmentIteratorPtr in_ti = in_child->getTopSegmentIterator(0);
TopSegmentIteratorPtr out_ti = out_child->getTopSegmentIterator(top_offsets[out_ci]);

for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) {
// set the segment in the child genome
assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset);
if (in_ti->tseg()->hasParent()) {
out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset);
out_ti->tseg()->setParentReversed(in_ti->tseg()->getParentReversed());
} else {
out_ti->tseg()->setParentIndex(NULL_INDEX);
}
out_ti->tseg()->setBottomParseIndex(NULL_INDEX);
// determine the sequence-relative coordinate in the input
const Sequence* in_sequence = in_ti->tseg()->getSequence();
int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition();
// set the sequence relative coordinate in the output
const Sequence* out_sequence = out_child->getSequence(in_sequence->getName());
int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord;
out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength());
// set the paralogy edge
if (in_ti->tseg()->hasNextParalogy()) {
out_ti->tseg()->setNextParalogyIndex(in_ti->tseg()->getNextParalogyIndex() + top_offset);
} else {
out_ti->tseg()->setNextParalogyIndex(NULL_INDEX);
}
}
}

// update the offsets to move past the current alignment in all genomes
bot_offset += in_root->getNumBottomSegments();
for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) {
const Genome* in_child = in_alignment->openGenome(in_child_name);
size_t in_ci = in_root->getChildIndex(in_child);
size_t out_ci = in_ci_to_out_ci[in_ci];
top_offsets[out_ci] += in_child->getNumTopSegments();
}
}
}

int main(int argc, char** argv) {
CLParser optionsParser(WRITE_ACCESS);
initParser(&optionsParser);
string in_hal_paths;
string out_hal_path;
bool progress;
try {
optionsParser.parseOptions(argc, argv);
in_hal_paths = optionsParser.getArgument<string>("inFiles");
out_hal_path = optionsParser.getArgument<string>("outFile");
progress = optionsParser.getFlag("progress");
}
catch(exception& e) {
cerr << e.what() << endl;
optionsParser.printUsage(cerr);
exit(1);
}

vector<string> in_paths = chopString(in_hal_paths, ",");

// map genome -> dimensions for each input alignment
if (progress) {
cerr << "[halMergeChroms]: Scanning dimensions of " << in_paths.size() << " input files." << endl;
}
pair<string, unordered_map<string, vector<Sequence::Info>>> rd = get_hal_dimensions(&optionsParser, in_paths);
string& root_name = rd.first;
unordered_map<string, vector<Sequence::Info>>& dimensions = rd.second;

// create the new file
if (progress) {
cerr << "[halMergeChroms]: Creating empty alignment: " << out_hal_path << endl;
}
AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));

// set up the size of each genome, staring with the root
Genome* root_genome = alignment->addRootGenome(root_name);
for (auto& kv : dimensions) {
if (kv.first != root_name) {
Genome* leaf_genome = alignment->addLeafGenome(kv.first, root_name, 1);
leaf_genome->setDimensions(kv.second);
}
}
// important to set root dimensions after adding leaves so bottom segments have right number of slots
root_genome->setDimensions(dimensions.at(root_name));

// copy over over everything
merge_hals(&optionsParser, alignment, in_paths, progress);

if (progress) {
cerr << "[halMergeChroms]: Writing merged alignment" << endl;
}

return 0;
}

3 changes: 0 additions & 3 deletions halRemoveDupes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
* Released under the MIT license, see LICENSE.txt
*/

// This file was created by merging hal2sg.cpp and sg2vg.cpp with
// a small amount of glue for the interface.

//#define debug

#include <cstdlib>
Expand Down
15 changes: 15 additions & 0 deletions tests/small/small2.maf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
##maf version=1

# SNP
a score=0 mafExtractor_splicedBlock=true splice_id=1_0
s human.1 0 3 + 10 GCA
s chimp.3 0 3 + 8 GCA

# Indel and strand change
a score=0 mafExtractor_splicedBlock=true splice_id=1_0
s human.1 3 7 + 10 GCAGAAT
s chimp.3 3 5 + 8 GCAG--T
s cow.3 0 4 - 7 --A-AAT



Loading

0 comments on commit 1244493

Please sign in to comment.