Skip to content

Commit

Permalink
Merge pull request #58 from ComparativeGenomicsToolkit/path-sense
Browse files Browse the repository at this point in the history
Use vg's path metadata
  • Loading branch information
glennhickey authored Nov 8, 2022
2 parents c515377 + d0ad829 commit 576784d
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 100 deletions.
58 changes: 13 additions & 45 deletions clip-vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,48 +67,19 @@ static unordered_map<string, vector<pair<int64_t, int64_t>>> get_clipped_interva
const unordered_map<string, vector<pair<int64_t, int64_t>>>& input_intervals,
const unordered_map<string, vector<pair<int64_t, int64_t>>>& output_intervals);

// from path.cpp in vg
// Check if using subpath naming scheme. If it is return true,
// the root path name, and the offset (false otherwise)
// formats accepted:
// name[start] (0-based)
// name[start-end] (0-based, open-ended like BED, end defaults to 0 in above case)
static tuple<bool, string, size_t, size_t>parse_subpath_name(const string& path_name) {
size_t tag_offset = path_name.rfind("[");
if (tag_offset == string::npos || tag_offset + 2 >= path_name.length() || path_name.back() != ']') {
return make_tuple(false, "", 0, 0);
} else {
string offset_str = path_name.substr(tag_offset + 1, path_name.length() - tag_offset - 2);
size_t sep_offset = offset_str.find("-");
string end_offset_str;
if (sep_offset != string::npos && !offset_str.empty() && sep_offset < offset_str.length() - 1) {
end_offset_str = offset_str.substr(sep_offset + 1, offset_str.length() - sep_offset - 1);
offset_str = offset_str.substr(0, sep_offset);
}
size_t offset_val;
size_t end_offset_val = 0;
try {
offset_val = std::stol(offset_str);
if (!end_offset_str.empty()) {
end_offset_val = std::stol(end_offset_str);
}
} catch(...) {
return make_tuple(false, "", 0, 0);
}
return make_tuple(true, path_name.substr(0, tag_offset), offset_val, end_offset_val);
}
}

// Create a subpath name (todo: make same function in vg consistent (it only includes start))
static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) {
tuple<bool, string, size_t, size_t> parsed_name = parse_subpath_name(path_name);
size_t cur_offset = 0;
const string* cur_name = &path_name;
if (get<0>(parsed_name)) {
cur_offset = get<2>(parsed_name);
cur_name = &get<1>(parsed_name);
}
return *cur_name + "[" + std::to_string(cur_offset + offset) + "-" + std::to_string(cur_offset + offset + length) + "]";
PathSense sense;
string sample;
string locus;
size_t haplotype;
size_t phase_block;
subrange_t subrange;
PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange);
subrange.first = subrange != PathMetadata::NO_SUBRANGE ? subrange.first : 0;
subrange.first += offset;
subrange.second = subrange.first + length;
return PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange);
}

int main(int argc, char** argv) {
Expand Down Expand Up @@ -1029,15 +1000,12 @@ unordered_map<string, vector<pair<int64_t, int64_t>>> get_path_intervals(const P
unordered_map<string, vector<pair<int64_t, int64_t>>> path_intervals;
graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = graph->get_path_name(path_handle);
tuple<bool, string, size_t, size_t> subpath_info = parse_subpath_name(path_name);
size_t path_len = 0;
graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
path_len += graph->get_length(graph->get_handle_of_step(step_handle));
});
int64_t start = get<0>(subpath_info) ? get<2>(subpath_info) : 0;
if (get<0>(subpath_info)) {
path_name = get<1>(subpath_info);
}
subrange_t path_range = graph->get_subrange(path_handle);
int64_t start = path_range == PathMetadata::NO_SUBRANGE ? 0 : path_range.second;
vector<pair<int64_t, int64_t>>& intervals = path_intervals[path_name];
intervals.push_back(make_pair(start, start + path_len));
});
Expand Down
2 changes: 1 addition & 1 deletion deps/hal
Submodule hal updated 99 files
+1 −1 .travis.yml
+11 −7 Makefile
+72 −66 README.md
+11 −3 api/Makefile
+5 −16 api/hdf5_impl/hdf5Alignment.cpp
+45 −4 api/hdf5_impl/hdf5Genome.cpp
+1 −0 api/hdf5_impl/hdf5Genome.h
+21 −0 api/hdf5_impl/hdf5UDCFuseDriver.cpp
+7 −10 api/impl/halAlignmentInstance.cpp
+1 −5 api/impl/halCLParser.cpp
+17 −0 api/impl/halSequence.cpp
+13 −3 api/impl/udc2.c
+35 −0 api/inc/halAlignment.h
+3 −4 api/inc/halAlignmentInstance.h
+1 −1 api/inc/halDefs.h
+25 −0 api/inc/halGenome.h
+28 −1 api/inc/udc2.h
+2 −2 api/tests/halAlignmentTreesTest.cpp
+6 −6 api/tests/halApiTestSupport.cpp
+3 −3 api/tests/halApiTestSupport.h
+12 −12 api/tests/halBottomSegmentTest.cpp
+23 −23 api/tests/halColumnIteratorTest.cpp
+6 −6 api/tests/halGappedSegmentIteratorTest.cpp
+11 −11 api/tests/halGenomeTest.cpp
+28 −28 api/tests/halMappedSegmentTest.cpp
+2 −2 api/tests/halMetaDataTest.cpp
+9 −9 api/tests/halRandomData.cpp
+4 −4 api/tests/halRandomData.h
+6 −6 api/tests/halRearrangementTest.cpp
+1 −1 api/tests/halSegmentTestSupport.h
+8 −8 api/tests/halSequenceTest.cpp
+12 −12 api/tests/halTopSegmentTest.cpp
+12 −12 api/tests/halValidateTest.cpp
+5 −4 api/tests/udc2Test.c
+20 −18 assemblyHub/README.md
+8 −8 assemblyHub/bedTrack.py
+4 −3 assemblyHub/hal2assemblyHub.py
+17 −0 blockViz/Makefile
+53 −19 blockViz/impl/halBlockViz.cpp
+9 −0 blockViz/inc/halBlockViz.h
+27 −11 extract/impl/halExtract.cpp
+1 −1 extract/impl/halMaskExtractMain.cpp
+1 −1 extract/impl/halMaskExtractor.cpp
+1 −1 extract/inc/halMaskExtractor.h
+10 −10 extract/tests/hal4dExtractTest.cpp
+3 −4 fasta/hal2fasta.cpp
+11 −9 include.mk
+1 −1 liftover/impl/halLiftover.cpp
+1 −1 liftover/impl/halLiftoverMain.cpp
+3 −3 liftover/impl/halWiggleLiftover.cpp
+2 −2 liftover/impl/halWiggleLiftoverMain.cpp
+1 −1 liftover/impl/halWiggleLoader.cpp
+1 −1 liftover/inc/halLiftover.h
+2 −2 liftover/inc/halWiggleLiftover.h
+1 −1 liftover/inc/halWiggleLoader.h
+10 −10 liftover/tests/halLiftoverTests.cpp
+9 −9 liftover/tests/halLiftoverTests.h
+4 −4 lod/impl/halLodExtract.cpp
+1 −1 lod/impl/halLodExtractMain.cpp
+1 −1 lod/impl/halLodGraph.cpp
+5 −12 lod/impl/halLodManager.cpp
+1 −1 lod/inc/halLodExtract.h
+1 −1 lod/inc/halLodGraph.h
+2 −2 lod/inc/halLodManager.h
+2 −1 maf/impl/halMafExport.cpp
+1 −1 maf/impl/halMafWriteGenomes.cpp
+2 −2 maf/tests/halMafBlockTest.cpp
+2 −2 maf/tests/halMafBlockTest.h
+18 −8 modify/Makefile
+3 −3 modify/ancestorsML.cpp
+1 −1 modify/ancestorsML.h
+1 −1 modify/ancestorsMLBed.cpp
+2 −2 modify/ancestorsMLBed.h
+2 −5 modify/ancestorsMLMain.cpp
+2 −2 modify/ancestorsMLTest.cpp
+43 −16 modify/halAddToBranch.cpp
+41 −6 modify/halAppendSubtree.cpp
+1 −1 modify/halRemoveGenome.cpp
+60 −0 modify/halRemoveSubtree.cpp
+1 −4 modify/halRenameSequences.cpp
+17 −9 modify/halReplaceGenome.cpp
+2 −2 modify/halUpdateBranchLengths.cpp
+1 −1 modify/markAncestors.cpp
+1 −1 modify/markAncestors.h
+2 −4 mutations/impl/halSummarizeMutations.cpp
+1 −1 mutations/impl/halSummarizeMutationsMain.cpp
+1 −1 mutations/inc/halSummarizeMutations.h
+100 −118 paf/hal2paf.cpp
+ paf/tests/expected/hal2pafMouseRatTest.paf.gz
+1 −1 phyloP/Makefile
+1 −1 randgen/halRandGen.cpp
+1 −1 requirements.txt
+4 −4 stats/impl/halStats.cpp
+62 −62 stats/impl/halStatsMain.cpp
+3 −3 stats/inc/halStats.h
+1 −1 synteny/impl/hal2psl.cpp
+5 −5 synteny/impl/halSynteny.cpp
+8 −2 synteny/impl/psl_io.cpp
+1 −1 synteny/inc/hal2psl.h
2 changes: 1 addition & 1 deletion deps/libbdsg-easy
117 changes: 73 additions & 44 deletions hal2vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ using namespace handlegraph;

static void initParser(CLParser* optionsParser) {
optionsParser->addArgument("halFile", "input hal file");
optionsParser->addOption("refGenomes",
"comma-separated (no spaces) genomes to treat as reference paths with all others as haplotype paths (default=all genomes)",
"\"\"");
optionsParser->addOption("rootGenome",
"process only genomes in clade with specified root"
" (HAL root if empty)",
Expand All @@ -44,11 +47,6 @@ static void initParser(CLParser* optionsParser) {
optionsParser->addOption("ignoreGenomes",
"comma-separated (no spaces) list of genomes to ignore",
"\"\"");
optionsParser->addOptionFlag("onlySequenceNames",
"use only sequence names for output names. By "
"default, the UCSC convention of "
"Genome.Sequence is used",
false);
optionsParser->addOption("outputFormat",
"output graph format in {pg, hg, odgi} [default=pg]",
"pg");
Expand All @@ -65,20 +63,17 @@ static void initParser(CLParser* optionsParser) {
static void add_genome_threads(const Genome* genome,
stPinchThreadSet* threads,
vector<string>& IDToName,
unordered_map<string, int64_t>& nameToID,
bool fullNames);
unordered_map<string, int64_t>& nameToID);

static void pinch_genome(const Genome* genome,
stPinchThreadSet* threads,
unordered_map<string, int64_t>& nameToID,
bool fullNames,
const vector<string>& targetNames,
unordered_map<stPinchThread*, vector<bool>>& snp_cache);

static void pinch_snp(const Genome* genome,
stPinchThreadSet* threads,
unordered_map<string, int64_t>& nameToID,
bool fullNames,
const TopSegmentIteratorPtr& topIt,
int64_t topOffset,
ColumnIteratorPtr& colIt,
Expand All @@ -92,32 +87,34 @@ static void pinch_to_handle(const Genome* genome,
const unordered_map<string, int64_t>& nameToID,
unordered_map<stPinchBlock*, nid_t>& blockToNode,
MutablePathMutableHandleGraph& graph,
bool fullNames);
const vector<string>& refNames);

static void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength);

static void resolve_subpath_naming(string& path_name);
static subrange_t resolve_subpath_naming(string& path_name);

static size_t resolve_haplotype_naming(string& genome_name);

int main(int argc, char** argv) {
CLParser optionsParser;
initParser(&optionsParser);
string halPath;
string refGenomes;
string rootGenomeName;
string targetGenomes;
bool noAncestors;
string ignoreGenomes;
bool fullNames;
string outputFormat;
size_t maxNodeLength;
bool progress;
try {
optionsParser.parseOptions(argc, argv);
halPath = optionsParser.getArgument<string>("halFile");
refGenomes = optionsParser.getOption<string>("refGenomes");
rootGenomeName = optionsParser.getOption<string>("rootGenome");
targetGenomes = optionsParser.getOption<string>("targetGenomes");
noAncestors = optionsParser.getFlag("noAncestors");
ignoreGenomes = optionsParser.getOption<string>("ignoreGenomes");
fullNames = !optionsParser.getFlag("onlySequenceNames");
outputFormat = optionsParser.getOption<string>("outputFormat");
if (outputFormat != "pg" && outputFormat != "hg" && outputFormat != "odgi") {
throw hal_exception("--outputFormat must be one of {pg, hg, odgi}");
Expand All @@ -141,6 +138,12 @@ int main(int argc, char** argv) {
throw hal_exception("input hal alignmenet is empty");
}

vector<string> refNames;
if (refGenomes != "\"\"") {
refNames = chopString(refGenomes, ",");
std::sort(refNames.begin(), refNames.end());
}

// default to alignment root if none specified
bool givenRoot = true;
if (rootGenomeName == "\"\"") {
Expand Down Expand Up @@ -261,7 +264,7 @@ int main(int argc, char** argv) {
if (progress && !(!curParent.empty() && genomeName != rootGenomeName)) {
cerr << "adding threads from " << genome->getName() << endl;
}
add_genome_threads(genome, threadSet, IDToName, nameToID, fullNames);
add_genome_threads(genome, threadSet, IDToName, nameToID);
}

if (!ignoreGenome && !curParent.empty() && genomeName != rootGenomeName) {
Expand Down Expand Up @@ -302,7 +305,7 @@ int main(int argc, char** argv) {
if (progress) {
cerr << "pinching " << pinchGenomes[i] << endl;
}
pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, fullNames, targetNames, snp_cache);
pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, targetNames, snp_cache);
}
snp_cache.clear();

Expand Down Expand Up @@ -347,7 +350,7 @@ int main(int argc, char** argv) {
cerr << "converting " << genomeName << " with " << genome->getNumSequences()
<< " sequences and total length " << genome->getSequenceLength() << endl;
}
pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, fullNames);
pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, refNames);

alignment->closeGenome(genome);
}
Expand Down Expand Up @@ -388,15 +391,14 @@ int main(int argc, char** argv) {

// Add every sequence from the genome into the pinch graph
void add_genome_threads(const Genome* genome,
stPinchThreadSet* threads,
vector<string>& IDToName,
unordered_map<string, int64_t>& nameToID,
bool fullNames) {
stPinchThreadSet* threads,
vector<string>& IDToName,
unordered_map<string, int64_t>& nameToID) {

for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
const Sequence *sequence = seqIt->getSequence();
hal_size_t seqLen = sequence->getSequenceLength();
string name = fullNames ? sequence->getFullName() : sequence->getName();
string name = sequence->getFullName();
// update lookups to map hal sequence to numeric id
int64_t seqID = IDToName.size();
nameToID[name] = seqID;
Expand All @@ -413,7 +415,6 @@ void add_genome_threads(const Genome* genome,
void pinch_genome(const Genome* genome,
stPinchThreadSet* threads,
unordered_map<string, int64_t>& nameToID,
bool fullNames,
const vector<string>& targetNames,
unordered_map<stPinchThread*, vector<bool>>& snp_cache) {

Expand Down Expand Up @@ -451,8 +452,8 @@ void pinch_genome(const Genome* genome,
botIt->toParent(topIt);

// todo: lots of string lookups
int64_t topID = nameToID[fullNames ? topIt->tseg()->getSequence()->getFullName() : topIt->tseg()->getSequence()->getName()];
int64_t botID = nameToID[fullNames ? botIt->bseg()->getSequence()->getFullName() : botIt->bseg()->getSequence()->getName()];
int64_t topID = nameToID[topIt->tseg()->getSequence()->getFullName()];
int64_t botID = nameToID[botIt->bseg()->getSequence()->getFullName()];

if (topIt->tseg()->getSequence() != topSeq) {
topSeq = topIt->tseg()->getSequence();
Expand Down Expand Up @@ -483,7 +484,7 @@ void pinch_genome(const Genome* genome,
}
last_match = i;
} else if (colIt.get() != NULL) {
pinch_snp(genome, threads, nameToID, fullNames, topIt, i, colIt,
pinch_snp(genome, threads, nameToID, topIt, i, colIt,
std::toupper(topString[i]), topThread, snp_cache);
}
if (std::toupper(topString[i]) != std::toupper(botString[i]) || i == (int64_t)topString.length() - 1) {
Expand Down Expand Up @@ -565,7 +566,6 @@ void pinch_genome(const Genome* genome,
void pinch_snp(const Genome* genome,
stPinchThreadSet* threads,
unordered_map<string, int64_t>& nameToID,
bool fullNames,
const TopSegmentIteratorPtr& topIt,
int64_t topOffset,
ColumnIteratorPtr& colIt,
Expand Down Expand Up @@ -596,7 +596,7 @@ void pinch_snp(const Genome* genome,
for (ColumnIterator::DNASet::const_iterator dsi = cmi->second->begin(); dsi != cmi->second->end(); ++dsi) {
char botBase = std::toupper((*dsi)->getBase());

int64_t otherID = nameToID[fullNames ? sequence->getFullName() : sequence->getName()];
int64_t otherID = nameToID[sequence->getFullName()];
stPinchThread* otherThread = stPinchThreadSet_getThread(threads, otherID);
hal_index_t otherStart = (*dsi)->getArrayIndex() - sequence->getStartPosition();

Expand Down Expand Up @@ -634,22 +634,36 @@ void pinch_to_handle(const Genome* genome,
const unordered_map<string, int64_t>& nameToID,
unordered_map<stPinchBlock*, nid_t>& blockToNode,
MutablePathMutableHandleGraph& graph,
bool fullNames) {
const vector<string>& refNames) {

// iterate over the sequences of the genome
for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
const Sequence *sequence = seqIt->getSequence();
string seqName = fullNames ? sequence->getFullName() : sequence->getName();
int64_t seqID = nameToID.find(seqName)->second;
PathSense sense = PathSense::REFERENCE;
if (!refNames.empty() && !std::binary_search(refNames.begin(), refNames.end(), genome->getName())) {
sense = PathSense::HAPLOTYPE;
}
int64_t seqID = nameToID.find(sequence->getFullName())->second;
stPinchThread* thread = stPinchThreadSet_getThread(threadSet, seqID);

// cactus_graphmap_split can make paths like contig_sub_1_3. here we convert that
// into a format vg can (sometimes) understand contig[1-3].
// (the reason we go through this is that assembly hubs can't handle any special characters apparently)
resolve_subpath_naming(seqName);

string parsed_name = sequence->getName();
subrange_t subpath = resolve_subpath_naming(parsed_name);
string parsed_genome_name = genome->getName();
size_t haplotype = resolve_haplotype_naming(parsed_genome_name);
if (sense == PathSense::HAPLOTYPE && haplotype == PathMetadata::NO_HAPLOTYPE) {
haplotype = 0;
}
// create the path
path_handle_t pathHandle = graph.create_path_handle(seqName);
path_handle_t pathHandle = graph.create_path(sense,
parsed_genome_name,
parsed_name,
haplotype,
sense == PathSense::HAPLOTYPE ? 0 : PathMetadata::NO_PHASE_BLOCK,
subpath,
false);
string pathString;

// iterate over the segments of the sequence
Expand Down Expand Up @@ -731,7 +745,7 @@ void pinch_to_handle(const Genome* genome,
string halPathString;
sequence->getString(halPathString);
if (pathString.length() != halPathString.length()) {
throw runtime_error("Incorrect length in coverted path for " + seqName + ": " + std::to_string(pathString.length()) +
throw runtime_error("Incorrect length in coverted path for " + sequence->getFullName() + ": " + std::to_string(pathString.length()) +
". Should be: " + std::to_string(halPathString.length()));
}
vector<size_t> mismatches;
Expand All @@ -742,7 +756,7 @@ void pinch_to_handle(const Genome* genome,
}
if (!mismatches.empty()) {
stringstream msg;
msg << mismatches.size() << " mismatches found in converted path for " << seqName << ":\n";
msg << mismatches.size() << " mismatches found in converted path for " << sequence->getFullName() << ":\n";
for (size_t i = 0; i < mismatches.size() && i < 10; ++i) {
msg << " path[" << mismatches[i] << "]=" << pathString[mismatches[i]] << ". should be " << halPathString[mismatches[i]] << "\n";
}
Expand Down Expand Up @@ -771,22 +785,19 @@ void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength) {
}
}

void resolve_subpath_naming(string& path_name) {
subrange_t resolve_subpath_naming(string& path_name) {
size_t first_length = 0;
size_t start_offset = 0;
bool found_subpath = false;
while (true) {
size_t sp = path_name.rfind("_sub_");
if (sp != string::npos) {
size_t up = path_name.rfind("_");
if (up != string::npos && up > sp + 1) {
int64_t start;
int64_t end;
try {
start = stol(path_name.substr(sp + 5, up - sp - 5));
end = stol(path_name.substr(up + 1));
} catch (...) {
return;
}
start = stol(path_name.substr(sp + 5, up - sp - 5));
end = stol(path_name.substr(up + 1));
stringstream new_name;
start_offset += start; // final offset is sum of all nested offsets
if (first_length == 0) {
Expand All @@ -797,12 +808,30 @@ void resolve_subpath_naming(string& path_name) {
// be derived from the start, plus the length of the "top" path
end = start_offset + first_length;
}
new_name << path_name.substr(0, sp) << "[" << start_offset << "-" << end << "]";
new_name << path_name.substr(0, sp);
path_name = new_name.str();
found_subpath = true;
}
} else {
break;
}
}
return;
if (found_subpath) {
return make_pair(start_offset, start_offset + first_length);
} else {
return PathMetadata::NO_SUBRANGE;
}
}

size_t resolve_haplotype_naming(string& genome_name) {
size_t haplotype = PathMetadata::NO_HAPLOTYPE;
size_t dp = genome_name.rfind(".");
if (dp != string::npos) {
try {
haplotype = stol(genome_name.substr(dp + 1));
genome_name = genome_name.substr(0, dp);
} catch(...) {
}
}
return haplotype;
}
Loading

0 comments on commit 576784d

Please sign in to comment.