From 3e13be62187b232b1cfc41e59c96b56a61415769 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 7 Jul 2018 12:53:26 +0100 Subject: [PATCH 01/15] When an alignment runs off the end of a target sequence, we now just flag a warning, rather than erroring. --- configure.ac | 2 +- lib/src/junction.cc | 77 ++++++++++++++++++++++++--------------------- src/portcullis.cc | 47 ++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 41 deletions(-) diff --git a/configure.ac b/configure.ac index 346aff51..3c44f25a 100644 --- a/configure.ac +++ b/configure.ac @@ -179,7 +179,7 @@ fi AM_CONDITIONAL([MAKE_DOCS], [test x$sphinx = xyes]) AC_SUBST([MAKE_DOCS]) -AM_CXXFLAGS="-g -O2 -std=c++11" +AM_CXXFLAGS="-g3 -gdwarf-2 -O0 -std=c++11" AC_SUBST([AM_CXXFLAGS]) AM_CPPFLAGS="${PYTHON_CPPFLAGS}" diff --git a/lib/src/junction.cc b/lib/src/junction.cc index 7d41e0a4..86a3e812 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -162,46 +162,51 @@ void portcullis::AlignmentInfo::calcMatchStats(const Intron& i, const uint32_t l string qAnchorRight = ba->getPaddedQuerySeq(query, rightStart, rightEnd, qRightStart, qRightEnd, false); string gAnchorLeft = ba->getPaddedGenomeSeq(ancLeft, leftStart, leftEnd, qLeftStart, qLeftEnd, false); string gAnchorRight = ba->getPaddedGenomeSeq(ancRight, rightStart, rightEnd, qRightStart, qRightEnd, false); + bool error = false; if (qAnchorLeft.size() != gAnchorLeft.size() || qAnchorLeft.empty()) { - BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string( - "Left anchor region for query and genome are not the same size.") - + "\nIntron: " + i.toString() - + "\nJunction anchor limits: " + lexical_cast(leftStart) + "," + lexical_cast(rightEnd) - + "\nGenomic sequence: " + ancLeft - + "\nAlignment coords (before soft clipping): " + ba->toString(false) - + "\nAlignment coords (after soft clipping): " + ba->toString(true) - + "\nRead name: " + ba->deriveName() - + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast(ba->getQuerySeq().size()) + ")" - + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast(ba->getQuerySeqAfterClipping().size()) + ")" - + "\nCigar: " + ba->getCigarAsString() - + "\nLeft Anchor query seq: \n" + qAnchorLeft + " (" + lexical_cast(qAnchorLeft.size()) + ")" - + "\nLeft Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast(gAnchorLeft.size()) + ")")); + error = true; + std::cerr << endl << "WARNING: Skipping problematic alignment:" << endl + << "Left anchor region for query and genome are not the same size." 
<< endl + << "Intron: " + i.toString() << endl + << "Junction anchor limits: " + lexical_cast(leftStart) + "," + lexical_cast(rightEnd) << endl + << "Genomic sequence: " + ancLeft << endl + << "Alignment coords (before soft clipping): " + ba->toString(false) << endl + << "Alignment coords (after soft clipping): " + ba->toString(true) << endl + << "Read name: " + ba->deriveName() << endl + << "Read seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast(ba->getQuerySeq().size()) + ")" << endl + << "Read seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast(ba->getQuerySeqAfterClipping().size()) + ")" << endl + << "Cigar: " + ba->getCigarAsString() << endl + << "Left Anchor query seq: \n" + qAnchorLeft + " (" + lexical_cast(qAnchorLeft.size()) + ")" << endl + << "Left Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast(gAnchorLeft.size()) + ")" << endl << endl; } if (qAnchorRight.size() != gAnchorRight.size() || qAnchorRight.empty()) { - BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string( - "Right Anchor region for query and genome are not the same size.") - + "\nIntron: " + i.toString() - + "\nJunction anchor limits: " + lexical_cast(leftStart) + "," + lexical_cast(rightEnd) - + "\nGenomic sequence: " + ancRight - + "\nAlignment coords (before soft clipping): " + ba->toString(false) - + "\nAlignment coords (after soft clipping): " + ba->toString(true) - + "\nRead name: " + ba->deriveName() - + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast(ba->getQuerySeq().size()) + ")" - + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast(ba->getQuerySeqAfterClipping().size()) + ")" - + "\nCigar: " + ba->getCigarAsString() - + "\nRight Anchor query seq: \n" + qAnchorRight + " (" + lexical_cast(qAnchorRight.size()) + ")" - + "\nRight Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast(gAnchorRight.size()) + ")")); + error = true; + std::cerr << endl << "WARNING: Skipping problematic alignment:" << endl + << "Right Anchor region for query and genome are not the same size." 
<< endl + << "Intron: " + i.toString() << endl + << "Junction anchor limits: " + lexical_cast(leftStart) + "," + lexical_cast(rightEnd) << endl + << "Genomic sequence: " + ancRight << endl + << "Alignment coords (before soft clipping): " + ba->toString(false) << endl + << "Alignment coords (after soft clipping): " + ba->toString(true) << endl + << "Read name: " + ba->deriveName() << endl + << "Read seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast(ba->getQuerySeq().size()) + ")" << endl + << "Read seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast(ba->getQuerySeqAfterClipping().size()) + ")" << endl + << "Cigar: " + ba->getCigarAsString() << endl + << "Right Anchor query seq: \n" + qAnchorRight + " (" + lexical_cast(qAnchorRight.size()) + ")" << endl + << "Right Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast(gAnchorRight.size()) + ")" << endl << endl; + } + if (!error) { + totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft); + totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight); + totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches; + totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches; + nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches; + upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft); + downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight); + minMatch = min(upstreamMatches, downstreamMatches); + maxMatch = max(upstreamMatches, downstreamMatches); + mmes = min(totalUpstreamMatches, totalDownstreamMatches); } - totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft); - totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight); - totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches; - totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches; - nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches; - upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft); - downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight); - minMatch = min(upstreamMatches, downstreamMatches); - maxMatch = max(upstreamMatches, downstreamMatches); - mmes = min(totalUpstreamMatches, totalDownstreamMatches); } } diff --git a/src/portcullis.cc b/src/portcullis.cc index 0154f168..3cda9bfc 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -31,7 +31,6 @@ using std::string; using std::cout; using std::cerr; using std::endl; -using std::exception; #include #include @@ -73,6 +72,43 @@ enum class Mode { FULL }; +void print_backtrace(int depth = 0) { + try { + throw; + } + // this block shows how to handle exceptions of some known type + // You can have your own types instead of std::exception + catch (const std::exception & e) { + std::cerr << std::string(depth, ' ') << e.what() << std::endl; + try { + std::rethrow_if_nested(e); + } catch (...) { + print_backtrace(++depth); + } + } + // Not all nesting exceptions will be of a known type, but if they use the + // mixin type std::nested_exception, then we can at least handle them enough to + // get the nested exception: + catch (const std::nested_exception & ne) { + std::cerr << std::string(depth, ' ') << "Unknown nesting exception\n"; + + try { + ne.rethrow_nested(); + } catch (...) 
{ + print_backtrace(++depth); + } + } + // Exception nesting works through inheritance, which means that if you + // can't inherit from the type, then you can't 'mixin' std::nesting exception. + // If you try something like std::throw_with_nested( int{10} ); Then you'll + // hit this catch block when printing the backtrace. + catch (...) { + std::cerr << std::string(depth, ' ') << "Unknown exception\n"; + } +} + + + Mode parseMode(string mode) { string upperMode = boost::to_upper_copy(mode); if (upperMode == string("PREP") || upperMode == string("PREPARE")) { @@ -474,20 +510,23 @@ int main(int argc, char *argv[]) { cerr << "Error: Parsing Command Line: " << e.what() << endl; return 1; } - catch (boost::exception &e) { + catch (boost::exception& e) { cerr << boost::diagnostic_information(e); - return 4; + return 4; } - catch (exception& e) { + catch (std::exception& e) { cerr << "Error: " << e.what() << endl; + print_backtrace(); return 5; } catch (const char* msg) { cerr << "Error: " << msg << endl; + print_backtrace(); return 6; } catch (...) { cerr << "Error: Exception of unknown type!" << endl; + print_backtrace(); return 7; } return 0; From 91155d124ef33cb236ee614b76f9ece328f4814a Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 7 Jul 2018 13:19:41 +0100 Subject: [PATCH 02/15] Resetting compiler flags --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 3c44f25a..346aff51 100644 --- a/configure.ac +++ b/configure.ac @@ -179,7 +179,7 @@ fi AM_CONDITIONAL([MAKE_DOCS], [test x$sphinx = xyes]) AC_SUBST([MAKE_DOCS]) -AM_CXXFLAGS="-g3 -gdwarf-2 -O0 -std=c++11" +AM_CXXFLAGS="-g -O2 -std=c++11" AC_SUBST([AM_CXXFLAGS]) AM_CPPFLAGS="${PYTHON_CPPFLAGS}" From b4cbb4cea45960cd038daf7456acfbc10b2f3c65 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 7 Jul 2018 13:20:42 +0100 Subject: [PATCH 03/15] Bumping version --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 346aff51..1cbd1317 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[1.1.1],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk]) +AC_INIT([portcullis],[1.1.2],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) From a3b8cbdf869da964cd5fbe78fcedfce7fe607393 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 21 Jul 2018 14:43:15 +0100 Subject: [PATCH 04/15] Adding option to control precision of portcullis --- Makefile.am | 30 +- .../selftrain_initial_neg.layer1.json | 0 .../selftrain_initial_neg.layer2.json | 0 .../selftrain_initial_neg.layer3.json | 0 .../selftrain_initial_neg.layer4.json | 0 .../selftrain_initial_neg.layer5.json | 0 .../selftrain_initial_neg.layer6.json | 0 .../selftrain_initial_neg.layer7.json | 0 .../selftrain_initial_pos.layer1.json | 0 .../selftrain_initial_pos.layer2.json | 0 .../selftrain_initial_pos.layer3.json | 16 - .../precise/selftrain_initial_neg.layer1.json | 17 + .../precise/selftrain_initial_neg.layer2.json | 25 + .../precise/selftrain_initial_neg.layer3.json | 13 + .../precise/selftrain_initial_neg.layer4.json | 13 + .../precise/selftrain_initial_neg.layer5.json | 21 + .../precise/selftrain_initial_neg.layer6.json | 13 + .../precise/selftrain_initial_neg.layer7.json | 17 + .../precise/selftrain_initial_pos.layer1.json | 37 + .../precise/selftrain_initial_pos.layer2.json | 
45 + .../precise/selftrain_initial_pos.layer3.json | 65 ++ .../build/src/engine/bootstrap/.gitignore | 1 - doc/source/conf.py | 4 +- src/junction_filter.cc | 983 +++++++++--------- src/junction_filter.hpp | 616 +++++------ src/portcullis.cc | 718 +++++++------ 26 files changed, 1438 insertions(+), 1196 deletions(-) rename data/{ => balanced}/selftrain_initial_neg.layer1.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer2.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer3.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer4.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer5.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer6.json (100%) rename data/{ => balanced}/selftrain_initial_neg.layer7.json (100%) rename data/{ => balanced}/selftrain_initial_pos.layer1.json (100%) rename data/{ => balanced}/selftrain_initial_pos.layer2.json (100%) rename data/{ => balanced}/selftrain_initial_pos.layer3.json (81%) create mode 100755 data/precise/selftrain_initial_neg.layer1.json create mode 100755 data/precise/selftrain_initial_neg.layer2.json create mode 100755 data/precise/selftrain_initial_neg.layer3.json create mode 100755 data/precise/selftrain_initial_neg.layer4.json create mode 100755 data/precise/selftrain_initial_neg.layer5.json create mode 100755 data/precise/selftrain_initial_neg.layer6.json create mode 100755 data/precise/selftrain_initial_neg.layer7.json create mode 100755 data/precise/selftrain_initial_pos.layer1.json create mode 100755 data/precise/selftrain_initial_pos.layer2.json create mode 100755 data/precise/selftrain_initial_pos.layer3.json delete mode 100644 deps/boost/tools/build/src/engine/bootstrap/.gitignore diff --git a/Makefile.am b/Makefile.am index dda1ab58..f9aaf6c7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,16 +22,26 @@ configdir = $(datadir)/portcullis dist_config_DATA = \ data/default_filter.json \ data/low_juncs_filter.json \ - data/selftrain_initial_neg.layer1.json \ - data/selftrain_initial_neg.layer2.json \ - data/selftrain_initial_neg.layer3.json \ - data/selftrain_initial_neg.layer4.json \ - data/selftrain_initial_neg.layer5.json \ - data/selftrain_initial_neg.layer6.json \ - data/selftrain_initial_neg.layer7.json \ - data/selftrain_initial_pos.layer1.json \ - data/selftrain_initial_pos.layer2.json \ - data/selftrain_initial_pos.layer3.json + data/balanced/selftrain_initial_neg.layer1.json \ + data/balanced/selftrain_initial_neg.layer2.json \ + data/balanced/selftrain_initial_neg.layer3.json \ + data/balanced/selftrain_initial_neg.layer4.json \ + data/balanced/selftrain_initial_neg.layer5.json \ + data/balanced/selftrain_initial_neg.layer6.json \ + data/balanced/selftrain_initial_neg.layer7.json \ + data/balanced/selftrain_initial_pos.layer1.json \ + data/balanced/selftrain_initial_pos.layer2.json \ + data/balanced/selftrain_initial_pos.layer3.json \ + data/precise/selftrain_initial_neg.layer1.json \ + data/precise/selftrain_initial_neg.layer2.json \ + data/precise/selftrain_initial_neg.layer3.json \ + data/precise/selftrain_initial_neg.layer4.json \ + data/precise/selftrain_initial_neg.layer5.json \ + data/precise/selftrain_initial_neg.layer6.json \ + data/precise/selftrain_initial_neg.layer7.json \ + data/precise/selftrain_initial_pos.layer1.json \ + data/precise/selftrain_initial_pos.layer2.json \ + data/precise/selftrain_initial_pos.layer3.json # SRC DIRS make_dirs=deps/htslib-1.3 deps/ranger-0.3.8 lib src tests scripts diff --git a/data/selftrain_initial_neg.layer1.json 
b/data/balanced/selftrain_initial_neg.layer1.json similarity index 100% rename from data/selftrain_initial_neg.layer1.json rename to data/balanced/selftrain_initial_neg.layer1.json diff --git a/data/selftrain_initial_neg.layer2.json b/data/balanced/selftrain_initial_neg.layer2.json similarity index 100% rename from data/selftrain_initial_neg.layer2.json rename to data/balanced/selftrain_initial_neg.layer2.json diff --git a/data/selftrain_initial_neg.layer3.json b/data/balanced/selftrain_initial_neg.layer3.json similarity index 100% rename from data/selftrain_initial_neg.layer3.json rename to data/balanced/selftrain_initial_neg.layer3.json diff --git a/data/selftrain_initial_neg.layer4.json b/data/balanced/selftrain_initial_neg.layer4.json similarity index 100% rename from data/selftrain_initial_neg.layer4.json rename to data/balanced/selftrain_initial_neg.layer4.json diff --git a/data/selftrain_initial_neg.layer5.json b/data/balanced/selftrain_initial_neg.layer5.json similarity index 100% rename from data/selftrain_initial_neg.layer5.json rename to data/balanced/selftrain_initial_neg.layer5.json diff --git a/data/selftrain_initial_neg.layer6.json b/data/balanced/selftrain_initial_neg.layer6.json similarity index 100% rename from data/selftrain_initial_neg.layer6.json rename to data/balanced/selftrain_initial_neg.layer6.json diff --git a/data/selftrain_initial_neg.layer7.json b/data/balanced/selftrain_initial_neg.layer7.json similarity index 100% rename from data/selftrain_initial_neg.layer7.json rename to data/balanced/selftrain_initial_neg.layer7.json diff --git a/data/selftrain_initial_pos.layer1.json b/data/balanced/selftrain_initial_pos.layer1.json similarity index 100% rename from data/selftrain_initial_pos.layer1.json rename to data/balanced/selftrain_initial_pos.layer1.json diff --git a/data/selftrain_initial_pos.layer2.json b/data/balanced/selftrain_initial_pos.layer2.json similarity index 100% rename from data/selftrain_initial_pos.layer2.json rename to data/balanced/selftrain_initial_pos.layer2.json diff --git a/data/selftrain_initial_pos.layer3.json b/data/balanced/selftrain_initial_pos.layer3.json similarity index 81% rename from data/selftrain_initial_pos.layer3.json rename to data/balanced/selftrain_initial_pos.layer3.json index 9d199312..baf27848 100755 --- a/data/selftrain_initial_pos.layer3.json +++ b/data/balanced/selftrain_initial_pos.layer3.json @@ -12,22 +12,6 @@ "operator": "in", "value": ["N"] }, - "nb_rel_aln.1": { - "operator": "gte", - "value": 5 - }, - "nb_rel_aln.2": { - "operator": "gte", - "value": 1 - }, - "maxmmes.1": { - "operator": "gte", - "value": 20 - }, - "maxmmes.2": { - "operator": "gte", - "value": 10 - }, "entropy.1": { "operator": "gt", "value": 3.0 diff --git a/data/precise/selftrain_initial_neg.layer1.json b/data/precise/selftrain_initial_neg.layer1.json new file mode 100755 index 00000000..64becb94 --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer1.json @@ -0,0 +1,17 @@ +{ + "parameters": { + "maxmmes": { + "operator": "lt", + "value": 15 + }, + "nb_us_aln": { + "operator": "lte", + "value": 1 + }, + "rel2raw": { + "operator": "eq", + "value": 0 + } + }, + "expression": "( maxmmes & nb_us_aln & rel2raw )" +} diff --git a/data/precise/selftrain_initial_neg.layer2.json b/data/precise/selftrain_initial_neg.layer2.json new file mode 100755 index 00000000..052e6d2d --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer2.json @@ -0,0 +1,25 @@ +{ + "parameters": { + "canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + 
"rel2raw": { + "operator": "lt", + "value": 0.5 + }, + "mean_mismatches": { + "operator": "gte", + "value": 1 + }, + "maxmmes": { + "operator": "lt", + "value": 15 + }, + "nb_us_aln": { + "operator": "lte", + "value": 1 + } + }, + "expression": "( nb_us_aln & maxmmes & rel2raw & ( canonical_ss | mean_mismatches ) )" +} diff --git a/data/precise/selftrain_initial_neg.layer3.json b/data/precise/selftrain_initial_neg.layer3.json new file mode 100755 index 00000000..b6633c56 --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer3.json @@ -0,0 +1,13 @@ +{ + "parameters": { + "canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "pfp": { + "operator": "eq", + "value": 1 + } + }, + "expression": "( canonical_ss & pfp )" +} diff --git a/data/precise/selftrain_initial_neg.layer4.json b/data/precise/selftrain_initial_neg.layer4.json new file mode 100755 index 00000000..9d1a66d8 --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer4.json @@ -0,0 +1,13 @@ +{ + "parameters": { + "maxmmes": { + "operator": "lt", + "value": 15 + }, + "rel2raw": { + "operator": "lt", + "value": 0.3 + } + }, + "expression": "( maxmmes & rel2raw )" +} diff --git a/data/precise/selftrain_initial_neg.layer5.json b/data/precise/selftrain_initial_neg.layer5.json new file mode 100755 index 00000000..9e036fa3 --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer5.json @@ -0,0 +1,21 @@ +{ + "parameters": { + "nb_rel_aln": { + "operator": "lt", + "value": 1 + }, + "entropy": { + "operator": "eq", + "value": 0 + }, + "primary_junc": { + "operator": "eq", + "value": 0 + }, + "suspicious": { + "operator": "eq", + "value": 1 + } + }, + "expression": "( nb_rel_aln & entropy & primary_junc & suspicious )" +} diff --git a/data/precise/selftrain_initial_neg.layer6.json b/data/precise/selftrain_initial_neg.layer6.json new file mode 100755 index 00000000..b6633c56 --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer6.json @@ -0,0 +1,13 @@ +{ + "parameters": { + "canonical_ss": { + "operator": "in", + "value": ["N", "S"] + }, + "pfp": { + "operator": "eq", + "value": 1 + } + }, + "expression": "( canonical_ss & pfp )" +} diff --git a/data/precise/selftrain_initial_neg.layer7.json b/data/precise/selftrain_initial_neg.layer7.json new file mode 100755 index 00000000..44c876cb --- /dev/null +++ b/data/precise/selftrain_initial_neg.layer7.json @@ -0,0 +1,17 @@ +{ + "parameters": { + "rel2raw": { + "operator": "eq", + "value": 0.0 + }, + "hamming5p": { + "operator": "lte", + "value": 3 + }, + "hamming3p": { + "operator": "lte", + "value": 3 + } + }, + "expression": "( rel2raw & hamming5p & hamming3p )" +} diff --git a/data/precise/selftrain_initial_pos.layer1.json b/data/precise/selftrain_initial_pos.layer1.json new file mode 100755 index 00000000..4699b6ac --- /dev/null +++ b/data/precise/selftrain_initial_pos.layer1.json @@ -0,0 +1,37 @@ +{ + "parameters": { + "nb_rel_aln": { + "operator": "gte", + "value": 1 + }, + "maxmmes": { + "operator": "gte", + "value": 8 + }, + "entropy": { + "operator": "gt", + "value": 1.0 + }, + "hamming5p": { + "operator": "gte", + "value": 4 + }, + "hamming3p": { + "operator": "gte", + "value": 4 + }, + "mean_mismatches": { + "operator": "lte", + "value": 1.0 + }, + "nb_us_aln": { + "operator": "gte", + "value": 1 + }, + "rel2raw": { + "operator": "gte", + "value": 0.25 + } + }, + "expression": "nb_rel_aln & hamming5p & hamming3p & maxmmes & nb_us_aln & mean_mismatches & rel2raw" +} diff --git a/data/precise/selftrain_initial_pos.layer2.json 
b/data/precise/selftrain_initial_pos.layer2.json new file mode 100755 index 00000000..ce0f4596 --- /dev/null +++ b/data/precise/selftrain_initial_pos.layer2.json @@ -0,0 +1,45 @@ +{ + "parameters": { + "nb_rel_aln.1": { + "operator": "gte", + "value": 5 + }, + "nb_rel_aln.2": { + "operator": "gte", + "value": 3 + }, + "maxmmes.1": { + "operator": "gte", + "value": 20 + }, + "maxmmes.2": { + "operator": "gt", + "value": 12 + }, + "hamming5p.1": { + "operator": "gte", + "value": 7 + }, + "hamming5p.2": { + "operator": "gte", + "value": 9 + }, + "hamming3p.1": { + "operator": "gte", + "value": 7 + }, + "hamming3p.2": { + "operator": "gte", + "value": 9 + }, + "mean_mismatches.1": { + "operator": "lte", + "value": 0 + }, + "mean_mismatches.2": { + "operator": "lt", + "value": 0.33 + } + }, + "expression": "( nb_rel_aln.1 & maxmmes.1 ) | ( nb_rel_aln.2 & maxmmes.2 & hamming5p.1 & hamming3p.1 & mean_mismatches.2 ) | ( hamming5p.2 & hamming3p.2 & mean_mismatches.1 )" +} diff --git a/data/precise/selftrain_initial_pos.layer3.json b/data/precise/selftrain_initial_pos.layer3.json new file mode 100755 index 00000000..baf27848 --- /dev/null +++ b/data/precise/selftrain_initial_pos.layer3.json @@ -0,0 +1,65 @@ +{ + "parameters": { + "canonical_ss.1": { + "operator": "in", + "value": ["C"] + }, + "canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "entropy.1": { + "operator": "gt", + "value": 3.0 + }, + "entropy.2": { + "operator": "gt", + "value": 1.5 + }, + "hamming5p.1": { + "operator": "gte", + "value": 6 + }, + "hamming5p.2": { + "operator": "gte", + "value": 7 + }, + "hamming3p.1": { + "operator": "gte", + "value": 6 + }, + "hamming3p.2": { + "operator": "gte", + "value": 7 + }, + "mean_mismatches.1": { + "operator": "eq", + "value": 0 + }, + "mean_mismatches.2": { + "operator": "lt", + "value": 0.1 + }, + "nb_us_aln": { + "operator": "gte", + "value": 5 + }, + "rel2raw.1": { + "operator": "gte", + "value": 0.5 + }, + "rel2raw.2": { + "operator": "gte", + "value": 0.75 + }, + "primary_junc": { + "operator": "eq", + "value": 1 + } + }, + "expression": "(( canonical_ss.1 ) | ( canonical_ss.2 & rel2raw.1 & hamming5p.1 & hamming3p.1 ) | ( canonical_ss.3 & rel2raw.2 & hamming5p.2 & hamming3p.2 & mean_mismatches.1 & entropy.2 )) & (primary_junc)" +} diff --git a/deps/boost/tools/build/src/engine/bootstrap/.gitignore b/deps/boost/tools/build/src/engine/bootstrap/.gitignore deleted file mode 100644 index ca6d17cc..00000000 --- a/deps/boost/tools/build/src/engine/bootstrap/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/jam0 diff --git a/doc/source/conf.py b/doc/source/conf.py index 8a489efb..2d3c7e2b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '1.1.1' +version = '1.1.2' # The full version, including alpha/beta/rc tags. -release = '1.1.1' +release = '1.1.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
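(Editor's note, not part of the patch.) The new data/precise rule files added above follow the same layout as the existing balanced set: a "parameters" map in which each named entry is a single comparison against one junction metric (the part of the name before any ".N" suffix), plus an "expression" that combines those named tests with & and |. Below is a minimal, hypothetical sketch of how such a rule JSON could be evaluated against one junction's metrics; the authoritative implementation is the portcullis/rule_filter.py script invoked from junction_filter.cc and may differ in detail. The operator names, the dotted-parameter convention, and the example metric values are assumptions inferred from the JSON files shown above, not taken from the patch.

# Hypothetical sketch: evaluate one rule JSON against one junction's metrics.
# The real logic lives in the repository's rule_filter.py; this only illustrates
# the "parameters" + "expression" structure of the files added in this patch.
import json
import operator

OPS = {
    "eq": operator.eq, "lt": operator.lt, "lte": operator.le,
    "gt": operator.gt, "gte": operator.ge,
    "in": lambda value, allowed: value in allowed,
}

def passes(rule_path, metrics):
    with open(rule_path) as fh:
        rule = json.load(fh)
    results = {}
    for name, cfg in rule["parameters"].items():
        metric = name.split(".")[0]  # e.g. "maxmmes.1" tests the "maxmmes" metric
        results[name] = OPS[cfg["operator"]](metrics[metric], cfg["value"])
    # Substitute each parameter name in the expression with its boolean result,
    # longest names first so "maxmmes.1" is not clobbered by "maxmmes".
    expr = rule["expression"]
    for name in sorted(results, key=len, reverse=True):
        expr = expr.replace(name, str(results[name]))
    return eval(expr.replace("&", " and ").replace("|", " or "))

# Example with invented metric values: a junction with a short maximal match,
# at most one uniquely spliced alignment and no reliable alignments is caught
# by the first "precise" negative layer.
metrics = {"maxmmes": 12, "nb_us_aln": 1, "rel2raw": 0.0}
print(passes("data/precise/selftrain_initial_neg.layer1.json", metrics))  # True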
diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 816842fc..66ec2d57 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -67,160 +67,158 @@ using portcullis::PyHelper; #include "prepare.hpp" portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _junctionFile, - const path& _output) { - junctionFile = _junctionFile; - prepData.setPrepDir(_prepDir); - modelFile = ""; - genuineFile = ""; - output = _output; - filterFile = ""; - referenceFile = ""; - saveBad = false; - threads = 1; - maxLength = 0; - filterCanonical = false; - filterSemi = false; - filterNovel = false; - source = DEFAULT_FILTER_SOURCE; - verbose = false; - threshold = DEFAULT_FILTER_THRESHOLD; - smote = true; - enn = true; + const path& _output, bool _precise): precise(_precise) { + junctionFile = _junctionFile; + prepData.setPrepDir(_prepDir); + modelFile = ""; + genuineFile = ""; + output = _output; + filterFile = ""; + referenceFile = ""; + saveBad = false; + threads = 1; + maxLength = 0; + filterCanonical = false; + filterSemi = false; + filterNovel = false; + source = DEFAULT_FILTER_SOURCE; + verbose = false; + threshold = DEFAULT_FILTER_THRESHOLD; + smote = true; + enn = true; } void portcullis::JunctionFilter::filter() { - path outputDir = output.parent_path(); - string outputPrefix = output.leaf().string(); - if (outputDir.empty()) { - outputDir = "."; - } - // Test if provided genome exists - if (!exists(junctionFile)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find junction file at: ") + junctionFile.string())); - } - if (!bfs::exists(prepData.getGenomeFilePath())) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); - } - // Test if provided filter config file exists - if (!modelFile.empty() && !exists(modelFile)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter model file at: ") + modelFile.string())); - } - if (!filterFile.empty() && !exists(filterFile)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter configuration file at: ") + filterFile.string())); - } - if (!genuineFile.empty() && !exists(genuineFile)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find file containing marked junction labels at: ") + genuineFile.string())); - } - if (!referenceFile.empty() && !exists(referenceFile)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find reference BED file at: ") + referenceFile.string())); - } - if (!exists(outputDir)) { - if (!bfs::create_directories(outputDir)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not create output directory at: ") + outputDir.string())); - } - } - else if (!bfs::is_directory(outputDir)) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "File exists with name of suggested output directory: ") + outputDir.string())); - } - cout << "Loading junctions from " << junctionFile.string() << " ..."; - cout.flush(); - // Load junction system - JunctionSystem originalJuncs(junctionFile); - cout << " done." << endl - << "Found " << originalJuncs.getJunctions().size() << " junctions." 
<< endl << endl; + path outputDir = output.parent_path(); + string outputPrefix = output.leaf().string(); + if (outputDir.empty()) { + outputDir = "."; + } + // Test if provided genome exists + if (!exists(junctionFile)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find junction file at: ") + junctionFile.string())); + } + if (!bfs::exists(prepData.getGenomeFilePath())) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); + } + // Test if provided filter config file exists + if (!modelFile.empty() && !exists(modelFile)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find filter model file at: ") + modelFile.string())); + } + if (!filterFile.empty() && !exists(filterFile)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find filter configuration file at: ") + filterFile.string())); + } + if (!genuineFile.empty() && !exists(genuineFile)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find file containing marked junction labels at: ") + genuineFile.string())); + } + if (!referenceFile.empty() && !exists(referenceFile)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find reference BED file at: ") + referenceFile.string())); + } + if (!exists(outputDir)) { + if (!bfs::create_directories(outputDir)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not create output directory at: ") + outputDir.string())); + } + } else if (!bfs::is_directory(outputDir)) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "File exists with name of suggested output directory: ") + outputDir.string())); + } + cout << "Loading junctions from " << junctionFile.string() << " ..."; + cout.flush(); + // Load junction system + JunctionSystem originalJuncs(junctionFile); + cout << " done." << endl + << "Found " << originalJuncs.getJunctions().size() << " junctions." << endl << endl; // Also keep a shortcut to the list of junctions JunctionList currentJuncs = originalJuncs.getJunctions(); - unordered_set ref; - if (!referenceFile.empty()) { - cout << "Loading junctions from reference: " << referenceFile.string() << " ..."; - cout.flush(); - ifstream ifs(referenceFile.c_str()); - string line; - // Loop through until end of file or we move onto the next ref seq - while (std::getline(ifs, line)) { - boost::trim(line); - vector parts; // #2: Search for tokens - boost::split(parts, line, boost::is_any_of("\t"), boost::token_compress_on); - // Ignore any non-entry lines - if (parts.size() == 12) { - int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos - string key = parts[0] + "(" + parts[6] + "," + std::to_string(end) + ")" + parts[5]; - ref.insert(key); - } - } - cout << " done." << endl - << "Found " << ref.size() << " junctions in reference." << endl << endl; - } - vector genuine; - if (!genuineFile.empty()) { - cout << "Loading list of correct predictions of performance analysis ..."; - cout.flush(); - Performance::loadGenuine(genuineFile, genuine); - if (genuine.size() != originalJuncs.getJunctions().size()) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Genuine file contains ") + lexical_cast(genuine.size()) + - " entries. 
Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + - " junctions. The number of entries in both files must be the same to assess performance.")); - } - // Copy over results into junction list - for (size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { - originalJuncs.getJunctionAt(i)->setGenuine(genuine[i]); - } - cout << " done." << endl << endl; - } - - // To be overridden if we are training - ModelFeatures mf; - mf.initGenomeMapper(prepData.getGenomeFilePath()); - mf.features[1].active = false; // NB USRS (BAD) - mf.features[2].active = false; // NB DISTRS (BAD) - //mf.features[3].active=false; // NB RELRS (GOOD) - mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) - //mf.features[5].active = false; // REL2RAW (GOOD) - mf.features[6].active = false; // MAXMINANC (BAD - MAXMMES IS BETTER) - //mf.features[7].active=false; // MAXMMES (GOOD) - //mf.features[8].active=false; // MEAN MISMATCH (GOOD) - //mf.features[9].active=false; // INTRON (GOOD) - //mf.features[10].active=false; // MIN_HAMM (GOOD) - mf.features[11].active = false; // CODING POTENTIAL (BAD) - //mf.features[12].active=false; // POS WEIGHTS (GOOD) - //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) - /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY - mf.features[15].active=false; - mf.features[16].active=false; - mf.features[17].active=false; - mf.features[18].active=false; - mf.features[19].active=false; - mf.features[20].active=false; - mf.features[21].active=false; - mf.features[22].active=false; - mf.features[23].active=false; - mf.features[24].active=false; - mf.features[25].active=false; - mf.features[26].active=false; - mf.features[27].active=false; - mf.features[28].active=false; - mf.features[29].active=false; - */ - double ratio = 0.0; + unordered_set ref; + if (!referenceFile.empty()) { + cout << "Loading junctions from reference: " << referenceFile.string() << " ..."; + cout.flush(); + ifstream ifs(referenceFile.c_str()); + string line; + // Loop through until end of file or we move onto the next ref seq + while (std::getline(ifs, line)) { + boost::trim(line); + vector parts; // #2: Search for tokens + boost::split(parts, line, boost::is_any_of("\t"), boost::token_compress_on); + // Ignore any non-entry lines + if (parts.size() == 12) { + int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos + string key = parts[0] + "(" + parts[6] + "," + std::to_string(end) + ")" + parts[5]; + ref.insert(key); + } + } + cout << " done." << endl + << "Found " << ref.size() << " junctions in reference." << endl << endl; + } + vector genuine; + if (!genuineFile.empty()) { + cout << "Loading list of correct predictions of performance analysis ..."; + cout.flush(); + Performance::loadGenuine(genuineFile, genuine); + if (genuine.size() != originalJuncs.getJunctions().size()) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Genuine file contains ") + lexical_cast(genuine.size()) + + " entries. Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + + " junctions. The number of entries in both files must be the same to assess performance.")); + } + // Copy over results into junction list + for (size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { + originalJuncs.getJunctionAt(i)->setGenuine(genuine[i]); + } + cout << " done." 
<< endl << endl; + } + + // To be overridden if we are training + ModelFeatures mf; + mf.initGenomeMapper(prepData.getGenomeFilePath()); + mf.features[1].active = false; // NB USRS (BAD) + mf.features[2].active = false; // NB DISTRS (BAD) + //mf.features[3].active=false; // NB RELRS (GOOD) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + //mf.features[5].active = false; // REL2RAW (GOOD) + mf.features[6].active = false; // MAXMINANC (BAD - MAXMMES IS BETTER) + //mf.features[7].active=false; // MAXMMES (GOOD) + //mf.features[8].active=false; // MEAN MISMATCH (GOOD) + //mf.features[9].active=false; // INTRON (GOOD) + //mf.features[10].active=false; // MIN_HAMM (GOOD) + mf.features[11].active = false; // CODING POTENTIAL (BAD) + //mf.features[12].active=false; // POS WEIGHTS (GOOD) + //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) + /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY + mf.features[15].active=false; + mf.features[16].active=false; + mf.features[17].active=false; + mf.features[18].active=false; + mf.features[19].active=false; + mf.features[20].active=false; + mf.features[21].active=false; + mf.features[22].active=false; + mf.features[23].active=false; + mf.features[24].active=false; + mf.features[25].active=false; + mf.features[26].active=false; + mf.features[27].active=false; + mf.features[28].active=false; + mf.features[29].active=false; + */ + double ratio = 0.0; if (train) { if (currentJuncs.size() < 200) { cout << "Less that 200 junctions found in input set. This is not enough to build a trained model. Will apply a lenient rule-based filter instead." << endl; filterFile = path(dataDir.string()); filterFile /= "low_juncs_filter.json"; - } - else { + } else { // The initial positive and negative sets JunctionList unlabelled, unlabelled2; cout << "Self training mode activated." << endl << endl; @@ -228,27 +226,29 @@ void portcullis::JunctionFilter::filter() { path rf_script = path("portcullis") / "rule_filter.py"; vector args; args.push_back(rf_script.string()); + + string ruleset = dataDir.string() + (precise ? 
"/precise" : "/balanced"); args.push_back("--pos_json"); - args.push_back(dataDir.string() + "/selftrain_initial_pos.layer1.json"); - args.push_back(dataDir.string() + "/selftrain_initial_pos.layer2.json"); - args.push_back(dataDir.string() + "/selftrain_initial_pos.layer3.json"); + args.push_back(ruleset + "/selftrain_initial_pos.layer1.json"); + args.push_back(ruleset + "/selftrain_initial_pos.layer2.json"); + args.push_back(ruleset + "/selftrain_initial_pos.layer3.json"); args.push_back("--neg_json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer1.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer2.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer3.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer4.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer5.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer6.json"); - args.push_back(dataDir.string() + "/selftrain_initial_neg.layer7.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer1.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer2.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer3.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer4.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer5.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer6.json"); + args.push_back(ruleset + "/selftrain_initial_neg.layer7.json"); args.push_back("--prefix=" + output.string() + ".selftrain.initialset"); args.push_back(junctionFile.string()); char* char_args[50]; - for(size_t i = 0; i < args.size(); i++) { + for (size_t i = 0; i < args.size(); i++) { char_args[i] = strdup(args[i].c_str()); } @@ -257,7 +257,7 @@ void portcullis::JunctionFilter::filter() { cout << "Executing python script with this command: " << rf_script.string() << " " << arg_str << endl; } - PyHelper::getInstance().execute(rf_script.string(), (int)args.size(), char_args); + PyHelper::getInstance().execute(rf_script.string(), (int) args.size(), char_args); // Load junction system JunctionSystem posSystem(path(output.string() + ".selftrain.initialset.pos.junctions.tab")); @@ -281,8 +281,7 @@ void portcullis::JunctionFilter::filter() { cout << "Training set is of insufficient size to reliably use machine learning, we will filter junctions using a lenient rule-based filter instead." << endl; filterFile = path(dataDir.string()); filterFile /= "low_juncs_filter.json"; - } - else { + } else { ratio = 1.0 - ((double) pos.size() / (double) (pos.size() + neg.size())); cout << "Pos to neg ratio: " << ratio << endl << endl; @@ -303,7 +302,7 @@ void portcullis::JunctionFilter::filter() { // Double check we got that correctly if (!foundL95) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Problem loading L95 value from disk: " + output.string() + ".L95_intron_size.txt"))); + "Problem loading L95 value from disk: " + output.string() + ".L95_intron_size.txt"))); } cout << "Confirming intron length L95 is: " << mf.L95 << endl; @@ -314,38 +313,37 @@ void portcullis::JunctionFilter::filter() { cout << " done." 
<< endl << endl; cout << "Training Random Forest" << endl - << "----------------------" << endl << endl; + << "----------------------" << endl << endl; shared_ptr forest = mf.trainInstance(pos, neg, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true, smote, enn, saveFeatures); forest->saveToFile(); modelFile = output.string() + ".selftrain.forest"; cout << endl; } } - } - // Manage a junction system of all discarded junctions - JunctionSystem discardedJuncs; - // Do ML based filtering if requested - if (!modelFile.empty() && exists(modelFile)) { - cout << "Predicting valid junctions using random forest model" << endl - << "----------------------------------------------------" << endl << endl; - JunctionList passJuncs; - JunctionList failJuncs; - forestPredict(currentJuncs, passJuncs, failJuncs, mf); - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); - // Reset currentJuncs - currentJuncs.clear(); - for (auto & j : passJuncs) { - currentJuncs.push_back(j); - } - for (auto & j : failJuncs) { - discardedJuncs.addJunction(j); - } - } + } + // Manage a junction system of all discarded junctions + JunctionSystem discardedJuncs; + // Do ML based filtering if requested + if (!modelFile.empty() && exists(modelFile)) { + cout << "Predicting valid junctions using random forest model" << endl + << "----------------------------------------------------" << endl << endl; + JunctionList passJuncs; + JunctionList failJuncs; + forestPredict(currentJuncs, passJuncs, failJuncs, mf); + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); + // Reset currentJuncs + currentJuncs.clear(); + for (auto & j : passJuncs) { + currentJuncs.push_back(j); + } + for (auto & j : failJuncs) { + discardedJuncs.addJunction(j); + } + } if (currentJuncs.empty()) { cout << "WARNING: No junctions left from input. Will not apply any further filters." << endl; - } - else { + } else { // Do rule based filtering if requested if (!filterFile.empty() && exists(filterFile)) { @@ -367,11 +365,11 @@ void portcullis::JunctionFilter::filter() { char* char_args[50]; - for(size_t i = 0; i < args.size(); i++) { + for (size_t i = 0; i < args.size(); i++) { char_args[i] = strdup(args[i].c_str()); } - PyHelper::getInstance().execute(rf_script.string(), (int)args.size(), char_args); + PyHelper::getInstance().execute(rf_script.string(), (int) args.size(), char_args); // Load junction system JunctionSystem posSystem(path(output.string() + ".rules_out.passed.junctions.tab")); @@ -392,8 +390,7 @@ void portcullis::JunctionFilter::filter() { if (currentJuncs.empty()) { cout << "WARNING: Rule-based filter discarded all junctions from input. Will not apply any further filters." << endl; - } - else { + } else { if (maxLength > 0 || this->doCanonicalFiltering() || minCov > 1) { JunctionList passJuncs; @@ -421,8 +418,7 @@ void portcullis::JunctionFilter::filter() { } if (pass) { passJuncs.push_back(j); - } - else { + } else { failJuncs.push_back(j); discardedJuncs.addJunction(j); } @@ -436,351 +432,348 @@ void portcullis::JunctionFilter::filter() { } } } - cout << endl; - JunctionSystem filteredJuncs; - JunctionSystem refKeptJuncs; - if (currentJuncs.empty()) { - cout << "WARNING: Filters discarded all junctions from input." 
<< endl; - } - else { - cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; - cout.flush(); - for (auto & j : currentJuncs) { - filteredJuncs.addJunction(j); - } - uint32_t inref = 0; - if (!referenceFile.empty()) { - for (auto & j : currentJuncs) { - if (ref.count(j->locationAsString()) > 0) { - inref++; - } - } - for (auto & j : discardedJuncs.getJunctions()) { - if (ref.count(j->locationAsString()) > 0) { - filteredJuncs.addJunction(j); - refKeptJuncs.addJunction(j); - inref++; - } - } - } - filteredJuncs.calcJunctionStats(); - cout << " done." << endl << endl; - if (!referenceFile.empty()) { - cout << "Brought back " << refKeptJuncs.size() << " junctions that were discarded by filters but were present in reference file." << endl; - cout << "Your sample contains " << inref << " / " << ref.size() << " (" << ((double) inref / (double) ref.size()) * 100.0 << "%) junctions from the reference." << endl << endl; - } - } - printFilteringResults(originalJuncs.getJunctions(), - filteredJuncs.getJunctions(), - discardedJuncs.getJunctions(), - string("Overall results")); - cout << endl << "Saving junctions passing filter to disk:" << endl; - filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass", true, this->outputExonGFF, this->outputIntronGFF); - if (saveBad) { - cout << "Saving junctions failing filter to disk:" << endl; - discardedJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".fail", source + "_fail", true, this->outputExonGFF, this->outputIntronGFF); - if (!referenceFile.empty()) { - cout << "Saving junctions failing filters but present in reference:" << endl; - refKeptJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".ref", source + "_ref", true, this->outputExonGFF, this->outputIntronGFF); - } - } + cout << endl; + JunctionSystem filteredJuncs; + JunctionSystem refKeptJuncs; + if (currentJuncs.empty()) { + cout << "WARNING: Filters discarded all junctions from input." << endl; + } else { + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; + cout.flush(); + for (auto & j : currentJuncs) { + filteredJuncs.addJunction(j); + } + uint32_t inref = 0; + if (!referenceFile.empty()) { + for (auto & j : currentJuncs) { + if (ref.count(j->locationAsString()) > 0) { + inref++; + } + } + for (auto & j : discardedJuncs.getJunctions()) { + if (ref.count(j->locationAsString()) > 0) { + filteredJuncs.addJunction(j); + refKeptJuncs.addJunction(j); + inref++; + } + } + } + filteredJuncs.calcJunctionStats(); + cout << " done." << endl << endl; + if (!referenceFile.empty()) { + cout << "Brought back " << refKeptJuncs.size() << " junctions that were discarded by filters but were present in reference file." << endl; + cout << "Your sample contains " << inref << " / " << ref.size() << " (" << ((double) inref / (double) ref.size()) * 100.0 << "%) junctions from the reference." 
<< endl << endl; + } + } + printFilteringResults(originalJuncs.getJunctions(), + filteredJuncs.getJunctions(), + discardedJuncs.getJunctions(), + string("Overall results")); + cout << endl << "Saving junctions passing filter to disk:" << endl; + filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass", true, this->outputExonGFF, this->outputIntronGFF); + if (saveBad) { + cout << "Saving junctions failing filter to disk:" << endl; + discardedJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".fail", source + "_fail", true, this->outputExonGFF, this->outputIntronGFF); + if (!referenceFile.empty()) { + cout << "Saving junctions failing filters but present in reference:" << endl; + refKeptJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".ref", source + "_ref", true, this->outputExonGFF, this->outputIntronGFF); + } + } } void portcullis::JunctionFilter::undersample(JunctionList& jl, size_t size) { - std::mt19937 rng(12345); - while (jl.size() > size) { - std::uniform_int_distribution gen(0, jl.size()); // uniform, unbiased - int i = gen(rng); - jl.erase(jl.begin() + i); - } + std::mt19937 rng(12345); + while (jl.size() > size) { + std::uniform_int_distribution gen(0, jl.size()); // uniform, unbiased + int i = gen(rng); + jl.erase(jl.begin() + i); + } } - void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { - // Output stats - size_t diff = in.size() - pass.size(); - cout << endl << prefix << endl - << "-------------------------" << endl - << "Input contained " << in.size() << " junctions." << endl - << "Output contains " << pass.size() << " junctions." << endl - << "Filtered out " << diff << " junctions." << endl; - if (!genuineFile.empty() && exists(genuineFile)) { - shared_ptr p = calcPerformance(pass, fail); - cout << Performance::longHeader() << endl; - cout << p->toLongString() << endl << endl; - } + // Output stats + size_t diff = in.size() - pass.size(); + cout << endl << prefix << endl + << "-------------------------" << endl + << "Input contained " << in.size() << " junctions." << endl + << "Output contains " << pass.size() << " junctions." << endl + << "Filtered out " << diff << " junctions." 
<< endl; + if (!genuineFile.empty() && exists(genuineFile)) { + shared_ptr p = calcPerformance(pass, fail); + cout << Performance::longHeader() << endl; + cout << p->toLongString() << endl << endl; + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { - uint32_t tp = 0, tn = 0, fp = 0, fn = 0; - if (invert) { - for (auto & j : pass) { - if (!j->isGenuine()) tn++; - else fn++; - } - for (auto & j : fail) { - if (j->isGenuine()) tp++; - else fp++; - } - } - else { - for (auto & j : pass) { - if (j->isGenuine()) tp++; - else fp++; - } - for (auto & j : fail) { - if (!j->isGenuine()) tn++; - else fn++; - } - } - return make_shared(tp, tn, fp, fn); + uint32_t tp = 0, tn = 0, fp = 0, fn = 0; + if (invert) { + for (auto & j : pass) { + if (!j->isGenuine()) tn++; + else fn++; + } + for (auto & j : fail) { + if (j->isGenuine()) tp++; + else fp++; + } + } else { + for (auto & j : pass) { + if (j->isGenuine()) tp++; + else fp++; + } + for (auto & j : fail) { + if (!j->isGenuine()) tn++; + else fn++; + } + } + return make_shared(tp, tn, fp, fn); } void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - cout << "Creating feature vector" << endl; - Data* testingData = mf.juncs2FeatureVectors(all); + cout << "Creating feature vector" << endl; + Data* testingData = mf.juncs2FeatureVectors(all); if (saveFeatures) { path feature_file = output.string() + ".features.testing"; ofstream fout(feature_file.c_str(), std::ofstream::out); fout << Intron::locationOutputHeader() << "\t" << testingData->getHeader() << endl; - for(size_t i = 0; i < testingData->getNumRows(); i++) { - fout << *(all[i]->getIntron()) << "\t"<< testingData->getRow(i) << endl; + for (size_t i = 0; i < testingData->getNumRows(); i++) { + fout << *(all[i]->getIntron()) << "\t" << testingData->getRow(i) << endl; } fout.close(); } - cout << "Initialising random forest" << endl; - shared_ptr f = make_shared(); - vector catVars; - f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - testingData, // Data object - 0, // M Try (0 == use default) - "", // Output prefix - DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) - 1234567890, // Seed for random generator - threads, // Number of threads - IMP_GINI, // Importance measure - train ? DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, - "", // Status var name - true, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - f->setVerboseOut(&cerr); - // Load trees from saved model - f->loadFromFile(modelFile.string()); - cout << "Making predictions" << endl; - f->run(verbose); - // Make sure score is saved back with the junction - for (size_t i = 0; i < all.size(); i++) { + cout << "Initialising random forest" << endl; + shared_ptr f = make_shared(); + vector catVars; + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + testingData, // Data object + 0, // M Try (0 == use default) + "", // Output prefix + DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator + threads, // Number of threads + IMP_GINI, // Importance measure + train ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, + "", // Status var name + true, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + f->setVerboseOut(&cerr); + // Load trees from saved model + f->loadFromFile(modelFile.string()); + cout << "Making predictions" << endl; + f->run(verbose); + // Make sure score is saved back with the junction + for (size_t i = 0; i < all.size(); i++) { //cout << f->getPredictions()[i][0] << endl; double score = 1.0 - f->getPredictions()[i][0]; all[i]->setScore(score); - } - if (!genuineFile.empty() && exists(genuineFile)) { - vector thresholds; - for (double i = 0.0; i <= 1.0; i += 0.01) { - thresholds.push_back(i); - } - double max_mcc = 0.0; - double max_f1 = 0.0; - double best_t_mcc = 0.0; - double best_t_f1 = 0.0; - cout << "Threshold\t" << Performance::longHeader() << endl; - for (auto & t : thresholds) { - JunctionList pjl; - JunctionList fjl; - categorise(f, all, pjl, fjl, t); - shared_ptr perf = calcPerformance(pjl, fjl); - double mcc = perf->getMCC(); - double f1 = perf->getF1Score(); - cout << t << "\t" << perf->toLongString() << endl; - if (mcc > max_mcc) { - max_mcc = mcc; - best_t_mcc = t; - } - if (f1 > max_f1) { - max_f1 = f1; - best_t_f1 = t; - } - } - cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; - cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - //threshold = best_t_mcc; - } - //threshold = calcGoodThreshold(f, all); - cout << "Threshold set at " << threshold << endl; - categorise(f, all, pass, fail, threshold); + } + if (!genuineFile.empty() && exists(genuineFile)) { + vector thresholds; + for (double i = 0.0; i <= 1.0; i += 0.01) { + thresholds.push_back(i); + } + double max_mcc = 0.0; + double max_f1 = 0.0; + double best_t_mcc = 0.0; + double best_t_f1 = 0.0; + cout << "Threshold\t" << Performance::longHeader() << endl; + for (auto & t : thresholds) { + JunctionList pjl; + JunctionList fjl; + categorise(f, all, pjl, fjl, t); + shared_ptr perf = calcPerformance(pjl, fjl); + double mcc = perf->getMCC(); + double f1 = perf->getF1Score(); + cout << t << "\t" << perf->toLongString() << endl; + if (mcc > max_mcc) { + max_mcc = mcc; + best_t_mcc = t; + } + if (f1 > max_f1) { + max_f1 = f1; + best_t_f1 = t; + } + } + cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; + cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; + //threshold = best_t_mcc; + } + //threshold = calcGoodThreshold(f, all); + cout << "Threshold set at " << threshold << endl; + categorise(f, all, pass, fail, threshold); delete testingData; } void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { - for (size_t i = 0; i < all.size(); i++) { - if ((1.0 - f->getPredictions()[i][0]) >= t) { - pass.push_back(all[i]); - } - else { - fail.push_back(all[i]); - } - } + for (size_t i = 0; i < all.size(); i++) { + if ((1.0 - f->getPredictions()[i][0]) >= t) { + pass.push_back(all[i]); + } else { + fail.push_back(all[i]); + } + } } double portcullis::JunctionFilter::calcGoodThreshold(shared_ptr f) { - uint32_t pos = 0; - uint32_t neg = 0; - for (auto 
& p : f->getPredictions()) { - if ((1.0 - p[0]) >= 0.5) { - pos++; - } - else { - neg++; - } - } - return (double) pos / (double) (pos + neg); + uint32_t pos = 0; + uint32_t neg = 0; + for (auto & p : f->getPredictions()) { + if ((1.0 - p[0]) >= 0.5) { + pos++; + } else { + neg++; + } + } + return (double) pos / (double) (pos + neg); } int portcullis::JunctionFilter::main(int argc, char *argv[]) { - path junctionFile; - path prepDir; - path modelFile; - path genuineFile; - path filterFile; - path referenceFile; - path output; - uint16_t threads; - bool no_ml; - bool saveBad; + path junctionFile; + path prepDir; + path modelFile; + path genuineFile; + path filterFile; + path referenceFile; + path output; + uint16_t threads; + bool no_ml; + bool saveBad; bool save_features; - bool exongff; - bool introngff; - uint32_t max_length; - string canonical; - uint32_t mincov; - string source; - bool no_smote; - bool enn; - double threshold; - bool verbose; - bool help; - struct winsize w; - ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); - // Declare the supported options. - po::options_description system_options("System options", w.ws_col, w.ws_col / 1.5); - system_options.add_options() - ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), - "The number of threads to use during testing (only applies if using forest model).") - ("verbose,v", po::bool_switch(&verbose)->default_value(false), - "Print extra information") - ("help", po::bool_switch(&help)->default_value(false), "Produce help message") - ; - po::options_description output_options("Output options", w.ws_col, w.ws_col / 1.5); - output_options.add_options() - ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), - "Output prefix for files generated by this program.") - ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), - "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") - ("exon_gff", po::bool_switch(&exongff)->default_value(false), - "Output exon-based junctions in GFF format.") - ("intron_gff", po::bool_switch(&introngff)->default_value(false), - "Output intron-based junctions in GFF format.") - ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), - "The value to enter into the \"source\" field in GFF files.") - ; - po::options_description filtering_options("Filtering options", w.ws_col, w.ws_col / 1.5); - filtering_options.add_options() - ("filter_file,f", po::value(&filterFile), - "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") - ("reference,r", po::value(&referenceFile), - "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") - ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), - "Disables machine learning filtering") - ("max_length", po::value(&max_length)->default_value(0), - "Filter junctions longer than this value. Default (0) is to not filter based on length.") - ("canonical", po::value(&canonical)->default_value("OFF"), - "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. 
Where C = Canonical junctions (GT-AG), S = Semi-canonical junctions (AT-AC, or GC-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") - ("min_cov", po::value(&mincov)->default_value(1), - "Only keep junctions with a number of split reads greater than or equal to this number") - ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), - "The threshold score at which we determine a junction to be genuine or not. Increase value towards 1.0 to increase precision, decrease towards 0.0 to increase sensitivity. We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.") - ; - // Hidden options, will be allowed both on command line and - // in config file, but will not be shown to the user. - po::options_description hidden_options("Hidden options"); - hidden_options.add_options() - ("prep_data_dir", po::value(&prepDir), "Path to directory containing prepared data.") - ("junction_file", po::value(&junctionFile), "Path to the junction tab file to process.") - ("no_smote", po::bool_switch(&no_smote)->default_value(false), - "Use this flag to disable synthetic oversampling") - ("enn", po::bool_switch(&enn)->default_value(false), - "Use this flag to enable Edited Nearest Neighbour to clean decision region") - ("genuine,g", po::value(&genuineFile), - "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions. This option is only useful if you have access to simulated data.") - ("model_file,m", po::value(&modelFile), - "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. 
See manual for more details.") - ("save_features", po::bool_switch(&save_features)->default_value(false), - "Use this flag to save features (both for training set and again for all junctions) to disk.") - ; - // Positional option for the input bam file - po::positional_options_description p; - p.add("prep_data_dir", 1); - p.add("junction_file", 1); - // Options to display to the user - po::options_description display_options; - display_options.add(system_options).add(output_options).add(filtering_options); - // Combine non-positional options - po::options_description cmdline_options; - cmdline_options.add(display_options).add(hidden_options); - // Parse command line - po::variables_map vm; - po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm); - po::notify(vm); - // Output help information the exit if requested - if (help || argc <= 1) { - cout << title() << endl << endl - << description() << endl << endl - << "Usage: " << usage() << endl - << display_options << endl; - return 1; - } - auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); - cout << "Running portcullis in junction filter mode" << endl - << "------------------------------------------" << endl << endl; - // Create the prepare class - JunctionFilter filter(prepDir, junctionFile, output); - filter.setSaveBad(saveBad); - filter.setSource(source); - filter.setVerbose(verbose); - filter.setThreads(threads); - filter.setMaxLength(max_length); - filter.setCanonical(canonical); - filter.setMinCov(mincov); - filter.setOutputExonGFF(exongff); - filter.setOutputIntronGFF(introngff); - // Only set the filter rules if specified. - filter.setFilterFile(filterFile); - filter.setGenuineFile(genuineFile); - if (modelFile.empty() && !no_ml) { - filter.setTrain(true); - } - else { - filter.setTrain(false); - if (!no_ml) { - filter.setModelFile(modelFile); - } - } + bool exongff; + bool introngff; + uint32_t max_length; + string canonical; + bool balanced; + uint32_t mincov; + string source; + bool no_smote; + bool enn; + double threshold; + bool verbose; + bool help; + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + // Declare the supported options. + po::options_description system_options("System options", w.ws_col, w.ws_col / 1.5); + system_options.add_options() + ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), + "The number of threads to use during testing (only applies if using forest model).") + ("verbose,v", po::bool_switch(&verbose)->default_value(false), + "Print extra information") + ("help", po::bool_switch(&help)->default_value(false), "Produce help message") + ; + po::options_description output_options("Output options", w.ws_col, w.ws_col / 1.5); + output_options.add_options() + ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), + "Output prefix for files generated by this program.") + ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), + "Saves bad junctions (i.e. 
junctions that fail the filter), as well as good junctions (those that pass)") + ("exon_gff", po::bool_switch(&exongff)->default_value(false), + "Output exon-based junctions in GFF format.") + ("intron_gff", po::bool_switch(&introngff)->default_value(false), + "Output intron-based junctions in GFF format.") + ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), + "The value to enter into the \"source\" field in GFF files.") + ; + po::options_description filtering_options("Filtering options", w.ws_col, w.ws_col / 1.5); + filtering_options.add_options() + ("filter_file,f", po::value(&filterFile), + "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") + ("reference,r", po::value(&referenceFile), + "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), + "Disables machine learning filtering") + ("max_length", po::value(&max_length)->default_value(0), + "Filter junctions longer than this value. Default (0) is to not filter based on length.") + ("canonical", po::value(&canonical)->default_value("OFF"), + "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GT-AG), S = Semi-canonical junctions (AT-AC, or GC-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") + ("min_cov", po::value(&mincov)->default_value(1), + "Only keep junctions with a number of split reads greater than or equal to this number") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not. Increase value towards 1.0 to increase precision, decrease towards 0.0 to increase sensitivity. We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.") + ("balanced", po::bool_switch(&balanced)->default_value(false), + "Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.") + ; + // Hidden options, will be allowed both on command line and + // in config file, but will not be shown to the user. 
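The new --balanced switch added above is simply inverted into the precise flag that the JunctionFilter constructor now takes (JunctionFilter(prepDir, junctionFile, output, !balanced) further down in this hunk). A minimal sketch of that wiring with boost::program_options; the option names and the 0.5 default follow the patch, while the surrounding main() is illustrative only:

    #include <boost/program_options.hpp>
    #include <iostream>

    namespace po = boost::program_options;

    int main(int argc, char* argv[]) {
        bool balanced = false;
        double threshold = 0.5;   // DEFAULT_FILTER_THRESHOLD in junction_filter.hpp

        po::options_description filtering("Filtering options");
        filtering.add_options()
            ("balanced", po::bool_switch(&balanced)->default_value(false),
                "Use rules giving a balanced positive/negative training set")
            ("threshold", po::value<double>(&threshold)->default_value(0.5),
                "Score above which a junction is considered genuine");

        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, filtering), vm);
        po::notify(vm);

        // The filter defaults to "precise" mode; --balanced switches it off,
        // mirroring JunctionFilter(prepDir, junctionFile, output, !balanced).
        bool precise = !balanced;
        std::cout << (precise ? "precise" : "balanced") << " rule set, threshold "
                  << threshold << std::endl;
        return 0;
    }

Storing the flag as precise keeps the default behaviour (no switch given) biased towards precision, which matches the help text above about high-coverage datasets.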
+ po::options_description hidden_options("Hidden options"); + hidden_options.add_options() + ("prep_data_dir", po::value(&prepDir), "Path to directory containing prepared data.") + ("junction_file", po::value(&junctionFile), "Path to the junction tab file to process.") + ("no_smote", po::bool_switch(&no_smote)->default_value(false), + "Use this flag to disable synthetic oversampling") + ("enn", po::bool_switch(&enn)->default_value(false), + "Use this flag to enable Edited Nearest Neighbour to clean decision region") + ("genuine,g", po::value(&genuineFile), + "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions. This option is only useful if you have access to simulated data.") + ("model_file,m", po::value(&modelFile), + "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. See manual for more details.") + ("save_features", po::bool_switch(&save_features)->default_value(false), + "Use this flag to save features (both for training set and again for all junctions) to disk.") + ; + // Positional option for the input bam file + po::positional_options_description p; + p.add("prep_data_dir", 1); + p.add("junction_file", 1); + // Options to display to the user + po::options_description display_options; + display_options.add(system_options).add(output_options).add(filtering_options); + // Combine non-positional options + po::options_description cmdline_options; + cmdline_options.add(display_options).add(hidden_options); + // Parse command line + po::variables_map vm; + po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm); + po::notify(vm); + // Output help information the exit if requested + if (help || argc <= 1) { + cout << title() << endl << endl + << description() << endl << endl + << "Usage: " << usage() << endl + << display_options << endl; + return 1; + } + auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); + cout << "Running portcullis in junction filter mode" << endl + << "------------------------------------------" << endl << endl; + // Create the prepare class + JunctionFilter filter(prepDir, junctionFile, output, !balanced); + filter.setSaveBad(saveBad); + filter.setSource(source); + filter.setVerbose(verbose); + filter.setThreads(threads); + filter.setMaxLength(max_length); + filter.setCanonical(canonical); + filter.setMinCov(mincov); + filter.setOutputExonGFF(exongff); + filter.setOutputIntronGFF(introngff); + // Only set the filter rules if specified. 
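When a --genuine file is supplied, the forestPredict changes above sweep thresholds from 0 to 1 in 0.01 steps and report the best F1 and MCC. A self-contained sketch of that evaluation, assuming one 0/1 label per line in the genuine file; the file name and score values are placeholders:

    #include <cmath>
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // Placeholder scores (1.0 - forest prediction) and a placeholder file name.
        std::vector<double> scores{0.92, 0.15, 0.74, 0.05, 0.61};
        std::vector<int> genuine;
        std::ifstream in("genuine.txt");
        for (std::string line; std::getline(in, line);) {
            if (!line.empty()) genuine.push_back(line[0] == '1');
        }

        double max_f1 = 0.0, max_mcc = 0.0, best_t_f1 = 0.0, best_t_mcc = 0.0;
        for (double t = 0.0; t <= 1.0; t += 0.01) {
            double tp = 0, fp = 0, tn = 0, fn = 0;
            for (size_t i = 0; i < scores.size() && i < genuine.size(); i++) {
                bool pass = scores[i] >= t;          // same test as categorise()
                if (pass && genuine[i]) tp++;
                else if (pass) fp++;
                else if (genuine[i]) fn++;
                else tn++;
            }
            double f1 = (2 * tp + fp + fn) > 0 ? (2 * tp) / (2 * tp + fp + fn) : 0.0;
            double d = std::sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
            double mcc = d > 0 ? (tp * tn - fp * fn) / d : 0.0;
            if (f1 > max_f1) { max_f1 = f1; best_t_f1 = t; }
            if (mcc > max_mcc) { max_mcc = mcc; best_t_mcc = t; }
        }
        std::cout << "Best F1 " << max_f1 << " at threshold " << best_t_f1 << std::endl;
        std::cout << "Best MCC " << max_mcc << " at threshold " << best_t_mcc << std::endl;
        return 0;
    }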
+ filter.setFilterFile(filterFile); + filter.setGenuineFile(genuineFile); + if (modelFile.empty() && !no_ml) { + filter.setTrain(true); + } else { + filter.setTrain(false); + if (!no_ml) { + filter.setModelFile(modelFile); + } + } filter.setSaveFeatures(save_features); - filter.setReferenceFile(referenceFile); - filter.setThreshold(threshold); - filter.setSmote(!no_smote); - filter.setENN(enn); - filter.filter(); - return 0; + filter.setReferenceFile(referenceFile); + filter.setThreshold(threshold); + filter.setSmote(!no_smote); + filter.setENN(enn); + filter.filter(); + return 0; } path portcullis::JunctionFilter::dataDir = "."; diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 5c7d1177..c2b63907 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -67,314 +67,320 @@ using portcullis::PreparedFiles; namespace portcullis { -typedef boost::error_info JuncFilterErrorInfo; - -struct JuncFilterException : virtual boost::exception, virtual std::exception { -}; - - -const string DEFAULT_FILTER_OUTPUT = "portcullis_filter/portcullis"; -const string DEFAULT_FILTER_SOURCE = "portcullis"; -const string DEFAULT_FILTER_RULE_FILE = "default_filter.json"; -const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; -const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; -const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; -const uint16_t DEFAULT_FILTER_THREADS = 1; -const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.5; - -class JunctionFilter { -private: + typedef boost::error_info JuncFilterErrorInfo; + + struct JuncFilterException : virtual boost::exception, virtual std::exception { + }; + + + const string DEFAULT_FILTER_OUTPUT = "portcullis_filter/portcullis"; + const string DEFAULT_FILTER_SOURCE = "portcullis"; + const string DEFAULT_FILTER_RULE_FILE = "default_filter.json"; + const string DEFAULT_FILTER_MODEL_FILE = "default_model.forest"; + const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; + const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; + const uint16_t DEFAULT_FILTER_THREADS = 1; + const uint16_t DEFAULT_SELFTRAIN_TREES = 100; + const double DEFAULT_FILTER_THRESHOLD = 0.5; + + class JunctionFilter { + private: + + path junctionFile; + PreparedFiles prepData; + path modelFile; + path filterFile; + path genuineFile; + path referenceFile; + path output; + bool train; + uint16_t threads; + bool saveBad; + bool saveFeatures; + bool outputExonGFF; + bool outputIntronGFF; + uint32_t maxLength; + bool filterCanonical; + uint32_t minCov; + bool filterSemi; + bool filterNovel; + string source; + double threshold; + bool smote; + bool enn; + bool precise; + bool verbose; + + + public: + + static path dataDir; + + JunctionFilter(const path& _prepDir, + const path& _junctionFile, + const path& _output, + bool precise); + + virtual ~JunctionFilter() { + } + + + + public: + + path getFilterFile() const { + return filterFile; + } + + void setFilterFile(path filterFile) { + this->filterFile = filterFile; + } + + path getJunctionFile() const { + return junctionFile; + } + + void setJunctionFile(path junctionFile) { + this->junctionFile = junctionFile; + } + + double getThreshold() const { + return threshold; + } + + void setThreshold(double threshold) { + this->threshold = threshold; + } + + path getOutput() const { + return output; + } + + void setOutput(path output) { + this->output = output; + } + + path getGenuineFile() const { + return genuineFile; + } + + void setGenuineFile(path genuineFile) { + 
this->genuineFile = genuineFile; + } + + path getModelFile() const { + return modelFile; + } + + void setModelFile(path modelFile) { + this->modelFile = modelFile; + } + + path getReferenceFile() const { + return referenceFile; + } + + void setReferenceFile(path referenceFile) { + this->referenceFile = referenceFile; + } + + bool isTrain() const { + return train; + } + + void setTrain(bool train) { + this->train = train; + } + + bool isSaveBad() const { + return saveBad; + } + + void setSaveBad(bool saveBad) { + this->saveBad = saveBad; + } + + bool isOutputExonGFF() const { + return outputExonGFF; + } + + void setOutputExonGFF(bool outputExonGFF) { + this->outputExonGFF = outputExonGFF; + } + + bool isOutputIntronGFF() const { + return outputIntronGFF; + } + + void setOutputIntronGFF(bool outputIntronGFF) { + this->outputIntronGFF = outputIntronGFF; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + string getSource() const { + return source; + } + + void setSource(string source) { + this->source = source; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isENN() const { + return enn; + } + + void setENN(bool enn) { + this->enn = enn; + } + + bool isSmote() const { + return smote; + } + + void setSmote(bool smote) { + this->smote = smote; + } + + bool doSaveFeatures() const { + return this->saveFeatures; + } - path junctionFile; - PreparedFiles prepData; - path modelFile; - path filterFile; - path genuineFile; - path referenceFile; - path output; - bool train; - uint16_t threads; - bool saveBad; - bool saveFeatures; - bool outputExonGFF; - bool outputIntronGFF; - uint32_t maxLength; - bool filterCanonical; - uint32_t minCov; - bool filterSemi; - bool filterNovel; - string source; - double threshold; - bool smote; - bool enn; - bool verbose; + void setSaveFeatures(bool saveFeatures) { + this->saveFeatures = saveFeatures; + } + void setCanonical(const string& canonical) { + vector modes; + boost::split(modes, canonical, boost::is_any_of(","), boost::token_compress_on); + if (modes.size() > 3) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Canonical filter mode contains too many modes. 
Max is 2."))); + } + if (modes.empty()) { + this->filterCanonical = false; + this->filterSemi = false; + this->filterNovel = false; + } else { + this->filterCanonical = true; + this->filterSemi = true; + this->filterNovel = true; + for (auto & m : modes) { + string n = boost::to_upper_copy(m); + if (n == "OFF") { + this->filterCanonical = false; + this->filterSemi = false; + this->filterNovel = false; + } else if (n == "C") { + this->filterCanonical = false; + } else if (n == "S") { + this->filterSemi = false; + } else if (n == "N") { + this->filterNovel = false; + } + } + } + } + + bool doCanonicalFiltering() const { + return this->filterCanonical || this->filterSemi || this->filterNovel; + } + + uint32_t isMaxLength() const { + return maxLength; + } + + void setMaxLength(uint32_t maxLength) { + this->maxLength = maxLength; + } + + uint32_t getMinCov() const { + return minCov; + } + + void setMinCov(uint32_t minCov) { + this->minCov = minCov; + } + + bool isPrecise() const { + return precise; + } + + void setPrecise(bool precise) { + this->precise = precise; + } + + path getIntitalPosRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE + ".layer" + std::to_string(index) + ".json"); + } + + path getIntitalNegRulesFile(uint16_t index) const { + return path(dataDir.string() + "/" + ST_INEG_RULES_FILE + ".layer" + std::to_string(index) + ".json"); + } + + void filter(); + + + protected: + + void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf); + + shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { + return calcPerformance(pass, fail, false); + } + shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert); + + void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); + + void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail); + + void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); + + void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); + + void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs); + + double calcGoodThreshold(shared_ptr f); + + void undersample(JunctionList& jl, size_t size); + + public: + + static string title() { + return string("Portcullis Filter Mode Help"); + } + + static string description() { + return string("Filters out junctions that are unlikely to be genuine or that have too little\n") + + "supporting evidence. The user can control three stages of the filtering\n" + + "process. First the user can perform filtering based on a random forest model\n" + + "self-trained on the provided data, alternatively the user can provide a pre-\n" + + "trained model. Second the user can specify a configuration file describing a\n" + + "set of filtering rules to apply. 
Third, the user can directly through the\n" + + "command line filter based on junction (intron) length, or the canonical label.\n\n" + + "This stage requires the prep directory and the tab file generated from the\n" + + "stage as input."; + } -public: - - static path dataDir; - - JunctionFilter(const path& _prepDir, - const path& _junctionFile, - const path& _output); - - virtual ~JunctionFilter() { - } - - - -public: + static string usage() { + return string("portcullis filter [options] "); + } - path getFilterFile() const { - return filterFile; - } - - void setFilterFile(path filterFile) { - this->filterFile = filterFile; - } - - path getJunctionFile() const { - return junctionFile; - } - - void setJunctionFile(path junctionFile) { - this->junctionFile = junctionFile; - } - - double getThreshold() const { - return threshold; - } - - void setThreshold(double threshold) { - this->threshold = threshold; - } - - path getOutput() const { - return output; - } - - void setOutput(path output) { - this->output = output; - } - - path getGenuineFile() const { - return genuineFile; - } - - void setGenuineFile(path genuineFile) { - this->genuineFile = genuineFile; - } - - path getModelFile() const { - return modelFile; - } - - void setModelFile(path modelFile) { - this->modelFile = modelFile; - } - - path getReferenceFile() const { - return referenceFile; - } - - void setReferenceFile(path referenceFile) { - this->referenceFile = referenceFile; - } - - bool isTrain() const { - return train; - } - - void setTrain(bool train) { - this->train = train; - } - - bool isSaveBad() const { - return saveBad; - } - - void setSaveBad(bool saveBad) { - this->saveBad = saveBad; - } - - bool isOutputExonGFF() const { - return outputExonGFF; - } - - void setOutputExonGFF(bool outputExonGFF) { - this->outputExonGFF = outputExonGFF; - } - - bool isOutputIntronGFF() const { - return outputIntronGFF; - } - - void setOutputIntronGFF(bool outputIntronGFF) { - this->outputIntronGFF = outputIntronGFF; - } - - bool isVerbose() const { - return verbose; - } - - void setVerbose(bool verbose) { - this->verbose = verbose; - } - - string getSource() const { - return source; - } - - void setSource(string source) { - this->source = source; - } - - uint16_t getThreads() const { - return threads; - } - - void setThreads(uint16_t threads) { - this->threads = threads; - } - - bool isENN() const { - return enn; - } - - void setENN(bool enn) { - this->enn = enn; - } - - bool isSmote() const { - return smote; - } - - void setSmote(bool smote) { - this->smote = smote; - } - - bool doSaveFeatures() const { - return this->saveFeatures; - } - - void setSaveFeatures(bool saveFeatures) { - this->saveFeatures = saveFeatures; - } - - void setCanonical(const string& canonical) { - vector modes; - boost::split(modes, canonical, boost::is_any_of(","), boost::token_compress_on); - if (modes.size() > 3) { - BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Canonical filter mode contains too many modes. 
Max is 2."))); - } - if (modes.empty()) { - this->filterCanonical = false; - this->filterSemi = false; - this->filterNovel = false; - } - else { - this->filterCanonical = true; - this->filterSemi = true; - this->filterNovel = true; - for (auto & m : modes) { - string n = boost::to_upper_copy(m); - if (n == "OFF") { - this->filterCanonical = false; - this->filterSemi = false; - this->filterNovel = false; - } - else if (n == "C") { - this->filterCanonical = false; - } - else if (n == "S") { - this->filterSemi = false; - } - else if (n == "N") { - this->filterNovel = false; - } - } - } - } - - bool doCanonicalFiltering() const { - return this->filterCanonical || this->filterSemi || this->filterNovel; - } - - uint32_t isMaxLength() const { - return maxLength; - } - - void setMaxLength(uint32_t maxLength) { - this->maxLength = maxLength; - } - - uint32_t getMinCov() const { - return minCov; - } - - void setMinCov(uint32_t minCov) { - this->minCov = minCov; - } - - path getIntitalPosRulesFile(uint16_t index) const { - return path(dataDir.string() + "/" + ST_IPOS_RULES_FILE + ".layer" + std::to_string(index) + ".json"); - } - - path getIntitalNegRulesFile(uint16_t index) const { - return path(dataDir.string() + "/" + ST_INEG_RULES_FILE + ".layer" + std::to_string(index) + ".json"); - } - - void filter(); - - -protected: - - void forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf); - - shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail) { - return calcPerformance(pass, fail, false); - } - shared_ptr calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert); - - void printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix); - - void doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail); - - void categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t); - - void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); - - void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs); - - double calcGoodThreshold(shared_ptr f); - - void undersample(JunctionList& jl, size_t size); - -public: - - static string title() { - return string("Portcullis Filter Mode Help"); - } - - static string description() { - return string("Filters out junctions that are unlikely to be genuine or that have too little\n") + - "supporting evidence. The user can control three stages of the filtering\n" + - "process. First the user can perform filtering based on a random forest model\n" + - "self-trained on the provided data, alternatively the user can provide a pre-\n" + - "trained model. Second the user can specify a configuration file describing a\n" + - "set of filtering rules to apply. 
Third, the user can directly through the\n" + - "command line filter based on junction (intron) length, or the canonical label.\n\n" + - "This stage requires the prep directory and the tab file generated from the\n" + - "stage as input."; - } - - static string usage() { - return string("portcullis filter [options] "); - } - - static int main(int argc, char *argv[]); -}; + static int main(int argc, char *argv[]); + }; } diff --git a/src/portcullis.cc b/src/portcullis.cc index 3cda9bfc..f634c84f 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -54,7 +54,9 @@ using portcullis::JunctionFilter; using portcullis::BamFilter; typedef boost::error_info PortcullisErrorInfo; -struct PortcullisException: virtual boost::exception, virtual std::exception { }; + +struct PortcullisException : virtual boost::exception, virtual std::exception { +}; // Default values for arguments const uint16_t DEFAULT_THREADS = 4; @@ -65,17 +67,17 @@ const uint32_t DEFAULT_GAP_SIZE = 100; portcullis::PortcullisFS portcullis::pfs; enum class Mode { - PREP, - JUNC, - FILTER, - BAM_FILT, + PREP, + JUNC, + FILTER, + BAM_FILT, FULL }; void print_backtrace(int depth = 0) { try { throw; - } + } // this block shows how to handle exceptions of some known type // You can have your own types instead of std::exception catch (const std::exception & e) { @@ -85,7 +87,7 @@ void print_backtrace(int depth = 0) { } catch (...) { print_backtrace(++depth); } - } + } // Not all nesting exceptions will be of a known type, but if they use the // mixin type std::nested_exception, then we can at least handle them enough to // get the nested exception: @@ -97,7 +99,7 @@ void print_backtrace(int depth = 0) { } catch (...) { print_backtrace(++depth); } - } + } // Exception nesting works through inheritance, which means that if you // can't inherit from the type, then you can't 'mixin' std::nesting exception. 
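print_backtrace above walks a chain of nested exceptions by rethrowing at each level and recursing on whatever std::nested_exception carries. A standalone sketch of the same idea using std::throw_with_nested and std::rethrow_if_nested; the function and message names here are illustrative, not the ones portcullis uses:

    #include <exception>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Print an exception and, recursively, anything nested inside it.
    void print_chain(const std::exception& e, int depth = 0) {
        std::cerr << std::string(depth * 2, ' ') << e.what() << std::endl;
        try {
            std::rethrow_if_nested(e);          // no-op if nothing is nested
        } catch (const std::exception& nested) {
            print_chain(nested, depth + 1);
        } catch (...) {
            std::cerr << std::string((depth + 1) * 2, ' ')
                      << "unknown nested exception" << std::endl;
        }
    }

    void low_level() {
        throw std::runtime_error("could not open BAM index");
    }

    void high_level() {
        try {
            low_level();
        } catch (...) {
            // Wrap the original error so callers see both messages.
            std::throw_with_nested(std::runtime_error("junction analysis failed"));
        }
    }

    int main() {
        try {
            high_level();
        } catch (const std::exception& e) {
            print_chain(e);
        }
        return 0;
    }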
// If you try something like std::throw_with_nested( int{10} ); Then you'll @@ -107,288 +109,282 @@ void print_backtrace(int depth = 0) { } } - - Mode parseMode(string mode) { - string upperMode = boost::to_upper_copy(mode); - if (upperMode == string("PREP") || upperMode == string("PREPARE")) { - return Mode::PREP; - } - else if (upperMode == string("JUNC") || upperMode == string("ANALYSE") || upperMode == string("ANALYZE")) { - return Mode::JUNC; - } - else if (upperMode == string("FILTER") || upperMode == string("FILT")) { - return Mode::FILTER; - } - else if (upperMode == string("BAMFILT")) { - return Mode::BAM_FILT; - } - else if (upperMode == string("FULL")) { - return Mode::FULL; - } - else { - BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( - "Could not recognise mode string: ") + mode)); - } + string upperMode = boost::to_upper_copy(mode); + if (upperMode == string("PREP") || upperMode == string("PREPARE")) { + return Mode::PREP; + } else if (upperMode == string("JUNC") || upperMode == string("ANALYSE") || upperMode == string("ANALYZE")) { + return Mode::JUNC; + } else if (upperMode == string("FILTER") || upperMode == string("FILT")) { + return Mode::FILTER; + } else if (upperMode == string("BAMFILT")) { + return Mode::BAM_FILT; + } else if (upperMode == string("FULL")) { + return Mode::FULL; + } else { + BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( + "Could not recognise mode string: ") + mode)); + } } string title() { - return string("Portcullis Help"); + return string("Portcullis Help"); } string description() { - return string("Portcullis is a tool to identify genuine splice junctions using aligned RNAseq reads"); + return string("Portcullis is a tool to identify genuine splice junctions using aligned RNAseq reads"); } string usage() { - return string("portcullis [options] "); + return string("portcullis [options] "); } string modes() { - return string( - " - full - Full pipeline. Runs prep, junc, filt (and optionally bamfilt) in sequence\n") + - " - prep - Step 1: Prepares a genome and bam file(s) ready for junction analysis\n" + - " - junc - Step 2: Perform junction analysis on prepared data\n" + - " - filt - Step 3: Discard unlikely junctions\n" + - " - bamfilt - Step 4: Filters a BAM to remove any reads associated with invalid\n" + - " junctions"; + return string( + " - full - Full pipeline. Runs prep, junc, filt (and optionally bamfilt) in sequence\n") + + " - prep - Step 1: Prepares a genome and bam file(s) ready for junction analysis\n" + + " - junc - Step 2: Perform junction analysis on prepared data\n" + + " - filt - Step 3: Discard unlikely junctions\n" + + " - bamfilt - Step 4: Filters a BAM to remove any reads associated with invalid\n" + + " junctions"; } string fulltitle() { - return string("Portcullis Full Pipeline Mode Help"); + return string("Portcullis Full Pipeline Mode Help"); } string fulldescription() { - return string("Runs prep, junc, filter, and optionally bamfilt, as a complete pipeline. Assumes\n") + - "that the self-trained machine learning approach for filtering is to be used."; + return string("Runs prep, junc, filter, and optionally bamfilt, as a complete pipeline. 
Assumes\n") + + "that the self-trained machine learning approach for filtering is to be used."; } string fullusage() { - return string("portcullis full [options] ()+"); + return string("portcullis full [options] ()+"); } - int mainFull(int argc, char *argv[]) { - // Portcullis args - std::vector bamFiles; - path genomeFile; - path outputDir; - path referenceFile; - string strandSpecific; - string orientation; - uint16_t threads; - bool separate; + // Portcullis args + std::vector bamFiles; + path genomeFile; + path outputDir; + path referenceFile; + string strandSpecific; + string orientation; + uint16_t threads; + bool separate; bool extra; bool copy; - bool keep_temp; + bool keep_temp; bool force; - bool useCsi; - bool saveBad; - bool exongff; - bool introngff; - bool bamFilter; + bool useCsi; + bool saveBad; + bool exongff; + bool introngff; + bool bamFilter; string source; - uint32_t max_length; - uint32_t mincov; - string canonical; - bool verbose; - bool help; - struct winsize w; - ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); - // Declare the supported options. - po::options_description system_options("System options", w.ws_col, (unsigned)((double)w.ws_col / 1.5)); - system_options.add_options() - ("threads,t", po::value(&threads)->default_value(1), - "The number of threads to use. Note that increasing the number of threads will also increase memory requirements. Default: 1") - ("verbose,v", po::bool_switch(&verbose)->default_value(false), - "Print extra information") - ("help", po::bool_switch(&help)->default_value(false), "Produce help message") - ; - po::options_description output_options("Output options", w.ws_col, (unsigned)((double)w.ws_col / 1.5)); - output_options.add_options() - ("output,o", po::value(&outputDir)->default_value("portcullis_out"), - "Output directory. Default: portcullis_out") - ("bam_filter,b", po::bool_switch(&bamFilter)->default_value(false), - "Filter out alignments corresponding with false junctions. Warning: this is time consuming; make sure you really want to do this first!") - ("exon_gff", po::bool_switch(&exongff)->default_value(false), - "Output exon-based junctions in GFF format.") - ("intron_gff", po::bool_switch(&introngff)->default_value(false), - "Output intron-based junctions in GFF format.") - ("source", po::value(&source)->default_value("portcullis"), - "The value to enter into the \"source\" field in GFF files.") - ; - po::options_description prepare_options("Input options", w.ws_col, (unsigned)((double)w.ws_col / 1.5)); - prepare_options.add_options() - ("force", po::bool_switch(&force)->default_value(false), - "Whether or not to clean the output directory before processing, thereby forcing full preparation of the genome and bam files. By default portcullis will only do what it thinks it needs to.") - ("copy", po::bool_switch(©)->default_value(false), - "Whether to copy files from input data to prepared data where possible, otherwise will use symlinks. Will require more time and disk space to prepare input but is potentially more robust.") - ("keep_temp", po::bool_switch(&keep_temp)->default_value(false), - "Whether keep any temporary files created during the prepare stage of portcullis. This might include BAM files and indexes.") - ("use_csi", po::bool_switch(&useCsi)->default_value(false), - "Whether to use CSI indexing rather than BAI indexing. CSI has the advantage that it supports very long target sequences (probably not an issue unless you are working on huge genomes). 
BAI has the advantage that it is more widely supported (useful for viewing in genome browsers).") - ; - - po::options_description analysis_options("Analysis options", w.ws_col, (unsigned)((double)w.ws_col / 1.5)); - analysis_options.add_options() - ("orientation", po::value(&orientation)->default_value(orientationToString(Orientation::UNKNOWN)), - "The orientation of the reads that produced the BAM alignments: \"F\" (Single-end forward orientation); \"R\" (single-end reverse orientation); \"FR\" (paired-end, with reads sequenced towards center of fragment -> <-. This is usual setting for most Illumina paired end sequencing); \"RF\" (paired-end, reads sequenced away from center of fragment <- ->); \"FF\" (paired-end, reads both sequenced in forward orientation); \"RR\" (paired-end, reads both sequenced in reverse orientation); \"UNKNOWN\" (default, portcullis will workaround any calculations requiring orientation information)") - ("strandedness", po::value(&strandSpecific)->default_value(strandednessToString(Strandedness::UNKNOWN)), - "Whether BAM alignments were generated using a type of strand specific RNAseq library: \"unstranded\" (Standard Illumina); \"firststrand\" (dUTP, NSR, NNSR); \"secondstrand\" (Ligation, Standard SOLiD, flux sim reads); \"UNKNOWN\" (default, portcullis will workaround any calculations requiring strandedness information)") - ("separate", po::bool_switch(&separate)->default_value(false), - "Separate spliced from unspliced reads.") - ("extra", po::bool_switch(&extra)->default_value(false), - "Calculate additional metrics that take some time to generate. Automatically activates BAM splitting mode (--separate).") - ; - - po::options_description filter_options("Filtering options", w.ws_col, (unsigned)((double)w.ws_col / 1.5)); - filter_options.add_options() - ("reference,r", po::value(&referenceFile), - "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") - ("max_length", po::value(&max_length)->default_value(0), - "Filter junctions longer than this value. Default (0) is to not filter based on length.") - ("canonical", po::value(&canonical)->default_value("OFF"), - "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GT-AG), S = Semi-canonical junctions (AT-AC, or GC-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") - ("min_cov", po::value(&mincov)->default_value(1), - "Only keep junctions with a number of split reads greater than or equal to this number") - ("save_bad", po::bool_switch(&saveBad)->default_value(false), - "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") - ; - - // Hidden options, will be allowed both on command line and - // in config file, but will not be shown to the user. 
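The genome and BAM arguments are declared in a hidden options_description and then bound to positional slots, so they are accepted on the command line without being listed in the help output. A stripped-down sketch of that pattern; the option names follow the patch, everything else is illustrative:

    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    namespace po = boost::program_options;

    int main(int argc, char* argv[]) {
        std::string genomeFile;
        std::vector<std::string> bamFiles;
        bool help = false;

        po::options_description display("Options");            // shown in --help
        display.add_options()
            ("help", po::bool_switch(&help)->default_value(false), "Produce help message");

        po::options_description hidden("Hidden options");       // accepted but never shown
        hidden.add_options()
            ("genome-file", po::value<std::string>(&genomeFile), "Genome to process.")
            ("bam-files", po::value<std::vector<std::string>>(&bamFiles), "BAM files to process.");

        po::positional_options_description p;
        p.add("genome-file", 1);    // first bare argument
        p.add("bam-files", -1);     // every remaining bare argument

        po::options_description cmdline;
        cmdline.add(display).add(hidden);

        po::variables_map vm;
        po::store(po::command_line_parser(argc, argv)
                      .options(cmdline).positional(p).run(), vm);
        po::notify(vm);

        if (help || argc <= 1) {
            std::cout << "Usage: portcullis full [options] <genome-file> (<bam-file>)+\n"
                      << display << std::endl;   // hidden options stay out of the help text
            return 1;
        }
        std::cout << genomeFile << " with " << bamFiles.size() << " BAM(s)" << std::endl;
        return 0;
    }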
- po::options_description hidden_options("Hidden options"); - hidden_options.add_options() - ("bam-files", po::value< std::vector >(&bamFiles), "Path to the BAM files to process.") - ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") - ; - // Positional option for the input bam file - po::positional_options_description p; - p.add("genome-file", 1); - p.add("bam-files", -1); - // Combine non-positional options for displaying to the user - po::options_description display_options; - display_options.add(system_options).add(output_options).add(prepare_options).add(analysis_options).add(filter_options); - // Combine non-positional options for use at the command line - po::options_description cmdline_options; - cmdline_options.add(display_options).add(hidden_options); - // Parse command line - po::variables_map vm; - po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm); - po::notify(vm); - // Output help information the exit if requested - if (help || argc <= 1) { - cout << fulltitle() << endl << endl - << fulldescription() << endl << endl - << "Usage: " << fullusage() << endl - << display_options << endl << endl; - return 1; - } - // Acquire path to bam file - if (vm.count("bam-files")) { - bamFiles = vm["bam-files"].as >(); - } - // Acquire path to genome file - if (vm.count("genome-file")) { - genomeFile = vm["genome-file"].as(); - } - // Test if provided genome exists - if (!exists(genomeFile) && !symbolic_link_exists(genomeFile)) { - BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( - "Could not find genome file at: ") + genomeFile.string())); - } - // Glob the input bam files - std::vector transformedBams = Prepare::globFiles(bamFiles); - auto_cpu_timer timer(1, "\nPortcullis completed.\nTotal runtime: %ws\n\n"); - cout << "Running full portcullis pipeline" << endl - << "--------------------------------" << endl << endl; - if (!exists(outputDir)) { - if (!create_directories(outputDir)) { - BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( - "Could not create output directory: ") + outputDir.string())); - } - } - - // ************ Prepare input data (BAMs + genome) *********** - cout << "Preparing input data (BAMs + genome)" << endl - << "----------------------------------" << endl << endl; - path prepDir = path(outputDir.string() + "/1-prep"); - // Create the prepare class - Prepare prep(prepDir); - prep.setForce(force); - prep.setUseLinks(!copy); - prep.setUseCsi(useCsi); - prep.setThreads(threads); - prep.setVerbose(verbose); - // Prep the input to produce a usable indexed and sorted bam plus, indexed - // genome and queryable coverage information - prep.prepare(transformedBams, genomeFile); - - // ************ Identify all junctions and calculate metrics *********** - cout << "Identifying junctions and calculating metrics" << endl - << "---------------------------------------------" << endl << endl; - path juncDir = outputDir.string() + "/2-junc"; - path juncOut = juncDir.string() + "/portcullis_all"; - // Identify junctions and calculate metrics - JunctionBuilder jb(prepDir, juncOut); - jb.setThreads(threads); - jb.setExtra(false); // Run in fast mode - jb.setSeparate(false); // Run in fast mode - jb.setStrandSpecific(strandednessFromString(strandSpecific)); - jb.setOrientation(orientationFromString(orientation)); + uint32_t max_length; + uint32_t mincov; + string canonical; + bool balanced; + bool verbose; + bool help; + struct winsize w; + ioctl(STDOUT_FILENO, 
TIOCGWINSZ, &w); + // Declare the supported options. + po::options_description system_options("System options", w.ws_col, (unsigned) ((double) w.ws_col / 1.5)); + system_options.add_options() + ("threads,t", po::value(&threads)->default_value(1), + "The number of threads to use. Note that increasing the number of threads will also increase memory requirements. Default: 1") + ("verbose,v", po::bool_switch(&verbose)->default_value(false), + "Print extra information") + ("help", po::bool_switch(&help)->default_value(false), "Produce help message") + ; + po::options_description output_options("Output options", w.ws_col, (unsigned) ((double) w.ws_col / 1.5)); + output_options.add_options() + ("output,o", po::value(&outputDir)->default_value("portcullis_out"), + "Output directory. Default: portcullis_out") + ("bam_filter,b", po::bool_switch(&bamFilter)->default_value(false), + "Filter out alignments corresponding with false junctions. Warning: this is time consuming; make sure you really want to do this first!") + ("exon_gff", po::bool_switch(&exongff)->default_value(false), + "Output exon-based junctions in GFF format.") + ("intron_gff", po::bool_switch(&introngff)->default_value(false), + "Output intron-based junctions in GFF format.") + ("source", po::value(&source)->default_value("portcullis"), + "The value to enter into the \"source\" field in GFF files.") + ; + po::options_description prepare_options("Input options", w.ws_col, (unsigned) ((double) w.ws_col / 1.5)); + prepare_options.add_options() + ("force", po::bool_switch(&force)->default_value(false), + "Whether or not to clean the output directory before processing, thereby forcing full preparation of the genome and bam files. By default portcullis will only do what it thinks it needs to.") + ("copy", po::bool_switch(©)->default_value(false), + "Whether to copy files from input data to prepared data where possible, otherwise will use symlinks. Will require more time and disk space to prepare input but is potentially more robust.") + ("keep_temp", po::bool_switch(&keep_temp)->default_value(false), + "Whether keep any temporary files created during the prepare stage of portcullis. This might include BAM files and indexes.") + ("use_csi", po::bool_switch(&useCsi)->default_value(false), + "Whether to use CSI indexing rather than BAI indexing. CSI has the advantage that it supports very long target sequences (probably not an issue unless you are working on huge genomes). BAI has the advantage that it is more widely supported (useful for viewing in genome browsers).") + ; + + po::options_description analysis_options("Analysis options", w.ws_col, (unsigned) ((double) w.ws_col / 1.5)); + analysis_options.add_options() + ("orientation", po::value(&orientation)->default_value(orientationToString(Orientation::UNKNOWN)), + "The orientation of the reads that produced the BAM alignments: \"F\" (Single-end forward orientation); \"R\" (single-end reverse orientation); \"FR\" (paired-end, with reads sequenced towards center of fragment -> <-. 
This is usual setting for most Illumina paired end sequencing); \"RF\" (paired-end, reads sequenced away from center of fragment <- ->); \"FF\" (paired-end, reads both sequenced in forward orientation); \"RR\" (paired-end, reads both sequenced in reverse orientation); \"UNKNOWN\" (default, portcullis will workaround any calculations requiring orientation information)") + ("strandedness", po::value(&strandSpecific)->default_value(strandednessToString(Strandedness::UNKNOWN)), + "Whether BAM alignments were generated using a type of strand specific RNAseq library: \"unstranded\" (Standard Illumina); \"firststrand\" (dUTP, NSR, NNSR); \"secondstrand\" (Ligation, Standard SOLiD, flux sim reads); \"UNKNOWN\" (default, portcullis will workaround any calculations requiring strandedness information)") + ("separate", po::bool_switch(&separate)->default_value(false), + "Separate spliced from unspliced reads.") + ("extra", po::bool_switch(&extra)->default_value(false), + "Calculate additional metrics that take some time to generate. Automatically activates BAM splitting mode (--separate).") + ; + + po::options_description filter_options("Filtering options", w.ws_col, (unsigned) ((double) w.ws_col / 1.5)); + filter_options.add_options() + ("reference,r", po::value(&referenceFile), + "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + ("max_length", po::value(&max_length)->default_value(0), + "Filter junctions longer than this value. Default (0) is to not filter based on length.") + ("canonical", po::value(&canonical)->default_value("OFF"), + "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GT-AG), S = Semi-canonical junctions (AT-AC, or GC-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") + ("min_cov", po::value(&mincov)->default_value(1), + "Only keep junctions with a number of split reads greater than or equal to this number") + ("save_bad", po::bool_switch(&saveBad)->default_value(false), + "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") + ("balanced", po::bool_switch(&balanced)->default_value(false), + "Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.") + ; + + // Hidden options, will be allowed both on command line and + // in config file, but will not be shown to the user. 
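Both --orientation and --strandedness are taken as plain strings and converted later via orientationFromString and strandednessFromString. A hypothetical sketch of such a conversion, with the accepted values taken from the help text above; the real enum and converter live elsewhere in the portcullis sources and may differ:

    #include <boost/algorithm/string.hpp>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Hypothetical enum; enumerators mirror the strings listed in the --orientation help.
    enum class Orientation { F, R, FR, RF, FF, RR, UNKNOWN };

    Orientation orientationFromString(const std::string& s) {
        const std::string u = boost::to_upper_copy(s);
        if (u == "F")  return Orientation::F;
        if (u == "R")  return Orientation::R;
        if (u == "FR") return Orientation::FR;
        if (u == "RF") return Orientation::RF;
        if (u == "FF") return Orientation::FF;
        if (u == "RR") return Orientation::RR;
        if (u == "UNKNOWN") return Orientation::UNKNOWN;
        throw std::invalid_argument("Could not recognise orientation string: " + s);
    }

    int main() {
        std::cout << (orientationFromString("fr") == Orientation::FR ? "FR" : "other")
                  << std::endl;
        return 0;
    }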
+ po::options_description hidden_options("Hidden options"); + hidden_options.add_options() + ("bam-files", po::value< std::vector >(&bamFiles), "Path to the BAM files to process.") + ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") + ; + // Positional option for the input bam file + po::positional_options_description p; + p.add("genome-file", 1); + p.add("bam-files", -1); + // Combine non-positional options for displaying to the user + po::options_description display_options; + display_options.add(system_options).add(output_options).add(prepare_options).add(analysis_options).add(filter_options); + // Combine non-positional options for use at the command line + po::options_description cmdline_options; + cmdline_options.add(display_options).add(hidden_options); + // Parse command line + po::variables_map vm; + po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vm); + po::notify(vm); + // Output help information the exit if requested + if (help || argc <= 1) { + cout << fulltitle() << endl << endl + << fulldescription() << endl << endl + << "Usage: " << fullusage() << endl + << display_options << endl << endl; + return 1; + } + // Acquire path to bam file + if (vm.count("bam-files")) { + bamFiles = vm["bam-files"].as >(); + } + // Acquire path to genome file + if (vm.count("genome-file")) { + genomeFile = vm["genome-file"].as(); + } + // Test if provided genome exists + if (!exists(genomeFile) && !symbolic_link_exists(genomeFile)) { + BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( + "Could not find genome file at: ") + genomeFile.string())); + } + // Glob the input bam files + std::vector transformedBams = Prepare::globFiles(bamFiles); + auto_cpu_timer timer(1, "\nPortcullis completed.\nTotal runtime: %ws\n\n"); + cout << "Running full portcullis pipeline" << endl + << "--------------------------------" << endl << endl; + if (!exists(outputDir)) { + if (!create_directories(outputDir)) { + BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( + "Could not create output directory: ") + outputDir.string())); + } + } + + // ************ Prepare input data (BAMs + genome) *********** + cout << "Preparing input data (BAMs + genome)" << endl + << "----------------------------------" << endl << endl; + path prepDir = path(outputDir.string() + "/1-prep"); + // Create the prepare class + Prepare prep(prepDir); + prep.setForce(force); + prep.setUseLinks(!copy); + prep.setUseCsi(useCsi); + prep.setThreads(threads); + prep.setVerbose(verbose); + // Prep the input to produce a usable indexed and sorted bam plus, indexed + // genome and queryable coverage information + prep.prepare(transformedBams, genomeFile); + + // ************ Identify all junctions and calculate metrics *********** + cout << "Identifying junctions and calculating metrics" << endl + << "---------------------------------------------" << endl << endl; + path juncDir = outputDir.string() + "/2-junc"; + path juncOut = juncDir.string() + "/portcullis_all"; + // Identify junctions and calculate metrics + JunctionBuilder jb(prepDir, juncOut); + jb.setThreads(threads); + jb.setExtra(false); // Run in fast mode + jb.setSeparate(false); // Run in fast mode + jb.setStrandSpecific(strandednessFromString(strandSpecific)); + jb.setOrientation(orientationFromString(orientation)); jb.setExtra(extra); jb.setSeparate(separate); - jb.setSource(source); - jb.setUseCsi(useCsi); - jb.setOutputExonGFF(exongff); - 
jb.setOutputIntronGFF(introngff); - jb.setVerbose(verbose); - jb.process(); - - // ************ Use default filtering strategy ************* - cout << "Filtering junctions" << endl - << "-------------------" << endl << endl; - path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; - path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; - JunctionFilter filter(prepDir, juncTab, filtOut); - filter.setVerbose(verbose); - filter.setSource(source); - filter.setMaxLength(max_length); - filter.setCanonical(canonical); - filter.setMinCov(mincov); - filter.setTrain(true); - filter.setThreads(threads); - filter.setENN(false); - filter.setOutputExonGFF(exongff); - filter.setOutputIntronGFF(introngff); - filter.setSaveBad(saveBad); - filter.filter(); - - // *********** BAM filter ********* - if (bamFilter) { - cout << "Filtering BAMs" << endl - << "--------------" << endl << endl; - path filtJuncTab = path(filtOut.string() + ".pass.junctions.tab"); - path bamFile = path(prepDir.string() + "/portcullis.sorted.alignments.bam"); - path filteredBam = path(outputDir.string() + "/portcullis.filtered.bam"); - BamFilter bamFilter(filtJuncTab.string(), bamFile.string(), filteredBam.string()); - //bamFilter.setStrandSpecific(strandednessFromString(strandSpecific)); - //bamFilter.setOrientation(orientationFromString(orientation)); - bamFilter.setUseCsi(useCsi); - bamFilter.setVerbose(verbose); - bamFilter.filter(); - } - + jb.setSource(source); + jb.setUseCsi(useCsi); + jb.setOutputExonGFF(exongff); + jb.setOutputIntronGFF(introngff); + jb.setVerbose(verbose); + jb.process(); + + // ************ Use default filtering strategy ************* + cout << "Filtering junctions" << endl + << "-------------------" << endl << endl; + path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; + path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; + JunctionFilter filter(prepDir, juncTab, filtOut, !balanced); + filter.setVerbose(verbose); + filter.setSource(source); + filter.setMaxLength(max_length); + filter.setCanonical(canonical); + filter.setMinCov(mincov); + filter.setTrain(true); + filter.setThreads(threads); + filter.setENN(false); + filter.setOutputExonGFF(exongff); + filter.setOutputIntronGFF(introngff); + filter.setSaveBad(saveBad); + filter.filter(); + + // *********** BAM filter ********* + if (bamFilter) { + cout << "Filtering BAMs" << endl + << "--------------" << endl << endl; + path filtJuncTab = path(filtOut.string() + ".pass.junctions.tab"); + path bamFile = path(prepDir.string() + "/portcullis.sorted.alignments.bam"); + path filteredBam = path(outputDir.string() + "/portcullis.filtered.bam"); + BamFilter bamFilter(filtJuncTab.string(), bamFile.string(), filteredBam.string()); + //bamFilter.setStrandSpecific(strandednessFromString(strandSpecific)); + //bamFilter.setOrientation(orientationFromString(orientation)); + bamFilter.setUseCsi(useCsi); + bamFilter.setVerbose(verbose); + bamFilter.filter(); + } + if (!keep_temp) { cout << "Cleaning temporary files..."; - cout.flush(); + cout.flush(); prep.getOutput()->clean(); cout << " done." << endl << endl; } - - return 0; -} + return 0; +} void handler(int sig) { void *array[10]; @@ -404,131 +400,119 @@ void handler(int sig) { exit(1); } - /** * Start point for portcullis. 
*/ int main(int argc, char *argv[]) { - + signal(SIGSEGV, handler); try { - // Portcullis args - string modeStr; - std::vector others; - bool verbose; - bool version; - bool help; - struct winsize w; - ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); - // Declare the supported options. - po::options_description generic_options("Options", w.ws_col, (unsigned)((double)w.ws_col / 1.7)); - generic_options.add_options() - ("verbose,v", po::bool_switch(&verbose)->default_value(false), "Print extra information") - ("version", po::bool_switch(&version)->default_value(false), "Print version string") - ("help", po::bool_switch(&help)->default_value(false), "Produce help message") - ; - // Hidden options, will be allowed both on command line and - // in config file, but will not be shown to the user. - po::options_description hidden_options("Hidden options"); - hidden_options.add_options() - ("mode", po::value(&modeStr), "Portcullis mode.") - ("others", po::value< std::vector >(&others), "Other options.") - ; - // Positional options - po::positional_options_description p; - p.add("mode", 1); - p.add("others", 100); - // Combine non-positional options - po::options_description cmdline_options; - cmdline_options.add(generic_options).add(hidden_options); - // Parse command line - po::variables_map vm; - po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).allow_unregistered().run(), vm); - po::notify(vm); - // Always output version information but exit if version info was all user requested + // Portcullis args + string modeStr; + std::vector others; + bool verbose; + bool version; + bool help; + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + // Declare the supported options. + po::options_description generic_options("Options", w.ws_col, (unsigned) ((double) w.ws_col / 1.7)); + generic_options.add_options() + ("verbose,v", po::bool_switch(&verbose)->default_value(false), "Print extra information") + ("version", po::bool_switch(&version)->default_value(false), "Print version string") + ("help", po::bool_switch(&help)->default_value(false), "Produce help message") + ; + // Hidden options, will be allowed both on command line and + // in config file, but will not be shown to the user. + po::options_description hidden_options("Hidden options"); + hidden_options.add_options() + ("mode", po::value(&modeStr), "Portcullis mode.") + ("others", po::value< std::vector >(&others), "Other options.") + ; + // Positional options + po::positional_options_description p; + p.add("mode", 1); + p.add("others", 100); + // Combine non-positional options + po::options_description cmdline_options; + cmdline_options.add(generic_options).add(hidden_options); + // Parse command line + po::variables_map vm; + po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).allow_unregistered().run(), vm); + po::notify(vm); + // Always output version information but exit if version info was all user requested #ifndef PACKAGE_NAME #define PACKAGE_NAME "Portcullis" #endif #ifndef PACKAGE_VERSION #define PACKAGE_VERSION "1.X" #endif - portcullis::pfs = PortcullisFS(argv[0]); - portcullis::pfs.setVersion(PACKAGE_VERSION); - // End if verbose was requested at this level, outputting file system details. 
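The top-level main() treats the first positional argument as the mode and then hands the remaining arguments to the chosen sub-tool by shifting argv by one (modeArgC = argc - 1, modeArgV = argv + 1, as in the hunk below). A compact sketch of that dispatch pattern, with a stub standing in for JunctionFilter::main and friends:

    #include <iostream>
    #include <string>

    // Stub standing in for JunctionFilter::main, Prepare::main, etc.
    int filterMain(int argc, char* argv[]) {
        std::cout << "filter called with " << (argc - 1) << " argument(s)" << std::endl;
        return 0;
    }

    int main(int argc, char* argv[]) {
        if (argc < 2) {
            std::cerr << "Usage: portcullis <mode> ..." << std::endl;
            return 1;
        }
        const std::string mode = argv[1];

        // Drop the mode itself so the sub-tool sees its own argv[0] plus its options.
        const int modeArgC = argc - 1;
        char** modeArgV = argv + 1;

        if (mode == "filter" || mode == "filt") {
            return filterMain(modeArgC, modeArgV);
        }
        std::cerr << "Could not recognise mode string: " << mode << std::endl;
        return 1;
    }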
- if (verbose) { - cout << endl - << "Project filesystem" << endl - << "------------------" << endl - << portcullis::pfs << endl; - //return 0; - } - // Output help information the exit if requested - if (argc == 1 || (argc == 2 && verbose) || (argc == 2 && help) || (argc == 3 && verbose && help)) { - cout << title() << endl << endl - << description() << endl << endl - << "Modes:" << endl << modes() << endl << endl - << "Usage: " << usage() << endl << endl - << generic_options << endl << endl; - return 1; - } - // End if version was requested. - if (version) { - cout << PACKAGE_NAME << " " << PACKAGE_VERSION << endl; - return 0; - } - else { - cout << "Portcullis V" << PACKAGE_VERSION << endl << endl; - } - // If we've got this far parse the command line properly - Mode mode = parseMode(modeStr); - const int modeArgC = argc - 1; - char** modeArgV = argv + 1; - // Set static variables in downstream subtools so they know where to get their resources from - JunctionFilter::dataDir = portcullis::pfs.getDataDir(); - JunctionSystem::version = portcullis::pfs.getVersion(); - if (mode == Mode::PREP) { - Prepare::main(modeArgC, modeArgV); - } - else if (mode == Mode::JUNC) { - JunctionBuilder::main(modeArgC, modeArgV); - } - else if (mode == Mode::FILTER) { - JunctionFilter::main(modeArgC, modeArgV); - } - else if (mode == Mode::BAM_FILT) { - BamFilter::main(modeArgC, modeArgV); - } - else if (mode == Mode::FULL) { - mainFull(modeArgC, modeArgV); - } - else { - BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( - "Unrecognised portcullis mode: ") + modeStr)); - } - } - catch (po::error& e) { - cerr << "Error: Parsing Command Line: " << e.what() << endl; - return 1; - } - catch (boost::exception& e) { - cerr << boost::diagnostic_information(e); - return 4; - } - catch (std::exception& e) { - cerr << "Error: " << e.what() << endl; - print_backtrace(); - return 5; - } - catch (const char* msg) { - cerr << "Error: " << msg << endl; - print_backtrace(); - return 6; - } - catch (...) { - cerr << "Error: Exception of unknown type!" << endl; - print_backtrace(); - return 7; - } - return 0; + portcullis::pfs = PortcullisFS(argv[0]); + portcullis::pfs.setVersion(PACKAGE_VERSION); + // End if verbose was requested at this level, outputting file system details. + if (verbose) { + cout << endl + << "Project filesystem" << endl + << "------------------" << endl + << portcullis::pfs << endl; + //return 0; + } + // Output help information the exit if requested + if (argc == 1 || (argc == 2 && verbose) || (argc == 2 && help) || (argc == 3 && verbose && help)) { + cout << title() << endl << endl + << description() << endl << endl + << "Modes:" << endl << modes() << endl << endl + << "Usage: " << usage() << endl << endl + << generic_options << endl << endl; + return 1; + } + // End if version was requested. 
+ if (version) { + cout << PACKAGE_NAME << " " << PACKAGE_VERSION << endl; + return 0; + } else { + cout << "Portcullis V" << PACKAGE_VERSION << endl << endl; + } + // If we've got this far parse the command line properly + Mode mode = parseMode(modeStr); + const int modeArgC = argc - 1; + char** modeArgV = argv + 1; + // Set static variables in downstream subtools so they know where to get their resources from + JunctionFilter::dataDir = portcullis::pfs.getDataDir(); + JunctionSystem::version = portcullis::pfs.getVersion(); + if (mode == Mode::PREP) { + Prepare::main(modeArgC, modeArgV); + } else if (mode == Mode::JUNC) { + JunctionBuilder::main(modeArgC, modeArgV); + } else if (mode == Mode::FILTER) { + JunctionFilter::main(modeArgC, modeArgV); + } else if (mode == Mode::BAM_FILT) { + BamFilter::main(modeArgC, modeArgV); + } else if (mode == Mode::FULL) { + mainFull(modeArgC, modeArgV); + } else { + BOOST_THROW_EXCEPTION(PortcullisException() << PortcullisErrorInfo(string( + "Unrecognised portcullis mode: ") + modeStr)); + } + } catch (po::error& e) { + cerr << "Error: Parsing Command Line: " << e.what() << endl; + return 1; + } catch (boost::exception& e) { + cerr << boost::diagnostic_information(e); + return 4; + } catch (std::exception& e) { + cerr << "Error: " << e.what() << endl; + print_backtrace(); + return 5; + } catch (const char* msg) { + cerr << "Error: " << msg << endl; + print_backtrace(); + return 6; + } catch (...) { + cerr << "Error: Exception of unknown type!" << endl; + print_backtrace(); + return 7; + } + return 0; } From 8680bcc3a4f8a7a04e1086c7b7a4e469ff735fd8 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 21 Jul 2018 15:06:28 +0100 Subject: [PATCH 05/15] Fixing issue when installing filtering rule files. --- Makefile.am | 9 +++++++-- deps/boost/.gitignore | 1 + .../checks/architecture/bin/gcc-gnu-7/debug/.gitignore | 2 ++ deps/boost/tools/build/src/engine/bootstrap/.gitignore | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 deps/boost/libs/config/checks/architecture/bin/gcc-gnu-7/debug/.gitignore create mode 100644 deps/boost/tools/build/src/engine/bootstrap/.gitignore diff --git a/Makefile.am b/Makefile.am index f9aaf6c7..d990f025 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,7 +21,9 @@ dist_noinst_SCRIPTS = autogen.sh antigen.sh build_boost.sh configdir = $(datadir)/portcullis dist_config_DATA = \ data/default_filter.json \ - data/low_juncs_filter.json \ + data/low_juncs_filter.json +balanceddir = $(datadir)/portcullis/balanced +dist_balanced_DATA = \ data/balanced/selftrain_initial_neg.layer1.json \ data/balanced/selftrain_initial_neg.layer2.json \ data/balanced/selftrain_initial_neg.layer3.json \ @@ -31,7 +33,10 @@ dist_config_DATA = \ data/balanced/selftrain_initial_neg.layer7.json \ data/balanced/selftrain_initial_pos.layer1.json \ data/balanced/selftrain_initial_pos.layer2.json \ - data/balanced/selftrain_initial_pos.layer3.json \ + data/balanced/selftrain_initial_pos.layer3.json + +precisedir = $(datadir)/portcullis/precise +dist_precise_DATA = \ data/precise/selftrain_initial_neg.layer1.json \ data/precise/selftrain_initial_neg.layer2.json \ data/precise/selftrain_initial_neg.layer3.json \ diff --git a/deps/boost/.gitignore b/deps/boost/.gitignore index 2a4706da..aedcf7ed 100644 --- a/deps/boost/.gitignore +++ b/deps/boost/.gitignore @@ -7,3 +7,4 @@ /project-config.jam.4 *.log /project-config.jam.5 +/project-config.jam.6 diff --git a/deps/boost/libs/config/checks/architecture/bin/gcc-gnu-7/debug/.gitignore 
b/deps/boost/libs/config/checks/architecture/bin/gcc-gnu-7/debug/.gitignore new file mode 100644 index 00000000..56c888d2 --- /dev/null +++ b/deps/boost/libs/config/checks/architecture/bin/gcc-gnu-7/debug/.gitignore @@ -0,0 +1,2 @@ +/64.o +/x86.o diff --git a/deps/boost/tools/build/src/engine/bootstrap/.gitignore b/deps/boost/tools/build/src/engine/bootstrap/.gitignore new file mode 100644 index 00000000..ca6d17cc --- /dev/null +++ b/deps/boost/tools/build/src/engine/bootstrap/.gitignore @@ -0,0 +1 @@ +/jam0 From 0964f2fd071b0fe5731f2fbc5919f300eb25174b Mon Sep 17 00:00:00 2001 From: maplesond Date: Sun, 22 Jul 2018 12:54:10 +0100 Subject: [PATCH 06/15] Fixing a bug when combining negative sets. Disabling balanced CLI option. --- scripts/portcullis/portcullis/rule_filter.py | 5 ++++- src/junction_filter.cc | 6 +++--- src/portcullis.cc | 6 +++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/portcullis/portcullis/rule_filter.py b/scripts/portcullis/portcullis/rule_filter.py index 357e0a3c..be642a66 100644 --- a/scripts/portcullis/portcullis/rule_filter.py +++ b/scripts/portcullis/portcullis/rule_filter.py @@ -272,7 +272,6 @@ def create_training_sets(args): neg_juncs[-1].to_csv(args.prefix + ".neg_layer_" + str(i) + ".tab", sep='\t') - neg_length_limit = int(L95 * 10) print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 10:", neg_length_limit) neg_juncs.append(df.loc[df["size"] > neg_length_limit]) @@ -285,6 +284,9 @@ def create_training_sets(args): neg_juncs[-1].to_csv(args.prefix + ".neg_layer_intronsize.tab", sep='\t') neg_set = pd.concat(neg_juncs) + remaining = original.reset_index().merge(neg_set, indicator=True, how='inner').set_index('index') + neg_set = remaining.loc[remaining['_merge'] == 'both'] + del neg_set['_merge'] print() print("Negative set contains:", len(neg_set), "junctions") @@ -302,6 +304,7 @@ def create_training_sets(args): training = pd.concat([pos_juncs, neg_set]) remaining = original.reset_index().merge(training, indicator=True, how='outer').set_index('index') others = remaining.loc[remaining['_merge'] == 'left_only'] + del others['_merge'] other_file = args.prefix + ".others.tab" others.to_csv(other_file, sep='\t') print("done. File saved to:", other_file) diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 66ec2d57..ddc55ffe 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -700,8 +700,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "Only keep junctions with a number of split reads greater than or equal to this number") ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), "The threshold score at which we determine a junction to be genuine or not. Increase value towards 1.0 to increase precision, decrease towards 0.0 to increase sensitivity. We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.") - ("balanced", po::bool_switch(&balanced)->default_value(false), - "Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.") + //("balanced", po::bool_switch(&balanced)->default_value(false), + //"Uses rules that should provide a balanced training set of positive and negative junctions. 
By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.") ; // Hidden options, will be allowed both on command line and // in config file, but will not be shown to the user. @@ -746,7 +746,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { cout << "Running portcullis in junction filter mode" << endl << "------------------------------------------" << endl << endl; // Create the prepare class - JunctionFilter filter(prepDir, junctionFile, output, !balanced); + JunctionFilter filter(prepDir, junctionFile, output, false); filter.setSaveBad(saveBad); filter.setSource(source); filter.setVerbose(verbose); diff --git a/src/portcullis.cc b/src/portcullis.cc index f634c84f..83e49c76 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -248,8 +248,8 @@ int mainFull(int argc, char *argv[]) { "Only keep junctions with a number of split reads greater than or equal to this number") ("save_bad", po::bool_switch(&saveBad)->default_value(false), "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") - ("balanced", po::bool_switch(&balanced)->default_value(false), - "Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.") + //("balanced", po::bool_switch(&balanced)->default_value(false), + //"Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. 
Setting to balanced tends to work better on smaller datasets with less coverage.") ; // Hidden options, will be allowed both on command line and @@ -347,7 +347,7 @@ int mainFull(int argc, char *argv[]) { << "-------------------" << endl << endl; path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; - JunctionFilter filter(prepDir, juncTab, filtOut, !balanced); + JunctionFilter filter(prepDir, juncTab, filtOut, false); filter.setVerbose(verbose); filter.setSource(source); filter.setMaxLength(max_length); From df21fe79e3c8656bcd461642f1118a28cf5aadf4 Mon Sep 17 00:00:00 2001 From: maplesond Date: Mon, 23 Jul 2018 21:05:52 +0100 Subject: [PATCH 07/15] Removing duplicates from negative set --- scripts/portcullis/portcullis/rule_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/portcullis/portcullis/rule_filter.py b/scripts/portcullis/portcullis/rule_filter.py index be642a66..05520a82 100644 --- a/scripts/portcullis/portcullis/rule_filter.py +++ b/scripts/portcullis/portcullis/rule_filter.py @@ -283,7 +283,7 @@ def create_training_sets(args): if args.save_layers: neg_juncs[-1].to_csv(args.prefix + ".neg_layer_intronsize.tab", sep='\t') - neg_set = pd.concat(neg_juncs) + neg_set = pd.concat(neg_juncs).drop_duplicates() remaining = original.reset_index().merge(neg_set, indicator=True, how='inner').set_index('index') neg_set = remaining.loc[remaining['_merge'] == 'both'] del neg_set['_merge'] From 2da840e39415a698bbe960a819a8e5a55ed99e1b Mon Sep 17 00:00:00 2001 From: maplesond Date: Thu, 26 Jul 2018 22:03:50 +0100 Subject: [PATCH 08/15] Testing more relaxed L95 filtering for training set. --- scripts/portcullis/portcullis/rule_filter.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/portcullis/portcullis/rule_filter.py b/scripts/portcullis/portcullis/rule_filter.py index 05520a82..7d49c955 100644 --- a/scripts/portcullis/portcullis/rule_filter.py +++ b/scripts/portcullis/portcullis/rule_filter.py @@ -198,11 +198,12 @@ def create_training_sets(args): pos_intron_sizes = pos_juncs["size"].tolist() pos_intron_sizes.sort(key=int) - L95 = pos_intron_sizes[int(len(pos_intron_sizes) * 0.95)] - pos_length_limit = int(L95 * 1.2) + L95_pos = int(len(pos_intron_sizes) * 0.95) + L95 = pos_intron_sizes[L95_pos] + pos_length_limit = L95 # int(L95 * 1.2) - print("Intron size L95 =", L95, " positive set maximum intron size limit set to L95 x 1.2:", pos_length_limit) + print("Intron size at L95 =", L95) # Also save this to file as we'll need it back in the C program with open(args.prefix + ".L95_intron_size.txt", 'w') as l95out: @@ -272,8 +273,8 @@ def create_training_sets(args): neg_juncs[-1].to_csv(args.prefix + ".neg_layer_" + str(i) + ".tab", sep='\t') - neg_length_limit = int(L95 * 10) - print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 10:", neg_length_limit) + neg_length_limit = int(L95 * 20) + print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 20:", neg_length_limit) neg_juncs.append(df.loc[df["size"] > neg_length_limit]) if args.genuine: print(str(i+1) + "\t" + calcPerformance(neg_juncs[-1], df).longStr()) From 7f0e22ecbc192e98014e6046086b6636836bd628 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 28 Jul 2018 16:29:03 +0100 Subject: [PATCH 09/15] Fixing a bug which occurs using GCC7, due to not handling return by reference as expected in KNN code. 
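For reference, the L95 logic that the rule_filter.py commits above keep retuning reduces to a percentile lookup on the positive-set intron sizes, from which the positive and negative length limits are derived. The sketch below is illustrative only and is not part of any patch: the intron sizes are invented, and the 1.2x and 20x multipliers are just one of the settings tried across this series (other commits use 10x and 8x).

    # Illustrative only: mirrors the L95 computation in rule_filter.py.
    pos_intron_sizes = sorted([76, 81, 90, 110, 150, 230, 480, 800, 1500, 4200])
    L95 = pos_intron_sizes[int(len(pos_intron_sizes) * 0.95)]  # 95th-percentile intron size
    pos_length_limit = int(L95 * 1.2)  # positives: introns no longer than ~L95 x 1.2
    neg_length_limit = int(L95 * 20)   # negatives: only introns far beyond L95
    print(L95, pos_length_limit, neg_length_limit)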
--- .gitignore | 1 + deps/ranger-0.3.8/.gitignore | 1 + lib/include/portcullis/ml/knn.hpp | 6 +++--- lib/src/knn.cc | 12 +++++++----- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index bef400ae..ab91e7b2 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,4 @@ callgrind* /portcullis.files /portcullis.includes /portcullis.creator.user +/conftest.cpp diff --git a/deps/ranger-0.3.8/.gitignore b/deps/ranger-0.3.8/.gitignore index ec54341b..7c330f6a 100644 --- a/deps/ranger-0.3.8/.gitignore +++ b/deps/ranger-0.3.8/.gitignore @@ -9,3 +9,4 @@ config.* /libtool *.la /ranger.pc +/conftest.cpp diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp index 28646a2c..604d7f51 100644 --- a/lib/include/portcullis/ml/knn.hpp +++ b/lib/include/portcullis/ml/knn.hpp @@ -55,7 +55,7 @@ class KNN { size_t rows; size_t cols; - vector>> results; + vector> results; void doSlice( uint16_t slice ); @@ -87,12 +87,12 @@ class KNN { this->verbose = verbose; } - const vector>>& getResults() const { + const vector>& getResults() const { return results; } const vector& getNNs(size_t index) const { - return *results[index]; + return results[index]; } void execute(); diff --git a/lib/src/knn.cc b/lib/src/knn.cc index bbd6c375..ccaac79e 100644 --- a/lib/src/knn.cc +++ b/lib/src/knn.cc @@ -36,6 +36,10 @@ portcullis::ml::KNN::KNN(uint16_t defaultK, uint16_t _threads, double* _data, si threads = _threads; verbose = false; results.resize(_rows); + for (size_t i = 0; i < results.size(); i++) { + results[i].resize(k); + } + } void portcullis::ml::KNN::doSlice(uint16_t slice) { @@ -79,12 +83,10 @@ void portcullis::ml::KNN::doSlice(uint16_t slice) { } // Store final results for (uint32_t testidx = start; testidx <= end; testidx++) { - shared_ptr> knn = make_shared>(); for (size_t i = 0; i < k; i++) { uint32_t index = rbuf[((testidx - start) * k) + i]; - knn->push_back(index); - } - results[testidx] = knn; + results[testidx][i] = index; + } } } @@ -105,7 +107,7 @@ void portcullis::ml::KNN::execute() { void portcullis::ml::KNN::print(ostream& out) { for (auto & i : results) { - for (auto j : *i) { + for (auto j : i) { out << j << " "; } out << endl; From e376b0aa0d8bfe310a8a812493448dcc3d40afa5 Mon Sep 17 00:00:00 2001 From: maplesond Date: Sat, 28 Jul 2018 17:41:11 +0100 Subject: [PATCH 10/15] Making output from rule filtering negative sets more similar to V1.0 --- scripts/portcullis/portcullis/rule_filter.py | 63 ++++++++++---------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/scripts/portcullis/portcullis/rule_filter.py b/scripts/portcullis/portcullis/rule_filter.py index 7d49c955..6bbe1d50 100644 --- a/scripts/portcullis/portcullis/rule_filter.py +++ b/scripts/portcullis/portcullis/rule_filter.py @@ -37,7 +37,7 @@ def load_genuine(genuine_file): return glist -def json2pandas(handle, fieldnames): +def json2pandas(handle, fieldnames, data_frame): '''This function will load the JSON configuration, check that both parameters and expression are defined, and convert to a pandas string that will operate on a dataframe call df @@ -86,17 +86,17 @@ def json2pandas(handle, fieldnames): for key in keys: k = key[:-2] if key[-2] == '.' 
and key[-1].isdigit() else key if json_dict['parameters'][key]['operator'] == "in": - newexpr = re.sub(key, "(df[\"{0}\"].isin({2}))".format(k, replace_op(json_dict['parameters'][key]['operator']), - json_dict['parameters'][key]['value']), newexpr) + newexpr = re.sub(key, "({3}[\"{0}\"].isin({2}))".format(k, replace_op(json_dict['parameters'][key]['operator']), + json_dict['parameters'][key]['value'], data_frame), newexpr) elif json_dict['parameters'][key]['operator'] == "not in": newexpr = re.sub(key, - "(~df[\"{0}\"].isin({2}))".format(k, replace_op(json_dict['parameters'][key]['operator']), - json_dict['parameters'][key]['value']), newexpr) + "(~{3}[\"{0}\"].isin({2}))".format(k, replace_op(json_dict['parameters'][key]['operator']), + json_dict['parameters'][key]['value'], data_frame), newexpr) else: - newexpr = re.sub(key, "(df[\"{0}\"] {1} {2})".format(k, replace_op(json_dict['parameters'][key]['operator']), - json_dict['parameters'][key]['value']), newexpr) + newexpr = re.sub(key, "({3}[\"{0}\"] {1} {2})".format(k, replace_op(json_dict['parameters'][key]['operator']), + json_dict['parameters'][key]['value'], data_frame), newexpr) - return "df.loc[" + newexpr + "]" + return data_frame + ".loc[" + newexpr + "]", data_frame + ".loc[~" + newexpr + "]" def calcPerformance(passed, failed, invert=False): @@ -163,12 +163,12 @@ def create_training_sets(args): print(str(i) + "\t", end="", flush=True) # Create pandas command - pandas_cmd = json2pandas(open(json_file), fieldnames) + pandas_cmd_in, pandas_cmd_out = json2pandas(open(json_file), fieldnames, "df") # print(pandas_cmd) # Execute the pandas command, result should be a filtered dataframe - pos_juncs = eval(pandas_cmd) + pos_juncs = eval(pandas_cmd_in) nb_not_pos = len(original) - len(pos_juncs) @@ -200,10 +200,9 @@ def create_training_sets(args): pos_intron_sizes.sort(key=int) L95_pos = int(len(pos_intron_sizes) * 0.95) L95 = pos_intron_sizes[L95_pos] - pos_length_limit = L95 # int(L95 * 1.2) + pos_length_limit = int(L95 * 1.2) - - print("Intron size at L95 =", L95) + print("Intron size at L95 =", L95, " positive set maximum intron size limit set to L95 x 1.2:", pos_length_limit) # Also save this to file as we'll need it back in the C program with open(args.prefix + ".L95_intron_size.txt", 'w') as l95out: @@ -246,8 +245,8 @@ def create_training_sets(args): else: print("PASS\tFAIL") - neg_juncs = [] - df = not_pos_juncs + neg_set = None + other_juncs = not_pos_juncs # Run through layers of logic to get the positive set for i, json_file in enumerate(args.neg_json, start=1): @@ -255,39 +254,40 @@ def create_training_sets(args): print(str(i) + "\t", end="", flush=True) # Create pandas command - pandas_cmd = json2pandas(open(json_file), fieldnames) + pandas_cmd_in, pandas_cmd_out = json2pandas(open(json_file), fieldnames, "other_juncs") #print(pandas_cmd) # Execute the pandas command, result should be a filtered dataframe - neg_juncs.append(eval(pandas_cmd)) + neg_juncs = eval(pandas_cmd_in) + other_juncs = eval(pandas_cmd_out) - nb_not_neg = len(df) - len(neg_juncs[-1]) + if i > 1: + neg_set = pd.concat([neg_set, neg_juncs]) + else: + neg_set = neg_juncs if args.genuine: - print(calcPerformance(neg_juncs[-1], df).longStr()) + print(calcPerformance(neg_juncs, df).longStr()) else: - print(str(len(neg_juncs[-1])) + "\t" + str(nb_not_neg)) + print(str(len(neg_juncs)) + "\t" + str(len(other_juncs))) if args.save_layers: - neg_juncs[-1].to_csv(args.prefix + ".neg_layer_" + str(i) + ".tab", sep='\t') + neg_juncs.to_csv(args.prefix + ".neg_layer_" 
+ str(i) + ".tab", sep='\t') neg_length_limit = int(L95 * 20) print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 20:", neg_length_limit) - neg_juncs.append(df.loc[df["size"] > neg_length_limit]) + neg_juncs = other_juncs.loc[other_juncs["size"] > neg_length_limit] + neg_set = pd.concat([neg_set, neg_juncs]) if args.genuine: - print(str(i+1) + "\t" + calcPerformance(neg_juncs[-1], df).longStr()) + print(str(i+1) + "\t" + calcPerformance(neg_juncs, df).longStr()) else: - print(str(i+1) + "\t" + str(len(neg_juncs[-1])) + "\t" + str(len(df) - len(neg_juncs[-1]))) + print(str(i+1) + "\t" + str(len(neg_juncs)) + "\t" + str(len(other_juncs))) if args.save_layers: - neg_juncs[-1].to_csv(args.prefix + ".neg_layer_intronsize.tab", sep='\t') + neg_juncs.to_csv(args.prefix + ".neg_layer_intronsize.tab", sep='\t') - neg_set = pd.concat(neg_juncs).drop_duplicates() - remaining = original.reset_index().merge(neg_set, indicator=True, how='inner').set_index('index') - neg_set = remaining.loc[remaining['_merge'] == 'both'] - del neg_set['_merge'] print() print("Negative set contains:", len(neg_set), "junctions") @@ -296,6 +296,7 @@ def create_training_sets(args): del neg_set["genuine"] print("Saving negative set to disk ... ", end="", flush=True) + neg_set.sort_index(inplace=True) neg_file = args.prefix + ".neg.junctions.tab" neg_set.to_csv(neg_file, sep='\t') print("done. File saved to:", neg_file) @@ -333,12 +334,12 @@ def filter_one(args): fieldnames = [key for key in dict(original.dtypes)] # Create pandas command - pandas_cmd = json2pandas(open(args.json), fieldnames) + pandas_cmd_in, pandas_cmd_out = json2pandas(open(args.json), fieldnames, "df") # print(pandas_cmd) # Execute the pandas command, result should be a filtered dataframe - passed = eval(pandas_cmd) + passed = eval(pandas_cmd_in) print("After filtering", len(passed), "junctions remain.") From cc87e79a98030263c300e3f7aaea8122e7563943 Mon Sep 17 00:00:00 2001 From: maplesond Date: Wed, 1 Aug 2018 02:32:41 +0100 Subject: [PATCH 11/15] Adding verbosity to smote --- lib/include/portcullis/ml/knn.hpp | 4 ---- lib/src/model_features.cc | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp index 604d7f51..09ad46a2 100644 --- a/lib/include/portcullis/ml/knn.hpp +++ b/lib/include/portcullis/ml/knn.hpp @@ -67,10 +67,6 @@ class KNN { return k; } - void setK(uint16_t k) { - this->k = k; - } - uint16_t getThreads() const { return threads; } diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index e2de139c..897477f3 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -277,6 +277,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun //cout << endl; } Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols() - 1); + smote.setVerbose(verbose); smote.execute(); smote_rows = smote.getNbSynthRows(); smote_data = new double[smote_rows * SC]; From fd384acf458944bb2a5e08c6508fa109ff8ba1b2 Mon Sep 17 00:00:00 2001 From: maplesond Date: Wed, 1 Aug 2018 02:42:10 +0100 Subject: [PATCH 12/15] Altering tree count in forest. 
--- src/junction_filter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index c2b63907..da31b08a 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -80,7 +80,7 @@ namespace portcullis { const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; - const uint16_t DEFAULT_SELFTRAIN_TREES = 100; + const uint16_t DEFAULT_SELFTRAIN_TREES = 250; const double DEFAULT_FILTER_THRESHOLD = 0.5; class JunctionFilter { From 5ac7173ae7a2077b7000a8e7eeae3b2f5b97b3c4 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Sat, 4 Aug 2018 15:26:24 +0100 Subject: [PATCH 13/15] Fixing intronsize filtering step when creating the negative training set. Also added in functionality to save out layers --- .../boost/tools/build/src/engine/bootstrap/.gitignore | 1 - scripts/portcullis/portcullis/rule_filter.py | 7 ++++--- src/junction_filter.cc | 8 ++++++++ src/junction_filter.hpp | 11 ++++++++++- src/portcullis.cc | 8 ++++++++ 5 files changed, 30 insertions(+), 5 deletions(-) delete mode 100644 deps/boost/tools/build/src/engine/bootstrap/.gitignore diff --git a/deps/boost/tools/build/src/engine/bootstrap/.gitignore b/deps/boost/tools/build/src/engine/bootstrap/.gitignore deleted file mode 100644 index ca6d17cc..00000000 --- a/deps/boost/tools/build/src/engine/bootstrap/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/jam0 diff --git a/scripts/portcullis/portcullis/rule_filter.py b/scripts/portcullis/portcullis/rule_filter.py index 6bbe1d50..a57996e8 100644 --- a/scripts/portcullis/portcullis/rule_filter.py +++ b/scripts/portcullis/portcullis/rule_filter.py @@ -276,9 +276,10 @@ def create_training_sets(args): neg_juncs.to_csv(args.prefix + ".neg_layer_" + str(i) + ".tab", sep='\t') - neg_length_limit = int(L95 * 20) - print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 20:", neg_length_limit) + neg_length_limit = int(L95 * 8) + print("Intron size L95 =", L95, "negative set will use junctions with intron size over L95 x 8:", neg_length_limit, "and with maxmmes < 12") neg_juncs = other_juncs.loc[other_juncs["size"] > neg_length_limit] + neg_juncs = neg_juncs.loc[neg_juncs["maxmmes"] < 12] neg_set = pd.concat([neg_set, neg_juncs]) if args.genuine: print(str(i+1) + "\t" + calcPerformance(neg_juncs, df).longStr()) @@ -388,4 +389,4 @@ def main(): -if __name__=='__main__': main() \ No newline at end of file +if __name__=='__main__': main() diff --git a/src/junction_filter.cc b/src/junction_filter.cc index ddc55ffe..45c85b82 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -244,6 +244,10 @@ void portcullis::JunctionFilter::filter() { args.push_back(ruleset + "/selftrain_initial_neg.layer7.json"); args.push_back("--prefix=" + output.string() + ".selftrain.initialset"); + + if (this->saveLayers) { + args.push_back("--save_layers"); + } args.push_back(junctionFile.string()); char* char_args[50]; @@ -648,6 +652,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { bool no_ml; bool saveBad; bool save_features; + bool save_layers; bool exongff; bool introngff; uint32_t max_length; @@ -719,6 +724,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. 
See manual for more details.") ("save_features", po::bool_switch(&save_features)->default_value(false), "Use this flag to save features (both for training set and again for all junctions) to disk.") + ("save_layers", po::bool_switch(&save_layers)->default_value(false), + "Use this flag to save to disk each layer produced when creating the training set.") ; // Positional option for the input bam file po::positional_options_description p; @@ -768,6 +775,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } } filter.setSaveFeatures(save_features); + filter.setSaveLayers(save_layers); filter.setReferenceFile(referenceFile); filter.setThreshold(threshold); filter.setSmote(!no_smote); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index da31b08a..8640505e 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -97,6 +97,7 @@ namespace portcullis { uint16_t threads; bool saveBad; bool saveFeatures; + bool saveLayers; bool outputExonGFF; bool outputIntronGFF; uint32_t maxLength; @@ -192,7 +193,7 @@ namespace portcullis { this->train = train; } - bool isSaveBad() const { + bool doSaveBad() const { return saveBad; } @@ -200,6 +201,14 @@ namespace portcullis { this->saveBad = saveBad; } + bool doSaveLayers() const { + return saveLayers; + } + + void setSaveLayers(bool saveLayers) { + this->saveLayers = saveLayers; + } + bool isOutputExonGFF() const { return outputExonGFF; } diff --git a/src/portcullis.cc b/src/portcullis.cc index 83e49c76..8748830d 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -185,6 +185,8 @@ int mainFull(int argc, char *argv[]) { uint32_t max_length; uint32_t mincov; string canonical; + bool save_layers; + bool save_features; bool balanced; bool verbose; bool help; @@ -258,6 +260,10 @@ int mainFull(int argc, char *argv[]) { hidden_options.add_options() ("bam-files", po::value< std::vector >(&bamFiles), "Path to the BAM files to process.") ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") + ("save_features", po::bool_switch(&save_features)->default_value(false), + "Use this flag to save features (both for training set and again for all junctions) to disk.") + ("save_layers", po::bool_switch(&save_layers)->default_value(false), + "Use this flag to save to disk each layer produced when creating the training set.") ; // Positional option for the input bam file po::positional_options_description p; @@ -359,6 +365,8 @@ int mainFull(int argc, char *argv[]) { filter.setOutputExonGFF(exongff); filter.setOutputIntronGFF(introngff); filter.setSaveBad(saveBad); + filter.setSaveLayers(save_layers); + filter.setSaveFeatures(save_features); filter.filter(); // *********** BAM filter ********* From 235429e99d332ae1bdc0e60838b6084c36fed279 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Sat, 4 Aug 2018 20:33:55 +0100 Subject: [PATCH 14/15] Updating affiliation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 46af83a5..069ce225 100644 --- a/README.md +++ b/README.md @@ -108,5 +108,5 @@ See AUTHORS file for more details. 
Acknowledgements ---------------- -Affiliation: The Genome Analysis Centre (TGAC) +Affiliation: The Earlham Institute (EI) Funding: The Biotechnology and Biological Sciences Research Council (BBSRC) From 2b8f6b8940021fd9d4c17532f603660936305e82 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Sat, 4 Aug 2018 20:45:57 +0100 Subject: [PATCH 15/15] Adding debug info to KNN --- lib/include/portcullis/ml/knn.hpp | 3 +++ lib/src/knn.cc | 14 +++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp index 09ad46a2..200948c7 100644 --- a/lib/include/portcullis/ml/knn.hpp +++ b/lib/include/portcullis/ml/knn.hpp @@ -88,6 +88,9 @@ class KNN { } const vector& getNNs(size_t index) const { + if (index >= results.size()) { + std::cerr << "ERROR: Can't request KNNs of item " << index << " as this doesn't exist." << std::endl; + } return results[index]; } diff --git a/lib/src/knn.cc b/lib/src/knn.cc index ccaac79e..0d6d8d5e 100644 --- a/lib/src/knn.cc +++ b/lib/src/knn.cc @@ -20,6 +20,7 @@ #include using std::ostream; using std::cout; +using std::cerr; using std::endl; using std::make_shared; @@ -37,7 +38,7 @@ portcullis::ml::KNN::KNN(uint16_t defaultK, uint16_t _threads, double* _data, si verbose = false; results.resize(_rows); for (size_t i = 0; i < results.size(); i++) { - results[i].resize(k); + results[i].resize(k, 0); } } @@ -81,10 +82,17 @@ void portcullis::ml::KNN::doSlice(uint16_t slice) { rbuf[ri + i] = baseidx; } } - // Store final results + // Store final results (we could probably optimise this part away by replacing + // rbuf with results) for (uint32_t testidx = start; testidx <= end; testidx++) { for (size_t i = 0; i < k; i++) { uint32_t index = rbuf[((testidx - start) * k) + i]; + if (testidx >= results.size()) { + cerr << "ERROR: item has index larger than results array" << endl; + } + if (i >= results[testidx].size()) { + cerr << "ERROR: item is trying to set a value larger than k" << endl; + } results[testidx][i] = index; } } @@ -112,4 +120,4 @@ void portcullis::ml::KNN::print(ostream& out) { } out << endl; } -} \ No newline at end of file +}
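As a closing note, the KNN changes in this final patch amount to preallocating a rows x k results table up front and guarding each read and write with an index check, instead of building per-row vectors behind shared pointers. The following is a purely illustrative Python sketch of that bookkeeping, not a port of the C++ class: it uses a brute-force distance scan in place of the threaded slice logic, and the assertions mirror the new error messages only in spirit.

    import numpy as np

    def knn_indices(data, k):
        """Return a preallocated rows x k table of nearest-neighbour indices."""
        rows = data.shape[0]
        results = np.zeros((rows, k), dtype=np.uint32)      # like results[i].resize(k, 0)
        for i in range(rows):
            dists = np.linalg.norm(data - data[i], axis=1)  # distance to every row
            dists[i] = np.inf                               # never pick the point itself
            nearest = np.argsort(dists)[:k]
            assert i < results.shape[0] and len(nearest) <= k  # mirrors the bounds checks
            results[i, :len(nearest)] = nearest
        return results

    print(knn_indices(np.random.rand(10, 4), 3))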