From da4e07b2ea089463935599d7277af747848ceb10 Mon Sep 17 00:00:00 2001 From: "Daniel Mapleson (TGAC)" Date: Tue, 3 May 2016 10:23:41 +0100 Subject: [PATCH 01/21] Updates to pipeline --- scripts/hpc.json | 27 +++++++++- scripts/predict.snakefile | 104 ++++++++++++++++++++++++++++++++----- scripts/read_gen.snakefile | 9 ++-- scripts/snakey.py | 2 +- 4 files changed, 122 insertions(+), 20 deletions(-) diff --git a/scripts/hpc.json b/scripts/hpc.json index 73d7c907..c38ec995 100755 --- a/scripts/hpc.json +++ b/scripts/hpc.json @@ -2,13 +2,20 @@ "__default__" : { "threads" : "4", - "memory" : "5000", - "queue" : "Test" + "memory" : "5000" + }, + "align_gsnap_index" : + { + "memory" : "20000" }, "align_gsnap" : { "memory" : "30000" }, + "align_star_index" : + { + "memory" : "60000" + }, "align_star" : { "memory" : "60000" @@ -33,14 +40,30 @@ { "memory" : "30000" }, + "portcullis_prep" : + { + "memory" : "10000" + }, "portcullis_junc" : { "memory" : "25000" }, + "portcullis_junc2" : + { + "memory" : "25000" + }, + "portcullis_prep2" : + { + "memory" : "10000" + }, "truesight" : { "memory": "60000" }, + "soapsplice" : + { + "memory": "60000" + }, "finesplice" : { "memory": "20000" diff --git a/scripts/predict.snakefile b/scripts/predict.snakefile index 99f780df..cf44867d 100644 --- a/scripts/predict.snakefile +++ b/scripts/predict.snakefile @@ -38,6 +38,7 @@ LOAD_PORTCULLIS = config["load_portcullis"] LOAD_SPANKI = config["load_spanki"] LOAD_FINESPLICE = config["load_finesplice"] LOAD_TRUESIGHT = config["load_truesight"] +LOAD_SOAPSPLICE = config["load_soapsplice"] LOAD_PYTHON3 = config["load_python3"] LOAD_GT = config["load_gt"] LOAD_MIKADO = config["load_mikado"] @@ -72,6 +73,8 @@ JUNC_DIR = OUT_DIR + "/junctions" JUNC_DIR_FULL = os.path.abspath(JUNC_DIR) PORTCULLIS_DIR = JUNC_DIR + "/portcullis" PORTCULLIS_DIR_FULL = os.path.abspath(PORTCULLIS_DIR) +PORTCULLIS_DIR2 = JUNC_DIR + "/portcullis2" +PORTCULLIS_DIR2_FULL = os.path.abspath(PORTCULLIS_DIR2) CWD = os.getcwd() @@ 
-85,7 +88,7 @@ ISOFORM_FRACTION = {"permissive": 0.01, "semipermissive": 0.03, "default": 0.1, ######################### Rules -localrules: all, truesight2bed +localrules: all, truesight2bed, soapsplice2bed # Define rule all: @@ -95,7 +98,9 @@ rule all: expand(JUNC_DIR+"/output/tophat-{reads}-spanki.bed", reads=INPUT_SETS), expand(JUNC_DIR+"/output/tophat-{reads}-finesplice.bed", reads=INPUT_SETS), expand(JUNC_DIR+"/output/truesight-{reads}-truesight.bed", reads=INPUT_SETS), - expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed", asm_method=ASSEMBLY_METHODS, asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) + expand(JUNC_DIR+"/output/soapsplice-{reads}-soapsplice.bed", reads=INPUT_SETS), + expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed", asm_method=ASSEMBLY_METHODS, asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS), + #expand(PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc/{aln_method}-{reads}.junctions.tab", aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) # expand(ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.stats", asm_method=ASSEMBLY_METHODS, asm_mode=ASSEMBLY_MODES, aln_method=ALIGNMENT_METHODS, reads=INPUT_SETS) @@ -269,7 +274,9 @@ rule portcullis_prep: ref=REF, bam=rules.bam_sort.output, idx=rules.bam_index.output - output: PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai" + output: + bai=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai", + fa=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep/portcullis.genome.fa" params: outdir=PORTCULLIS_DIR+"/{aln_method}-{reads}/prep", load=LOAD_PORTCULLIS @@ -278,7 +285,7 @@ rule portcullis_prep: message: "Using portcullis to prepare: {input}" run: strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" - shell("{params.load} && portcullis prep -o {params.outdir} -l --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 
2>&1") + shell("{params.load} && portcullis prep -o {params.outdir} --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 2>&1") rule portcullis_junc: @@ -294,11 +301,12 @@ rule portcullis_junc: message: "Using portcullis to analyse potential junctions: {input}" run: strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" - shell("{params.load} && {RUN_TIME} portcullis junc -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") + shell("{params.load} && {RUN_TIME} portcullis junc -o {params.outdir}/{wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") rule portcullis_filter: - input: rules.portcullis_junc.output + input: tab=rules.portcullis_junc.output, + ref=rules.portcullis_prep.output.fa output: link=JUNC_DIR+"/output/{aln_method}-{reads}-portcullis.bed", unfilt_link=JUNC_DIR+"/output/{aln_method}-{reads}-all.bed", @@ -311,7 +319,7 @@ rule portcullis_filter: log: PORTCULLIS_DIR+"/{aln_method}-{reads}-filter.log" threads: int(THREADS) message: "Using portcullis to filter invalid junctions: {input}" - shell: "{params.load} && portcullis filter -t {threads} -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} {input} > {log} 2>&1 && ln -sf {params.bed} {output.link} && touch -h {output.link} && ln -sf {params.unfilt_bed} {output.unfilt_link} && touch -h {output.unfilt_link}" + shell: "{params.load} && portcullis filter -t {threads} -o {params.outdir}/{wildcards.aln_method}-{wildcards.reads} {input.ref} {input.tab} > {log} 2>&1 && ln -sf {params.bed} {output.link} && touch -h {output.link} && ln -sf {params.unfilt_bed} {output.unfilt_link} && touch -h {output.unfilt_link}" rule portcullis_bamfilt: @@ -327,6 +335,43 @@ rule portcullis_bamfilt: message: "Using portcullis to filter alignments containing invalid junctions: {input.tab}" shell: "{params.load} && portcullis bamfilt 
--output={output.bam} {input.tab} {input.bam} > {log} 2>&1" +''' +rule portcullis_prep2: + input: + ref=REF, + bam=rules.bam_sort.output, + idx=rules.bam_index.output + output: PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep/portcullis.sorted.alignments.bam.bai" + params: + outdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep", + load=LOAD_PORTCULLIS2 + log: PORTCULLIS_DIR2+"/{aln_method}-{reads}-prep.log" + threads: int(THREADS) + message: "Using portcullis to prepare: {input}" + run: + strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" + shell("{params.load} && portcullis prep -o {params.outdir} --strandedness={strand} -t {threads} {input.ref} {input.bam} > {log} 2>&1") + + +rule portcullis_junc2: + input: + bai=rules.portcullis_prep2.output + output: PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc/{aln_method}-{reads}.junctions.tab" + params: + prepdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/prep", + outdir=PORTCULLIS_DIR2+"/{aln_method}-{reads}/junc", + load=LOAD_PORTCULLIS2 + log: PORTCULLIS_DIR2+"/{aln_method}-{reads}-junc.log" + threads: int(THREADS) + message: "Using portcullis to analyse potential junctions: {input}" + run: + strand = PORTCULLIS_STRAND if wildcards.reads.startswith("real") else "unstranded" + shell("{params.load} && {RUN_TIME} portcullis junc -o {params.outdir} -p {wildcards.aln_method}-{wildcards.reads} --strandedness={strand} -t {threads} {params.prepdir} > {log} 2>&1") +''' + + + + rule spanki: input: bam=rules.bam_sort.output, @@ -408,7 +453,7 @@ rule truesight: log: JUNC_DIR_FULL+"/truesight/{reads}-truesight.log" threads: int(THREADS) message: "Using Truesight to find junctions" - shell: "{params.load_fs} && cd {params.outdir} && {RUN_TIME} truesight_pair.pl -i {MIN_INTRON} -I {MAX_INTRON} -v 1 -r {params.index} -p {threads} -o . 
-f {params.r1} {params.r2} > {log} 2>&1 && cd {CWD}" + shell: "{params.load_fs} && cd {params.outdir} && {RUN_TIME} truesight_pair.pl -i {MIN_INTRON} -I {MAX_INTRON} -v 1 -r {params.index} -p {threads} -o . -f {params.r1} {params.r2} > {log} 2>&1 && rm GapAli.sam && cd {CWD}" rule truesight2bed: input: rules.truesight.output @@ -422,6 +467,43 @@ rule truesight2bed: message: "Creating bed file from truesight output: {input}" shell: "{params.load_portcullis} && truesight2bed.py {input} > {output.bed} && ln -sf {params.bed} {output.link} && touch -h {output.link}" +rule soapsplice_index: + input: fa=REF + output: JUNC_DIR + "/soapsplice/index/"+NAME+".index.bwt" + log: JUNC_DIR + "/soapsplice/index.log" + params: + load_ss=LOAD_SOAPSPLICE, + index=JUNC_DIR + "/soapsplice/index/"+NAME + threads: 1 + message: "Creating index for soapsplice" + shell: "{params.load_ss} && 2bwt-builder {input.fa} {params.index} > {log} 2>&1" + +rule soapsplice: + input: + r1=READS_DIR+"/{reads}.R1.fq", + r2=READS_DIR+"/{reads}.R2.fq", + idx=rules.soapsplice_index.output + output: JUNC_DIR+"/soapsplice/{reads}/ss-{reads}.junc" + params: + load_ss=LOAD_SOAPSPLICE, + index=JUNC_DIR + "/soapsplice/index/"+NAME+".index", + outdir=JUNC_DIR+"/soapsplice/{reads}" + log: JUNC_DIR+"/soapsplice/{reads}-soapsplice.log" + threads: int(THREADS) + message: "Using soapsplice to find junctions" + shell: "{params.load_ss} && {RUN_TIME} soapsplice -d {params.index} -1 {input.r1} -2 {input.r2} -I 450 -o {params.outdir}/ss-{wildcards.reads} -p {threads} -t {MAX_INTRON} -c 0 -f 2 -L {MAX_INTRON} -l {MIN_INTRON} > {log} 2>&1 && rm -f {params.outdir}/ss-{wildcards.reads}.sam" + +rule soapsplice2bed: + input: rules.soapsplice.output + output: + link=JUNC_DIR+"/output/soapsplice-{reads}-soapsplice.bed", + bed=JUNC_DIR+"/soapsplice/{reads}/soapsplice-{reads}-soapsplice.bed" + params: + load_portcullis=LOAD_PORTCULLIS, + bed="../soapsplice/{reads}/soapsplice-{reads}-soapsplice.bed" + threads: 1 + message: "Creating 
bed file from soapsplice output: {input}" + shell: "{params.load_portcullis} && soapsplice2bed.py {input} > {output.bed} && ln -sf {params.bed} {output.link} && touch -h {output.link}" ### @@ -489,13 +571,9 @@ rule gtf_2_bed: output: bed=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.bed" params: - load_gt=LOAD_GT, - load_cuff=LOAD_CUFFLINKS, load_p=LOAD_PORTCULLIS, - gff=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.gff3", - gffi=ASM_DIR+"/output/{asm_method}_{asm_mode}-{aln_method}-{reads}.introns.gff3" message: "Converting GTF to BED for: {input.gtf}" - shell: "{params.load_gt} && {params.load_cuff} && gffread -E {input.gtf} -o- > {params.gff} && gt gff3 -sort -tidy -addintrons -force -o {params.gffi} {params.gff} && {params.load_p} && gff2bed.py {params.gffi} > {output.bed} && rm {params.gff} {params.gffi}" + shell: "{params.load_p} && gtf2bed.py {input.gtf} > {output.bed}" rule gtf_stats: input: diff --git a/scripts/read_gen.snakefile b/scripts/read_gen.snakefile index d2ece61b..44d5fd50 100755 --- a/scripts/read_gen.snakefile +++ b/scripts/read_gen.snakefile @@ -35,6 +35,7 @@ LOAD_SAMTOOLS = config["load_samtools"] LOAD_CUFFLINKS = config["load_cufflinks"] LOAD_STRINGTIE = config["load_stringtie"] LOAD_PORTCULLIS = config["load_portcullis"] +LOAD_PORTCULLIS2 = config["load_portcullis2"] LOAD_PYTHON3 = config["load_python3"] INDEX_STAR_EXTRA = config["index_star_extra"] @@ -261,11 +262,11 @@ rule sim_var_portcullis_prep: output: PORT_DIR+"/prep/portcullis.sorted.alignments.bam.bai" params: outdir=PORT_DIR+"/prep", - load=LOAD_PORTCULLIS + load=LOAD_PORTCULLIS2 log: PORT_DIR+"/prep.log" threads: int(THREADS) message: "Using portcullis to prepare: {input}" - shell: "{params.load} && portcullis prep -o {params.outdir} -l --strandedness={PORTCULLIS_STRAND} -t {threads} {input.ref} {input.bam} > {log} 2>&1" + shell: "{params.load} && portcullis prep -o {params.outdir} --strandedness={PORTCULLIS_STRAND} -t {threads} {input.ref} 
{input.bam} > {log} 2>&1" rule sim_var_portcullis_junc: @@ -275,10 +276,10 @@ rule sim_var_portcullis_junc: params: prepdir=PORT_DIR+"/prep", outdir=PORT_DIR+"/junc", - load=LOAD_PORTCULLIS + load=LOAD_PORTCULLIS2 log: PORT_DIR+"/junc.log" threads: int(THREADS) message: "Using portcullis to analyse potential junctions: {input}" - shell: "{params.load} && portcullis junc -o {params.outdir} -p portcullis_sim_var -t {threads} {params.prepdir} > {log} 2>&1" + shell: "{params.load} && portcullis junc -o {params.outdir}/portcullis_sim_var -t {threads} {params.prepdir} > {log} 2>&1" diff --git a/scripts/snakey.py b/scripts/snakey.py index 29d71b07..716cb626 100644 --- a/scripts/snakey.py +++ b/scripts/snakey.py @@ -62,7 +62,7 @@ res_cmd = " -R rusage[mem={cluster.memory}]span[ptile={threads}] -n {threads} -q " + QUEUE + " -J {rule} -oo logs/" + now + "_{rule}_%j.out" elif SCHEDULER == "PBS": sub_cmd = "qsub" - res_cmd = " -lselect=1:mem={cluster.memory}MB:ncpus={threads} -q " + QUEUE + " -N {rule} -o logs/{rule}_%j.out -e logs/{rule}_%j.out" + res_cmd = " -lselect=1:mem={cluster.memory}MB:ncpus={threads} -q " + QUEUE + " -o logs/{rule}_%j.out -e logs/{rule}_%j.out" elif SCHEDULER == "SLURM": sub_cmd = "sbatch" cores = "-c {threads} " From 12c5711dbaaeb346b1c36d8cd3a8641a6a830cfb Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Tue, 3 May 2016 17:06:07 +0100 Subject: [PATCH 02/21] Moving some of the more general code from training (KFold and performance output) into the core library. Upping version to 0.16.1. 
--- configure.ac | 2 +- doc/source/conf.py | 4 +- lib/Makefile.am | 6 +- lib/include/portcullis/k_fold.hpp | 78 ++++++++++++++++ lib/include/portcullis/performance.hpp | 82 +++++++--------- lib/src/performance.cc | 124 +++++++++++++++++++++++++ src/train.cc | 67 ++----------- src/train.hpp | 55 +---------- 8 files changed, 255 insertions(+), 163 deletions(-) create mode 100644 lib/include/portcullis/k_fold.hpp create mode 100644 lib/src/performance.cc diff --git a/configure.ac b/configure.ac index 6996e2cd..cb6d580d 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf setup AC_PREREQ([2.68]) -AC_INIT([portcullis],[0.16.0],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) +AC_INIT([portcullis],[0.16.1],[daniel.mapleson@tgac.ac.uk],[portcullis],[http://www.tgac.ac.uk]) AC_CONFIG_SRCDIR([src/portcullis.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/doc/source/conf.py b/doc/source/conf.py index 419f9401..7d753103 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,9 +53,9 @@ # built documents. # # The short X.Y version. -version = '0.16.0' +version = '0.16.1' # The full version, including alpha/beta/rc tags. -release = '0.16.0' +release = '0.16.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/lib/Makefile.am b/lib/Makefile.am index 643163b7..7a9597f0 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -21,7 +21,8 @@ libportcullis_la_SOURCES = \ src/intron.cc \ src/junction.cc \ src/junction_system.cc \ - src/rule_parser.cc + src/rule_parser.cc \ + src/performance.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis PI = include/portcullis @@ -42,7 +43,8 @@ library_include_HEADERS = \ $(PI)/portcullis_fs.hpp \ $(PI)/seq_utils.hpp \ $(PI)/rule_parser.hpp \ - $(PI)/performance.hpp + $(PI)/performance.hpp \ + $(PI)/k_fold.hpp libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ diff --git a/lib/include/portcullis/k_fold.hpp b/lib/include/portcullis/k_fold.hpp new file mode 100644 index 00000000..30489e4d --- /dev/null +++ b/lib/include/portcullis/k_fold.hpp @@ -0,0 +1,78 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +#include +#include +using std::vector; +using std::random_shuffle; + +#include + +namespace portcullis { + +typedef boost::error_info KFoldErrorInfo; +struct KFoldException: virtual boost::exception, virtual std::exception { }; + +// Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ +template +class KFold { +public: + KFold(int k, In _beg, In _end) : + beg(_beg), end(_end), K(k) { + if (K <= 0) + BOOST_THROW_EXCEPTION(KFoldException() << KFoldErrorInfo(string( + "The supplied value of K is =") + lexical_cast(K) + + ". One cannot create " + lexical_cast(K) + "no of folds")); + + //create the vector of integers + int foldNo = 0; + for (In i = beg; i != end; i++) { + whichFoldToGo.push_back(++foldNo); + if (foldNo == K) + foldNo = 0; + } + if (!K) + BOOST_THROW_EXCEPTION(KFoldException() << KFoldErrorInfo(string( + "With this value of k (=") + lexical_cast(K) + + ")Equal division of the data is not possible")); + + random_shuffle(whichFoldToGo.begin(), whichFoldToGo.end()); + } + + template + void getFold(int foldNo, Out training, Out testing) { + int k = 0; + In i = beg; + while (i != end) { + if (whichFoldToGo[k++] == foldNo) { + *testing++ = *i++; + } else + *training++ = *i++; + } + } + +private: + In beg; + In end; + int K; //how many folds in this + vector whichFoldToGo; +}; + +} \ No newline at end of file diff --git a/lib/include/portcullis/performance.hpp b/lib/include/portcullis/performance.hpp index c219f1f7..f0c6cd20 100644 --- a/lib/include/portcullis/performance.hpp +++ b/lib/include/portcullis/performance.hpp @@ -17,8 +17,17 @@ #pragma once +#include #include +#include +#include +#include +#include +using std::ostream; using std::stringstream; +using std::shared_ptr; +using std::string; +using std::vector; #include #include @@ -207,6 +216,8 @@ class Performance { return getPrecision() + getNPV() - 
100.0; } + string toShortString() const; + string toLongString() const; static string shortHeader() { return "TP\tTN\tFP\tFN\tREC\tPRC\tF1"; @@ -222,54 +233,33 @@ class Performance { return out.str(); } + static void loadGenuine(path& genuineFile, vector& results); - string toShortString() const { - vector parts; - parts.push_back(std::to_string(tp)); - parts.push_back(std::to_string(tn)); - parts.push_back(std::to_string(fp)); - parts.push_back(std::to_string(fn)); - parts.push_back(to_2dp_string(getRecall())); - parts.push_back(to_2dp_string(getPrecision())); - parts.push_back(to_2dp_string(getF1Score())); - return boost::algorithm::join(parts, "\t"); - } - - string toLongString() const { - vector parts; - parts.push_back(std::to_string(tp)); - parts.push_back(std::to_string(tn)); - parts.push_back(std::to_string(fp)); - parts.push_back(std::to_string(fn)); - parts.push_back(to_2dp_string(getPrevalence())); - parts.push_back(to_2dp_string(getBias())); - parts.push_back(to_2dp_string(getSensitivity())); - parts.push_back(to_2dp_string(getSpecificity())); - parts.push_back(to_2dp_string(getPrecision())); - parts.push_back(to_2dp_string(getNPV())); - parts.push_back(to_2dp_string(getF1Score())); - parts.push_back(to_2dp_string(getAccuracy())); - parts.push_back(to_2dp_string(getInformedness())); - parts.push_back(to_2dp_string(getMarkedness())); - parts.push_back(to_2dp_string(getMCC())); - return boost::algorithm::join(parts, "\t"); - } - - - static void loadGenuine(path& genuineFile, vector& results) { - - // Load reference data - std::ifstream refs(genuineFile.string()); - string line; - uint32_t lineNb = 0; - while (std::getline(refs, line)) { - std::istringstream iss(line); - bool res; - iss >> res; - results.push_back(res); - } - refs.close(); +}; + +class PerformanceList { +public: + + void clear() { + this->scores.clear(); + } + + shared_ptr operator [](int i) const { + return this->scores[i]; + } + + void add(shared_ptr p) { + this->scores.push_back(p); } + 
+ void outputMeanPerformance(std::ostream& resout); + +protected: + + vector> scores; + + void outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout); + }; } \ No newline at end of file diff --git a/lib/src/performance.cc b/lib/src/performance.cc new file mode 100644 index 00000000..5ea2e764 --- /dev/null +++ b/lib/src/performance.cc @@ -0,0 +1,124 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +#include +using std::cout; +using std::endl; + +#include + + +string portcullis::Performance::toShortString() const { + vector parts; + parts.push_back(std::to_string(tp)); + parts.push_back(std::to_string(tn)); + parts.push_back(std::to_string(fp)); + parts.push_back(std::to_string(fn)); + parts.push_back(portcullis::Performance::to_2dp_string(getRecall())); + parts.push_back(portcullis::Performance::to_2dp_string(getPrecision())); + parts.push_back(portcullis::Performance::to_2dp_string(getF1Score())); + return boost::algorithm::join(parts, "\t"); +} + +string portcullis::Performance::toLongString() const { + vector parts; + parts.push_back(std::to_string(tp)); + parts.push_back(std::to_string(tn)); + parts.push_back(std::to_string(fp)); + parts.push_back(std::to_string(fn)); + parts.push_back(portcullis::Performance::to_2dp_string(getPrevalence())); + parts.push_back(portcullis::Performance::to_2dp_string(getBias())); + parts.push_back(portcullis::Performance::to_2dp_string(getSensitivity())); + parts.push_back(portcullis::Performance::to_2dp_string(getSpecificity())); + parts.push_back(portcullis::Performance::to_2dp_string(getPrecision())); + parts.push_back(portcullis::Performance::to_2dp_string(getNPV())); + parts.push_back(portcullis::Performance::to_2dp_string(getF1Score())); + parts.push_back(portcullis::Performance::to_2dp_string(getAccuracy())); + parts.push_back(portcullis::Performance::to_2dp_string(getInformedness())); + parts.push_back(portcullis::Performance::to_2dp_string(getMarkedness())); + parts.push_back(portcullis::Performance::to_2dp_string(getMCC())); + return boost::algorithm::join(parts, "\t"); +} + + +void portcullis::Performance::loadGenuine(path& genuineFile, vector& results) { + + // Load reference data + std::ifstream refs(genuineFile.string()); + string line; + uint32_t lineNb = 0; + while (std::getline(refs, line)) { + std::istringstream iss(line); 
+ bool res; + iss >> res; + results.push_back(res); + } + refs.close(); +} + +void portcullis::PerformanceList::outputMeanPerformance(std::ostream& resout) { + + vector prevs; + vector biases; + vector recs; + vector prcs; + vector spcs; + vector f1s; + vector infs; + vector mrks; + vector accs; + vector mccs; + + for(auto& p : this->scores) { + prevs.push_back(p->getPrevalence()); + biases.push_back(p->getBias()); + recs.push_back(p->getRecall()); + prcs.push_back(p->getPrecision()); + spcs.push_back(p->getSpecificity()); + f1s.push_back(p->getF1Score()); + infs.push_back(p->getInformedness()); + mrks.push_back(p->getMarkedness()); + accs.push_back(p->getAccuracy()); + mccs.push_back(p->getMCC()); + } + + outputMeanScore(prevs, "prevalence", resout); + outputMeanScore(biases, "bias", resout); + outputMeanScore(recs, "recall", resout); + outputMeanScore(prcs, "precision", resout); + outputMeanScore(f1s, "F1", resout); + outputMeanScore(spcs, "specificity", resout); + outputMeanScore(accs, "accuracy", resout); + outputMeanScore(infs, "informedness", resout); + outputMeanScore(mrks, "markededness", resout); + outputMeanScore(mccs, "MCC", resout); +} + +void portcullis::PerformanceList::outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout) { + double sum = std::accumulate(scores.begin(), scores.end(), 0.0); + double mean = sum / scores.size(); + double sq_sum = std::inner_product(scores.begin(), scores.end(), scores.begin(), 0.0); + double stdev = std::sqrt(sq_sum / scores.size() - mean * mean); + + stringstream msg; + msg << "Mean " << std::left << std::setw(13) << score_type << ": " << std::fixed << std::setprecision(2) << mean << "% (+/- " << stdev << "%)" << endl; + + cout << msg.str(); + resout << msg.str(); +} + diff --git a/src/train.cc b/src/train.cc index 689fa1c5..0f4b500d 100644 --- a/src/train.cc +++ b/src/train.cc @@ -26,7 +26,7 @@ using std::vector; using std::cout; using std::cerr; using std::endl; -using std::unique_ptr; 
+using std::shared_ptr; using std::make_shared; #include @@ -48,15 +48,17 @@ namespace po = boost::program_options; #include #include +#include using portcullis::JunctionSystem; using portcullis::JunctionList; using portcullis::Performance; +using portcullis::PerformanceList; +using portcullis::KFold; #include "train.hpp" -using portcullis::KFold; using portcullis::Train; -portcullis::Train::Train(const path& _junctionFile, const path& _refFile){ +portcullis::Train::Train(const path& _junctionFile, const path& _refFile) { junctionFile = _junctionFile; refFile = _refFile; outputPrefix = ""; @@ -170,7 +172,7 @@ void portcullis::Train::train() { KFold kf(folds, junctions.begin(), junctions.end()); JunctionList test, train; - vector> perfs; + PerformanceList perfs; cout << endl << "Starting " << folds << "-fold cross validation" << endl; @@ -213,12 +215,12 @@ void portcullis::Train::train() { } } - unique_ptr p( new Performance(tp, tn, fp, fn) ); + shared_ptr p = make_shared(tp, tn, fp, fn); cout << p->toLongString() << endl; resout << p->toLongString() << endl; - perfs.push_back(std::move(p)); + perfs.add(p); // Clear the train and test vectors in preparation for the next step train.clear(); @@ -227,7 +229,7 @@ void portcullis::Train::train() { cout << "Cross validation completed" << endl << endl; - outputMeanPerformance(perfs, resout); + perfs.outputMeanPerformance(resout); resout.close(); cout << endl << "Saved cross validation results to file " << outputPrefix.string() << ".cv_results" << endl; @@ -237,57 +239,6 @@ void portcullis::Train::train() { } -void portcullis::Train::outputMeanPerformance(const vector>& scores, std::ofstream& resout) { - - vector prevs; - vector biases; - vector recs; - vector prcs; - vector spcs; - vector f1s; - vector infs; - vector mrks; - vector accs; - vector mccs; - - for(auto& p : scores) { - prevs.push_back(p->getPrevalence()); - biases.push_back(p->getBias()); - recs.push_back(p->getRecall()); - prcs.push_back(p->getPrecision()); - 
spcs.push_back(p->getSpecificity()); - f1s.push_back(p->getF1Score()); - infs.push_back(p->getInformedness()); - mrks.push_back(p->getMarkedness()); - accs.push_back(p->getAccuracy()); - mccs.push_back(p->getMCC()); - } - - outputMeanScore(prevs, "prevalence", resout); - outputMeanScore(biases, "bias", resout); - outputMeanScore(recs, "recall", resout); - outputMeanScore(prcs, "precision", resout); - outputMeanScore(f1s, "F1", resout); - outputMeanScore(spcs, "specificity", resout); - outputMeanScore(accs, "accuracy", resout); - outputMeanScore(infs, "informedness", resout); - outputMeanScore(mrks, "markededness", resout); - outputMeanScore(mccs, "MCC", resout); -} - -void portcullis::Train::outputMeanScore(const vector& scores, const string& score_type, std::ofstream& resout) { - double sum = std::accumulate(scores.begin(), scores.end(), 0.0); - double mean = sum / scores.size(); - double sq_sum = std::inner_product(scores.begin(), scores.end(), scores.begin(), 0.0); - double stdev = std::sqrt(sq_sum / scores.size() - mean * mean); - - stringstream msg; - msg << "Mean " << std::left << std::setw(13) << score_type << ": " << std::fixed << std::setprecision(2) << mean << "% (+/- " << stdev << "%)" << endl; - - cout << msg.str(); - resout << msg.str(); -} - void portcullis::Train::getRandomSubset(const JunctionList& in, JunctionList& out) { // Create a list of randomly shuffled indices with maximum of "in". 
diff --git a/src/train.hpp b/src/train.hpp index 40fe2b34..95f2d63b 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -36,9 +36,7 @@ using boost::filesystem::path; #include #include -#include #include -using portcullis::bam::GenomeMapper; using portcullis::ModelFeatures; namespace portcullis { @@ -54,53 +52,6 @@ const double DEFAULT_TRAIN_FRACTION = 1.0; const int DEFAULT_SEED = 1234567; // To avoid non-deterministic behaviour - -// Derived from https://sureshamrita.wordpress.com/2011/08/24/c-implementation-of-k-fold-cross-validation/ -template -class KFold { -public: - KFold(int k, In _beg, In _end) : - beg(_beg), end(_end), K(k) { - if (K <= 0) - BOOST_THROW_EXCEPTION(TrainException() << TrainErrorInfo(string( - "The supplied value of K is =") + lexical_cast(K) + - ". One cannot create " + lexical_cast(K) + "no of folds")); - - //create the vector of integers - int foldNo = 0; - for (In i = beg; i != end; i++) { - whichFoldToGo.push_back(++foldNo); - if (foldNo == K) - foldNo = 0; - } - if (!K) - BOOST_THROW_EXCEPTION(TrainException() << TrainErrorInfo(string( - "With this value of k (=") + lexical_cast(K) + - ")Equal division of the data is not possible")); - - random_shuffle(whichFoldToGo.begin(), whichFoldToGo.end()); - } - - template - void getFold(int foldNo, Out training, Out testing) { - int k = 0; - In i = beg; - while (i != end) { - if (whichFoldToGo[k++] == foldNo) { - *testing++ = *i++; - } else - *training++ = *i++; - } - } - -private: - In beg; - In end; - int K; //how many folds in this - vector whichFoldToGo; -}; - - class Train { @@ -225,11 +176,7 @@ class Train { void testInstance(shared_ptr f, const JunctionList& y); - void getRandomSubset(const JunctionList& in, JunctionList& out); - - void outputMeanPerformance(const vector>& scores, std::ofstream& resout); - - void outputMeanScore(const vector& scores, const string& score_type, std::ofstream& resout); + void getRandomSubset(const JunctionList& in, JunctionList& out); }; } From 
b7d6d575e8563b4b36acacfd1dbc35ad1ae272cb Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 3 May 2016 22:15:17 +0100 Subject: [PATCH 03/21] Added cross validation to selftraining process. --- src/junction_filter.cc | 126 +++++++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 31 deletions(-) diff --git a/src/junction_filter.cc b/src/junction_filter.cc index b3e78942..dbee7783 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -33,6 +33,7 @@ using std::map; using std::unordered_map; using std::unordered_set; using std::vector; +using std::to_string; using std::cout; using std::cerr; @@ -59,12 +60,14 @@ namespace po = boost::program_options; #include #include #include +#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; using portcullis::Performance; using portcullis::eval; using portcullis::bam::GenomeMapper; +using portcullis::KFold; #include "junction_filter.hpp" @@ -267,40 +270,11 @@ void portcullis::JunctionFilter::filter() { cout << "Training Random Forest" << endl << "----------------------" << endl << endl; bool done = false; - shared_ptr forest = nullptr; - while(!done) { - - forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); - const vector importance = forest->getVariableImportance(); - mf.resetActiveFeatureIndex(); - uint16_t min_importance_idx = 10000; - double min_importance_val = 100000.0; - uint16_t min_feature_idx = 10000; - // This approach to feature selection (i.e. removing worst features by importance) - // doesn't appear to have much impact on final results and is computationally - // expensive. Removing for now. 
- /*for(size_t i = 0; i < importance.size(); i++) { - double imp = importance[i]; - int16_t j = mf.getNextActiveFeatureIndex(); - if (imp < 0.1 && imp < min_importance_val) { - min_importance_idx = i; - min_importance_val = imp; - min_feature_idx = j; - } - }*/ - if (min_importance_idx != 10000) { - mf.features[min_feature_idx].active = false; - cout << "Deactivating feature: " << mf.features[min_feature_idx].name << " - " << min_importance_val << endl; - } - else { - done = true; - } - } + shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); const vector importance = forest->getVariableImportance(); - bool foundIrrelevant = false; mf.resetActiveFeatureIndex(); - cout << "Active features remaining:" << endl; + cout << "Feature importance:" << endl; for(auto& i : importance) { int16_t j = mf.getNextActiveFeatureIndex(); cout << mf.features[j].name << " - " << i << endl; @@ -311,6 +285,96 @@ void portcullis::JunctionFilter::filter() { modelFile = output.string() + ".selftrain.forest"; cout << endl; + + // Determine optimal threshold using 5-fold CV + const uint16_t FOLDS = 5; + KFold kf(FOLDS, trainingSystem.getJunctions().begin(), trainingSystem.getJunctions().end()); + + JunctionList test, train; + PerformanceList perfs; + + cout << endl << "Starting " << FOLDS << "-fold cross validation" << endl; + + std::ofstream resout(this->output.string() + ".cv_results"); + + cout << "Fold\t" << Performance::longHeader() << endl; + resout << "Fold\t" << Performance::longHeader() << endl; + + for (uint16_t i = 1; i <= 5; i++) { + + cout << i << "\t"; + resout << i << "\t"; + cout.flush(); + resout.flush(); + + // Populate train and test for this step + kf.getFold(i, back_inserter(train), back_inserter(test)); + + // Train on this particular set + ForestPtr f = ModelFeatures().trainInstance(train, output.string() + ".selftrain.cv-" + to_string(i), DEFAULT_SELFTRAIN_TREES, threads, 
true, false); + + // Convert testing set junctions into feature vector + Data* testingData = mf.juncs2FeatureVectors(test); + + f->setPredictionMode(true); + f->setData(testingData); + f->run(false); + + delete testingData; + + vector thresholds; + for(double i = 0.0; i <= 1.0; i += 0.01) { + thresholds.push_back(i); + } + double max_mcc = 0.0; + double max_f1 = 0.0; + double best_t_mcc = 0.0; + double best_t_f1 = 0.0; + + cout << "Threshold\t" << Performance::longHeader() << endl; + for(auto& t : thresholds) { + JunctionList pjl; + JunctionList fjl; + categorise(f, test, pjl, fjl, t); + shared_ptr perf = calcPerformance(pjl, fjl); + double mcc = perf->getMCC(); + double f1 = perf->getF1Score(); + //cout << t << "\t" << perf->toLongString() << endl; + if (mcc > max_mcc) { + max_mcc = mcc; + best_t_mcc = t; + } + if (f1 > max_f1) { + max_f1 = f1; + best_t_f1 = t; + } + } + + //cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; + //cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; + //cout << "Actual threshold set at " << threshold << endl; + JunctionList pjl; + JunctionList fjl; + categorise(f, test, pjl, fjl, best_t_mcc); + shared_ptr perf = calcPerformance(pjl, fjl); + cout << perf->toLongString() << endl; + resout << perf->toLongString() << endl; + + perfs.add(perf); + + // Clear the train and test vectors in preparation for the next step + train.clear(); + test.clear(); + } + + cout << "Cross validation completed" << endl << endl; + + perfs.outputMeanPerformance(resout); + + cout << endl << "Saved cross validation results to file " << output.string() << ".cv_results" << endl; + + resout.close(); + } // Manage a junction system of all discarded junctions From b527245adf0d17b46334ae6c829c8c7fbb621cc3 Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 3 May 2016 23:53:27 +0100 Subject: [PATCH 04/21] Adding output information for cross validation so we can 
determine performance compared to optimum. --- lib/src/model_features.cc | 8 ++++---- src/junction_filter.cc | 27 +++++++++++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 902c7813..806ee6c1 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -249,11 +249,11 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { - cout << "Creating feature vector" << endl; + if (verbose) cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); path feature_file = outputPrefix + ".features"; - cout << "Saving feature vector to disk: " << feature_file << endl; + if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; ofstream fout(feature_file.c_str(), std::ofstream::out); @@ -264,7 +264,7 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis fout.close(); - cout << "Initialising random forest" << endl; + if (verbose) cout << "Initialising random forest" << endl; ForestPtr f = nullptr; if (probabilityMode) { f = make_shared(); @@ -295,7 +295,7 @@ portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionLis false, // predall 1.0); // Sample fraction - cout << "Training" << endl; + if (verbose) cout << "Training" << endl; f->setVerboseOut(&cerr); f->run(verbose); diff --git a/src/junction_filter.cc b/src/junction_filter.cc index dbee7783..757f86ad 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -297,9 +297,9 @@ void portcullis::JunctionFilter::filter() { std::ofstream resout(this->output.string() + ".cv_results"); - cout << "Fold\t" << Performance::longHeader() << endl; - resout << "Fold\t" << Performance::longHeader() << endl; - + cout << 
"Fold\tT\t" << Performance::longHeader() << endl; + resout << "Fold\tT\t" << Performance::longHeader() << endl; + vector best_thresholds; for (uint16_t i = 1; i <= 5; i++) { cout << i << "\t"; @@ -311,7 +311,7 @@ void portcullis::JunctionFilter::filter() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - ForestPtr f = ModelFeatures().trainInstance(train, output.string() + ".selftrain.cv-" + to_string(i), DEFAULT_SELFTRAIN_TREES, threads, true, false); + ForestPtr f = mf.trainInstance(train, output.string() + ".selftrain.cv-" + to_string(i), DEFAULT_SELFTRAIN_TREES, threads, true, false); // Convert testing set junctions into feature vector Data* testingData = mf.juncs2FeatureVectors(test); @@ -331,7 +331,6 @@ void portcullis::JunctionFilter::filter() { double best_t_mcc = 0.0; double best_t_f1 = 0.0; - cout << "Threshold\t" << Performance::longHeader() << endl; for(auto& t : thresholds) { JunctionList pjl; JunctionList fjl; @@ -356,9 +355,10 @@ void portcullis::JunctionFilter::filter() { JunctionList pjl; JunctionList fjl; categorise(f, test, pjl, fjl, best_t_mcc); + best_thresholds.push_back(best_t_mcc); shared_ptr perf = calcPerformance(pjl, fjl); - cout << perf->toLongString() << endl; - resout << perf->toLongString() << endl; + cout << best_t_mcc << "\t" << perf->toLongString() << endl; + resout << best_t_mcc << "\t" << perf->toLongString() << endl; perfs.add(perf); @@ -371,6 +371,17 @@ void portcullis::JunctionFilter::filter() { perfs.outputMeanPerformance(resout); + double b_t = 0.0; + for(double t : best_thresholds) { + b_t += t; + } + + b_t /= best_thresholds.size(); + + cout << "Optimal threshold: " << b_t << endl << endl; + + threshold = b_t; + cout << endl << "Saved cross validation results to file " << output.string() << ".cv_results" << endl; resout.close(); @@ -898,7 +909,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cout << "The best F1 score of " << max_f1 << " is 
achieved with threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; cout << "Actual threshold set at " << threshold << endl; - categorise(f, all, pass, fail, best_t_f1); + categorise(f, all, pass, fail, threshold); } else { categorise(f, all, pass, fail, threshold); From b0d96c7f2ffebe1643fc85ab1cb17ed89b550a00 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 4 May 2016 17:31:41 +0100 Subject: [PATCH 05/21] WIP. Trying to implement a proper Semi-supervised random forest implementation. --- deps/ranger-0.3.8/include/ranger/Forest.h | 362 +++++++++--------- deps/ranger-0.3.8/src/Forest.cpp | 5 +- lib/Makefile.am | 15 +- lib/include/portcullis/{ => ml}/k_fold.hpp | 2 + .../portcullis/{ => ml}/markov_model.hpp | 3 +- .../portcullis/{ => ml}/model_features.hpp | 2 + .../portcullis/{ => ml}/performance.hpp | 4 +- lib/include/portcullis/ml/ss_forest.hpp | 66 ++++ lib/src/markov_model.cc | 8 +- lib/src/model_features.cc | 12 +- lib/src/performance.cc | 10 +- lib/src/ss_forest.cc | 122 ++++++ 12 files changed, 415 insertions(+), 196 deletions(-) rename lib/include/portcullis/{ => ml}/k_fold.hpp (99%) rename lib/include/portcullis/{ => ml}/markov_model.hpp (99%) rename lib/include/portcullis/{ => ml}/model_features.hpp (99%) rename lib/include/portcullis/{ => ml}/performance.hpp (99%) create mode 100644 lib/include/portcullis/ml/ss_forest.hpp create mode 100644 lib/src/ss_forest.cc diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index c9351c8e..278b9fd4 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -46,105 +46,122 @@ class Forest { public: - Forest(); - virtual ~Forest(); - - // Init from c++ main or Rcpp from R - void initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, - std::string output_prefix, uint 
num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, - std::string split_select_weights_file, std::vector& always_split_variable_names, - std::string status_variable_name, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::string case_weights_file, bool predict_all, double sample_fraction); - void initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, - std::vector>& split_select_weights, std::vector& always_split_variable_names, - std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction); - void init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, - std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool predict_all, double sample_fraction); - virtual void initInternal(std::string status_variable_name) = 0; - - // Grow or predict - void run(bool verbose); - - // Write results to output files - void writeOutput(std::ostream* out); - virtual void writeOutputInternal() = 0; - virtual void writeConfusionFile() = 0; - virtual void writePredictionFile() = 0; - void writeImportanceFile(); - - // Save forest to file - void saveToFile(); - virtual void saveToFileInternal(std::ofstream& outfile) = 0; - - std::vector>>getChildNodeIDs() { - std::vector>> result; - 
for (auto& tree : trees) { - result.push_back(tree->getChildNodeIDs()); + Forest(); + virtual ~Forest(); + + // Init from c++ main or Rcpp from R + void initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, + std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string split_select_weights_file, std::vector& always_split_variable_names, + std::string status_variable_name, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::string case_weights_file, bool predict_all, double sample_fraction); + void initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, + std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + std::vector>&split_select_weights, std::vector& always_split_variable_names, + std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction); + void init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, + std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + bool predict_all, double sample_fraction); + virtual void initInternal(std::string status_variable_name) = 0; + + // Grow or predict + void run(bool verbose); + + // Write results to output files + void writeOutput(std::ostream* out); + virtual void writeOutputInternal() = 
0; + virtual void writeConfusionFile() = 0; + virtual void writePredictionFile() = 0; + void writeImportanceFile(); + + // Save forest to file + void saveToFile(); + void saveToFile(std::string filename); + virtual void saveToFileInternal(std::ofstream& outfile) = 0; + + std::vector>>getChildNodeIDs() { + std::vector>> result; + for (auto& tree : trees) { + result.push_back(tree->getChildNodeIDs()); + } + return result; } - return result; - } - std::vector> getSplitVarIDs() { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getSplitVarIDs()); + + std::vector> getSplitVarIDs() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getSplitVarIDs()); + } + return result; + } + + std::vector> getSplitValues() { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getSplitValues()); + } + return result; + } + + const std::vector& getVariableImportance() const { + return variable_importance; + } + + double getOverallPredictionError() const { + return overall_prediction_error; + } + + const std::vector >& getPredictions() const { + return predictions; + } + + const double getPrediction(int i) const { + double sum = 0.0; + for(size_t j = 0; j < predictions[i].size(); j++) { + sum += predictions[i][j]; + } + + return sum / predictions[i].size(); + } + + size_t getDependentVarId() const { + return dependent_varID; + } + + size_t getNumTrees() const { + return num_trees; } - return result; - } - std::vector> getSplitValues() { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getSplitValues()); + + uint getMtry() const { + return mtry; + } + + uint getMinNodeSize() const { + return min_node_size; + } + + size_t getNumIndependentVariables() const { + return num_independent_variables; + } + + const std::vector& getIsOrderedVariable() const { + return is_ordered_variable; } - return result; - } - const std::vector& getVariableImportance() const { - return variable_importance; 
- } - double getOverallPredictionError() const { - return overall_prediction_error; - } - const std::vector >& getPredictions() const { - return predictions; - } - size_t getDependentVarId() const { - return dependent_varID; - } - size_t getNumTrees() const { - return num_trees; - } - uint getMtry() const - { - return mtry; - } - uint getMinNodeSize() const - { - return min_node_size; - } - size_t getNumIndependentVariables() const - { - return num_independent_variables; - } - const std::vector& getIsOrderedVariable() const - { - return is_ordered_variable; - } - - std::vector> getInbagCounts() const { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getInbagCounts()); + + std::vector> getInbagCounts() const { + std::vector> result; + for (auto& tree : trees) { + result.push_back(tree->getInbagCounts()); + } + return result; } - return result; - } - + void setPredictionMode(bool prediction_mode) { this->prediction_mode = prediction_mode; } @@ -154,113 +171,114 @@ class Forest { // Assume we want to reset predictions too this->predictions.clear(); } - + void setVerboseOut(std::ostream* verbose_out) { this->verbose_out = verbose_out; } -void loadFromFile(std::string filename); - - + void loadFromFile(std::string filename); + + void grow(bool verbose); + void predict(); + + protected: - void grow(bool verbose); - virtual void growInternal() = 0; + virtual void growInternal() = 0; - // Predict using existing tree from file and data as prediction data - void predict(); - virtual void predictInternal() = 0; + // Predict using existing tree from file and data as prediction data + virtual void predictInternal() = 0; - void computePredictionError(); - virtual void computePredictionErrorInternal() = 0; + void computePredictionError(); + virtual void computePredictionErrorInternal() = 0; - void computePermutationImportance(); + void computePermutationImportance(); - // Multithreading methods for growing/prediction/importance, called by each thread 
- void growTreesInThread(uint thread_idx, std::vector* variable_importance); - void predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction); - void computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, std::vector* variance); + // Multithreading methods for growing/prediction/importance, called by each thread + void growTreesInThread(uint thread_idx, std::vector* variable_importance); + void predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction); + void computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, std::vector* variance); - // Load forest from file - virtual void loadFromFileInternal(std::ifstream& infile) = 0; + // Load forest from file + virtual void loadFromFileInternal(std::ifstream& infile) = 0; - // Set split select weights and variables to be always considered for splitting - void setSplitWeightVector(std::vector>& split_select_weights); - void setAlwaysSplitVariables(std::vector& always_split_variable_names); + // Set split select weights and variables to be always considered for splitting + void setSplitWeightVector(std::vector>&split_select_weights); + void setAlwaysSplitVariables(std::vector& always_split_variable_names); - // Show progress every few seconds + // Show progress every few seconds #ifdef WIN_R_BUILD - void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); + void showProgress(std::string operation, clock_t start_time, clock_t& lap_time); #else - void showProgress(std::string operation); + void showProgress(std::string operation); #endif - // Verbose output stream, cout if verbose==true, logfile if not - std::ostream* verbose_out; - - size_t num_trees; - uint mtry; - uint min_node_size; - size_t num_variables; - size_t num_independent_variables; - uint seed; - size_t dependent_varID; - size_t num_samples; - bool prediction_mode; - MemoryMode memory_mode; - bool sample_with_replacement; - 
bool memory_saving_splitting; - SplitRule splitrule; - bool predict_all; - bool keep_inbag; - double sample_fraction; - - // For each varID true if ordered - std::vector is_ordered_variable; - - // Variable to not split at (only dependent_varID for non-survival forests) - std::vector no_split_variables; - - // Multithreading - uint num_threads; - std::vector thread_ranges; + // Verbose output stream, cout if verbose==true, logfile if not + std::ostream* verbose_out; + + size_t num_trees; + uint mtry; + uint min_node_size; + size_t num_variables; + size_t num_independent_variables; + uint seed; + size_t dependent_varID; + size_t num_samples; + bool prediction_mode; + MemoryMode memory_mode; + bool sample_with_replacement; + bool memory_saving_splitting; + SplitRule splitrule; + bool predict_all; + bool keep_inbag; + double sample_fraction; + + // For each varID true if ordered + std::vector is_ordered_variable; + + // Variable to not split at (only dependent_varID for non-survival forests) + std::vector no_split_variables; + + // Multithreading + uint num_threads; + std::vector thread_ranges; #ifndef WIN_R_BUILD - std::mutex mutex; - std::condition_variable condition_variable; + std::mutex mutex; + std::condition_variable condition_variable; #endif - std::vector trees; - Data* data; + std::vector trees; + Data* data; - std::vector> predictions; - double overall_prediction_error; + std::vector> predictions; + double overall_prediction_error; - // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable - // Deterministic variables are always selected - std::vector deterministic_varIDs; - std::vector split_select_varIDs; - std::vector> split_select_weights; + // Weight vector for selecting possible split variables, one weight between 0 (never select) and 1 (always select) for each variable + // Deterministic variables are always selected + std::vector deterministic_varIDs; + std::vector 
split_select_varIDs; + std::vector> split_select_weights; - // Bootstrap weights - std::vector case_weights; + // Bootstrap weights + std::vector case_weights; - // Random number generator - std::mt19937_64 random_number_generator; + // Random number generator + std::mt19937_64 random_number_generator; - std::string output_prefix; - ImportanceMode importance_mode; + std::string output_prefix; + ImportanceMode importance_mode; - // Variable importance for all variables in forest - std::vector variable_importance; + // Variable importance for all variables in forest + std::vector variable_importance; - // Computation progress (finished trees) - size_t progress; + // Computation progress (finished trees) + size_t progress; #ifdef R_BUILD - size_t aborted_threads; - bool aborted; + size_t aborted_threads; + bool aborted; #endif private: - DISALLOW_COPY_AND_ASSIGN(Forest); + DISALLOW_COPY_AND_ASSIGN(Forest); }; #endif /* FOREST_H_ */ diff --git a/deps/ranger-0.3.8/src/Forest.cpp b/deps/ranger-0.3.8/src/Forest.cpp index f58558f3..a17eb4c2 100644 --- a/deps/ranger-0.3.8/src/Forest.cpp +++ b/deps/ranger-0.3.8/src/Forest.cpp @@ -349,9 +349,12 @@ void Forest::writeImportanceFile() { } void Forest::saveToFile() { + saveToFile(output_prefix + ".forest"); +} + +void Forest::saveToFile(std::string filename) { // Open file for writing - std::string filename = output_prefix + ".forest"; std::ofstream outfile; outfile.open(filename, std::ios::binary); if (!outfile.good()) { diff --git a/lib/Makefile.am b/lib/Makefile.am index 7a9597f0..1172ae1c 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -22,7 +22,8 @@ libportcullis_la_SOURCES = \ src/junction.cc \ src/junction_system.cc \ src/rule_parser.cc \ - src/performance.cc + src/performance.cc \ + src/ss_forest.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis PI = include/portcullis @@ -33,18 +34,20 @@ library_include_HEADERS = \ $(PI)/bam/bam_writer.hpp \ $(PI)/bam/depth_parser.hpp \ 
$(PI)/bam/genome_mapper.hpp \ + $(PI)/ml/markov_model.hpp \ + $(PI)/ml/model_features.hpp \ + $(PI)/ml/performance.hpp \ + $(PI)/ml/k_fold.hpp \ + $(PI)/ml/ss_forest.hpp \ $(PI)/kmer.hpp \ - $(PI)/markov_model.hpp \ - $(PI)/model_features.hpp \ $(PI)/python_exe.hpp \ $(PI)/intron.hpp \ $(PI)/junction.hpp \ $(PI)/junction_system.hpp \ $(PI)/portcullis_fs.hpp \ $(PI)/seq_utils.hpp \ - $(PI)/rule_parser.hpp \ - $(PI)/performance.hpp \ - $(PI)/k_fold.hpp + $(PI)/rule_parser.hpp + libportcullis_la_CPPFLAGS = \ -isystem $(top_srcdir)/deps/htslib-1.3 \ diff --git a/lib/include/portcullis/k_fold.hpp b/lib/include/portcullis/ml/k_fold.hpp similarity index 99% rename from lib/include/portcullis/k_fold.hpp rename to lib/include/portcullis/ml/k_fold.hpp index 30489e4d..92d2597b 100644 --- a/lib/include/portcullis/k_fold.hpp +++ b/lib/include/portcullis/ml/k_fold.hpp @@ -26,6 +26,7 @@ using std::random_shuffle; #include namespace portcullis { +namespace ml { typedef boost::error_info KFoldErrorInfo; struct KFoldException: virtual boost::exception, virtual std::exception { }; @@ -75,4 +76,5 @@ class KFold { vector whichFoldToGo; }; +} } \ No newline at end of file diff --git a/lib/include/portcullis/markov_model.hpp b/lib/include/portcullis/ml/markov_model.hpp similarity index 99% rename from lib/include/portcullis/markov_model.hpp rename to lib/include/portcullis/ml/markov_model.hpp index 613fb177..7471b6c8 100644 --- a/lib/include/portcullis/markov_model.hpp +++ b/lib/include/portcullis/ml/markov_model.hpp @@ -26,6 +26,7 @@ using std::vector; namespace portcullis { +namespace ml { typedef boost::error_info MMErrorInfo; struct MMException: virtual boost::exception, virtual std::exception {}; @@ -104,6 +105,6 @@ class PosMarkovModel : public portcullis::MarkovModel { } }; - +} } diff --git a/lib/include/portcullis/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp similarity index 99% rename from lib/include/portcullis/model_features.hpp rename to 
lib/include/portcullis/ml/model_features.hpp index 1820cc01..64151166 100644 --- a/lib/include/portcullis/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -37,6 +37,7 @@ using portcullis::JunctionList; using portcullis::SplicingScores; namespace portcullis { +namespace ml { typedef shared_ptr ForestPtr; @@ -131,4 +132,5 @@ class ModelFeatures { } }; +} } \ No newline at end of file diff --git a/lib/include/portcullis/performance.hpp b/lib/include/portcullis/ml/performance.hpp similarity index 99% rename from lib/include/portcullis/performance.hpp rename to lib/include/portcullis/ml/performance.hpp index f0c6cd20..53ea35e5 100644 --- a/lib/include/portcullis/performance.hpp +++ b/lib/include/portcullis/ml/performance.hpp @@ -34,7 +34,8 @@ using std::vector; using boost::filesystem::path; namespace portcullis { - +namespace ml { + class Performance { protected: uint32_t tp; @@ -262,4 +263,5 @@ class PerformanceList { }; +} } \ No newline at end of file diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp new file mode 100644 index 00000000..07e1783a --- /dev/null +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -0,0 +1,66 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#pragma once + +#include +using std::shared_ptr; + +#include +using boost::filesystem::path; + +#include +#include + +#include +using portcullis::ml::ModelFeatures; + +#include +using portcullis::bam::GenomeMapper; +using portcullis::MarkovModel; +using portcullis::Junction; +using portcullis::JunctionPtr; +using portcullis::JunctionList; +using portcullis::SplicingScores; + +namespace portcullis { +namespace ml { + +const uint16_t REPEAT_LIMIT = 3; + +class SemiSupervisedForest { +private: + uint16_t trees; + Data* labelled; + Data* unlablled; + ModelFeatures mf; + ForestPtr forest; + +public: + SemiSupervisedForest(const JunctionList& labelled, const JunctionList& unlabelled, + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, + bool verbose); + + void train(); + + ForestPtr getForest(); + + static Data* juncs2FeatureVectors(const JunctionList& x); + +}; +} +} \ No newline at end of file diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 2e7828d7..5d112876 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -27,7 +27,7 @@ using portcullis::SeqUtils; #include -void portcullis::KmerMarkovModel::train(const vector& input, const uint32_t _order) { +void portcullis::ml::KmerMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; KMMU temp; for(auto& seq : input) { @@ -52,7 +52,7 @@ void portcullis::KmerMarkovModel::train(const vector& input, const uint3 } -double portcullis::KmerMarkovModel::getScore(const string& seq) { +double portcullis::ml::KmerMarkovModel::getScore(const string& seq) { string s = SeqUtils::makeClean(seq); double score = 1.0; for(uint16_t i = order; i < s.size(); i++){ @@ -64,7 +64,7 @@ double portcullis::KmerMarkovModel::getScore(const string& seq) { return log(score); } -void portcullis::PosMarkovModel::train(const vector& input, const uint32_t _order) { +void 
portcullis::ml::PosMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; PMMU temp; for(auto& seq : input) { @@ -87,7 +87,7 @@ void portcullis::PosMarkovModel::train(const vector& input, const uint32 } -double portcullis::PosMarkovModel::getScore(const string& seq) { +double portcullis::ml::PosMarkovModel::getScore(const string& seq) { string s = SeqUtils::makeClean(seq); double score = 1.0; for(uint16_t i = order; i < s.size(); i++){ diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 806ee6c1..eaaff11a 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -35,7 +35,7 @@ using portcullis::Junction; -void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { +void portcullis::ml::ModelFeatures::initGenomeMapper(const path& genomeFile) { // Initialise gmap.setGenomeFile(genomeFile); @@ -44,7 +44,7 @@ void portcullis::ModelFeatures::initGenomeMapper(const path& genomeFile) { gmap.loadFastaIndex(); } -uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { +uint32_t portcullis::ml::ModelFeatures::calcIntronThreshold(const JunctionList& juncs) { vector intron_sizes; for(auto& j : juncs) { @@ -58,7 +58,7 @@ uint32_t portcullis::ModelFeatures::calcIntronThreshold(const JunctionList& junc return L95; } -void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { +void portcullis::ml::ModelFeatures::trainCodingPotentialModel(const JunctionList& in) { vector exons; vector introns; @@ -102,7 +102,7 @@ void portcullis::ModelFeatures::trainCodingPotentialModel(const JunctionList& in intronModel.train(introns, 5); } -void portcullis::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { +void portcullis::ml::ModelFeatures::trainSplicingModels(const JunctionList& pass, const JunctionList& fail) { vector donors; vector acceptors; @@ -167,7 +167,7 @@ void portcullis::ModelFeatures::trainSplicingModels(const 
JunctionList& pass, co acceptorFModel.train(acceptors, 5); } -Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { +Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { vector headers; for(auto& f : features) { @@ -246,7 +246,7 @@ Data* portcullis::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { -portcullis::ForestPtr portcullis::ModelFeatures::trainInstance(const JunctionList& x, +portcullis::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { if (verbose) cout << "Creating feature vector" << endl; diff --git a/lib/src/performance.cc b/lib/src/performance.cc index 5ea2e764..cb379da6 100644 --- a/lib/src/performance.cc +++ b/lib/src/performance.cc @@ -23,7 +23,7 @@ using std::endl; #include -string portcullis::Performance::toShortString() const { +string portcullis::ml::Performance::toShortString() const { vector parts; parts.push_back(std::to_string(tp)); parts.push_back(std::to_string(tn)); @@ -35,7 +35,7 @@ string portcullis::Performance::toShortString() const { return boost::algorithm::join(parts, "\t"); } -string portcullis::Performance::toLongString() const { +string portcullis::ml::Performance::toLongString() const { vector parts; parts.push_back(std::to_string(tp)); parts.push_back(std::to_string(tn)); @@ -56,7 +56,7 @@ string portcullis::Performance::toLongString() const { } -void portcullis::Performance::loadGenuine(path& genuineFile, vector& results) { +void portcullis::ml::Performance::loadGenuine(path& genuineFile, vector& results) { // Load reference data std::ifstream refs(genuineFile.string()); @@ -71,7 +71,7 @@ void portcullis::Performance::loadGenuine(path& genuineFile, vector& resul refs.close(); } -void portcullis::PerformanceList::outputMeanPerformance(std::ostream& resout) { +void portcullis::ml::PerformanceList::outputMeanPerformance(std::ostream& resout) { 
vector prevs; vector biases; @@ -109,7 +109,7 @@ void portcullis::PerformanceList::outputMeanPerformance(std::ostream& resout) { outputMeanScore(mccs, "MCC", resout); } -void portcullis::PerformanceList::outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout) { +void portcullis::ml::PerformanceList::outputMeanScore(const vector& scores, const string& score_type, std::ostream& resout) { double sum = std::accumulate(scores.begin(), scores.end(), 0.0); double mean = sum / scores.size(); double sq_sum = std::inner_product(scores.begin(), scores.end(), scores.begin(), 0.0); diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc new file mode 100644 index 00000000..e7c33226 --- /dev/null +++ b/lib/src/ss_forest.cc @@ -0,0 +1,122 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +#include + +#include +using portcullis::SeqUtils; + +#include + +ForestPtr portcullis::ml::SSForest::train() { + + if (verbose) cout << "Initialising random forest" << endl; + ForestPtr f = make_shared(); + + vector catVars; + + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + labelled, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction + + f->setVerboseOut(&cerr); + + if (verbose) cout << "Training on labelled data" << endl; + f->run(verbose); + + string best_forest = outputPrefix+"/ssrf.0.forest"; + f->saveToFile(best_forest); + + vector oobe; + + // Store out of box prediction error for first run on just labelled data + oobe.push_back(f->getOverallPredictionError()); + + // Loop until no improvement using deterministic annealing + bool improved = true; + uint16_t repeat = 1; + uint16_t it = 1; + while(improved || repeat <= REPEAT_LIMIT) { + + // If OOBE improved during the last iteration make new predictions on + // unlabelled data with the current model + if (improved) { + if (verbose) cout << "Making predictions on the unlabelled set using current model" << endl; + f->setData(unlabelled); + f->setPredictionMode(true); + f->run(verbose); + + // For the unlabelled set draw random labels using the actual probability + // distributions of each tree in the forest and reassign genuine flag + for(size_t i = 0; 
i < unlabelled.size(); i++) { + double score = f->getPrediction(i); + double genuine = f->generateNewPrediction(i); + + } + } + + + + if (verbose) cout << "Re-training using labelled and unlabelled data" << endl; + f->setPredictionMode(false); + f->run(verbose); + + cout << "OOBE: " << f->getOverallPredictionError() << endl; + + double error_delta = oobe.back() - f->getOverallPredictionError(); + if (error_delta <= 0.0) { + improved = false; + repeat++; + cout << "No improvement with this permutation" << endl; + } + else { + oobe.push_back(f->getOverallPredictionError()); + improved = true; + repeat = 1; + forest = fm; + cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; + best_forest = outputPrefix+"/ssrf." + std::to_string(it++) + ".forest"; + f->saveToFile(best_forest); + } + } + + // Revert to the best forest + f->loadFromFile(best_forest); + + return f; +} From af0b4f1639eeeab5bd465b11ab8d69d31db2f987 Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 4 May 2016 23:25:09 +0100 Subject: [PATCH 06/21] Modifications to complete last commit. This commit fixes compilation issues and generally finished the logic for a semi-supervised random forest implementation but there is a bug when computing prediction errors. To fix... 
--- deps/ranger-0.3.8/include/ranger/Data.h | 2 +- deps/ranger-0.3.8/include/ranger/Forest.h | 20 ++- deps/ranger-0.3.8/src/Forest.cpp | 2 +- lib/include/portcullis/junction.hpp | 7 +- lib/include/portcullis/ml/markov_model.hpp | 8 +- lib/include/portcullis/ml/model_features.hpp | 4 +- lib/include/portcullis/ml/ss_forest.hpp | 22 ++-- lib/src/markov_model.cc | 3 +- lib/src/model_features.cc | 4 +- lib/src/performance.cc | 30 ++--- lib/src/ss_forest.cc | 58 +++++++-- src/junction_filter.cc | 124 ++----------------- src/junction_filter.hpp | 14 ++- src/train.cc | 11 +- src/train.hpp | 9 +- 15 files changed, 144 insertions(+), 174 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 56dd3810..46e3aea3 100644 --- a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -42,7 +42,7 @@ class Data { virtual ~Data(); virtual double get(size_t row, size_t col) const = 0; - + size_t getVariableID(std::string variable_name); virtual void reserveMemory() = 0; diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index 278b9fd4..70f2e9bb 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -129,6 +129,24 @@ class Forest { return sum / predictions[i].size(); } + + double makePrediction(int i) const { + double sum = std::accumulate(predictions[i].begin(), predictions[i].end(), 0.0); + double mean = sum / predictions[i].size(); + + double sq_sum = std::inner_product(predictions[i].begin(), predictions[i].end(), predictions[i].begin(), 0.0); + double stdev = std::sqrt(sq_sum / predictions[i].size() - mean * mean); + + std::random_device rd; + std::mt19937 gen(rd()); + + // values near the mean are the most likely + // standard deviation affects the dispersion of generated values from the mean + std::normal_distribution ndist(mean,stdev); + + // Return the new prediction based on the 
distribution + return std::round(ndist(gen)); + } size_t getDependentVarId() const { return dependent_varID; @@ -263,7 +281,7 @@ class Forest { // Random number generator std::mt19937_64 random_number_generator; - + std::string output_prefix; ImportanceMode importance_mode; diff --git a/deps/ranger-0.3.8/src/Forest.cpp b/deps/ranger-0.3.8/src/Forest.cpp index a17eb4c2..ab99bfd5 100644 --- a/deps/ranger-0.3.8/src/Forest.cpp +++ b/deps/ranger-0.3.8/src/Forest.cpp @@ -666,7 +666,7 @@ void Forest::growTreesInThread(uint thread_idx, std::vector* variable_im } void Forest::predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction) { - if (thread_ranges.size() > thread_idx + 1) { + if (thread_ranges.size() > thread_idx + 1) { for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { trees[i]->predict(prediction_data, oob_prediction); diff --git a/lib/include/portcullis/junction.hpp b/lib/include/portcullis/junction.hpp index b18c3d2a..7da0112e 100644 --- a/lib/include/portcullis/junction.hpp +++ b/lib/include/portcullis/junction.hpp @@ -43,11 +43,12 @@ typedef std::unordered_map SplicedAlignmentMap; #include "bam/genome_mapper.hpp" using namespace portcullis::bam; +#include "ml/markov_model.hpp" +using portcullis::ml::KmerMarkovModel; +using portcullis::ml::PosMarkovModel; + #include "intron.hpp" #include "seq_utils.hpp" -#include "markov_model.hpp" -using portcullis::KmerMarkovModel; -using portcullis::PosMarkovModel; using portcullis::Intron; diff --git a/lib/include/portcullis/ml/markov_model.hpp b/lib/include/portcullis/ml/markov_model.hpp index 7471b6c8..7318bba4 100644 --- a/lib/include/portcullis/ml/markov_model.hpp +++ b/lib/include/portcullis/ml/markov_model.hpp @@ -75,9 +75,9 @@ class MarkovModel { virtual double getScore(const string& seq) = 0; }; -class KmerMarkovModel : public portcullis::MarkovModel { +class KmerMarkovModel : public portcullis::ml::MarkovModel { private: - portcullis::KMMU model; + 
portcullis::ml::KMMU model; public: KmerMarkovModel() : MarkovModel(1) {} KmerMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; @@ -90,9 +90,9 @@ class KmerMarkovModel : public portcullis::MarkovModel { } }; -class PosMarkovModel : public portcullis::MarkovModel { +class PosMarkovModel : public portcullis::ml::MarkovModel { private: - portcullis::PMMU model; + portcullis::ml::PMMU model; public: PosMarkovModel() : MarkovModel(1) {} PosMarkovModel(const uint32_t _order) : MarkovModel(_order) {}; diff --git a/lib/include/portcullis/ml/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp index 64151166..4efe224c 100644 --- a/lib/include/portcullis/ml/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -27,10 +27,10 @@ using boost::filesystem::path; #include #include -#include +#include #include using portcullis::bam::GenomeMapper; -using portcullis::MarkovModel; +using portcullis::ml::MarkovModel; using portcullis::Junction; using portcullis::JunctionPtr; using portcullis::JunctionList; diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp index 07e1783a..7c699113 100644 --- a/lib/include/portcullis/ml/ss_forest.hpp +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -27,11 +27,14 @@ using boost::filesystem::path; #include #include +#include using portcullis::ml::ModelFeatures; +using portcullis::ml::MarkovModel; +using portcullis::ml::ForestPtr; #include using portcullis::bam::GenomeMapper; -using portcullis::MarkovModel; + using portcullis::Junction; using portcullis::JunctionPtr; using portcullis::JunctionList; @@ -46,18 +49,23 @@ class SemiSupervisedForest { private: uint16_t trees; Data* labelled; - Data* unlablled; + Data* unlabelled; + Data* all; ModelFeatures mf; ForestPtr forest; + bool verbose; + uint16_t threads; + string outputPrefix; public: - SemiSupervisedForest(const JunctionList& labelled, const JunctionList& unlabelled, - string outputPrefix, uint16_t trees, uint16_t 
threads, bool probabilityMode, - bool verbose); + SemiSupervisedForest(ModelFeatures& _mf, const JunctionList& labelled, const JunctionList& unlabelled, + string outputPrefix, uint16_t trees, uint16_t threads, bool verbose); + + virtual ~SemiSupervisedForest(); - void train(); + ForestPtr train(); - ForestPtr getForest(); + ForestPtr getForest() { return forest; }; static Data* juncs2FeatureVectors(const JunctionList& x); diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 5d112876..98564207 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -25,7 +25,8 @@ using std::endl; #include using portcullis::SeqUtils; -#include +#include +using portcullis::ml::KMMU; void portcullis::ml::KmerMarkovModel::train(const vector& input, const uint32_t _order) { order = _order; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index eaaff11a..621eee36 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -31,7 +31,7 @@ using std::make_shared; #include using portcullis::Junction; -#include +#include @@ -246,7 +246,7 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) -portcullis::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& x, +portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { if (verbose) cout << "Creating feature vector" << endl; diff --git a/lib/src/performance.cc b/lib/src/performance.cc index cb379da6..4a468930 100644 --- a/lib/src/performance.cc +++ b/lib/src/performance.cc @@ -20,7 +20,7 @@ using std::cout; using std::endl; -#include +#include string portcullis::ml::Performance::toShortString() const { @@ -29,9 +29,9 @@ string portcullis::ml::Performance::toShortString() const { parts.push_back(std::to_string(tn)); parts.push_back(std::to_string(fp)); parts.push_back(std::to_string(fn)); - 
parts.push_back(portcullis::Performance::to_2dp_string(getRecall())); - parts.push_back(portcullis::Performance::to_2dp_string(getPrecision())); - parts.push_back(portcullis::Performance::to_2dp_string(getF1Score())); + parts.push_back(Performance::to_2dp_string(getRecall())); + parts.push_back(Performance::to_2dp_string(getPrecision())); + parts.push_back(Performance::to_2dp_string(getF1Score())); return boost::algorithm::join(parts, "\t"); } @@ -41,17 +41,17 @@ string portcullis::ml::Performance::toLongString() const { parts.push_back(std::to_string(tn)); parts.push_back(std::to_string(fp)); parts.push_back(std::to_string(fn)); - parts.push_back(portcullis::Performance::to_2dp_string(getPrevalence())); - parts.push_back(portcullis::Performance::to_2dp_string(getBias())); - parts.push_back(portcullis::Performance::to_2dp_string(getSensitivity())); - parts.push_back(portcullis::Performance::to_2dp_string(getSpecificity())); - parts.push_back(portcullis::Performance::to_2dp_string(getPrecision())); - parts.push_back(portcullis::Performance::to_2dp_string(getNPV())); - parts.push_back(portcullis::Performance::to_2dp_string(getF1Score())); - parts.push_back(portcullis::Performance::to_2dp_string(getAccuracy())); - parts.push_back(portcullis::Performance::to_2dp_string(getInformedness())); - parts.push_back(portcullis::Performance::to_2dp_string(getMarkedness())); - parts.push_back(portcullis::Performance::to_2dp_string(getMCC())); + parts.push_back(Performance::to_2dp_string(getPrevalence())); + parts.push_back(Performance::to_2dp_string(getBias())); + parts.push_back(Performance::to_2dp_string(getSensitivity())); + parts.push_back(Performance::to_2dp_string(getSpecificity())); + parts.push_back(Performance::to_2dp_string(getPrecision())); + parts.push_back(Performance::to_2dp_string(getNPV())); + parts.push_back(Performance::to_2dp_string(getF1Score())); + parts.push_back(Performance::to_2dp_string(getAccuracy())); + 
parts.push_back(Performance::to_2dp_string(getInformedness())); + parts.push_back(Performance::to_2dp_string(getMarkedness())); + parts.push_back(Performance::to_2dp_string(getMCC())); return boost::algorithm::join(parts, "\t"); } diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index e7c33226..1bfde874 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -17,17 +17,55 @@ #include using std::cout; +using std::cerr; using std::endl; +using std::ostream; #include #include +#include + #include using portcullis::SeqUtils; #include -ForestPtr portcullis::ml::SSForest::train() { +portcullis::ml::SemiSupervisedForest::SemiSupervisedForest(ModelFeatures& mf, + const JunctionList& _labelled, const JunctionList& _unlabelled, + string _outputPrefix, uint16_t _trees, uint16_t _threads, bool _verbose) { + + verbose = _verbose; + + labelled = mf.juncs2FeatureVectors(_labelled); + if (verbose) cout << "Created labelled FV with " << labelled->getNumRows() << " entries." << endl; + + unlabelled = mf.juncs2FeatureVectors(_unlabelled); + if (verbose) cout << "Created unlabelled FV with " << unlabelled->getNumRows() << " entries." << endl; + + // Combine labelled and unlabelled + // This is inefficient but it makes life easier... hopefully it doesn't take too + // much memory + JunctionList _all; + _all.insert(_all.end(), _labelled.begin(), _labelled.end()); + _all.insert(_all.end(), _unlabelled.begin(), _unlabelled.end()); + all = mf.juncs2FeatureVectors(_all); + if (verbose) cout << "Created combined FV with " << all->getNumRows() << " entries." 
<< endl; + + + outputPrefix = _outputPrefix; + threads = _threads; + trees = _trees; + +} + +portcullis::ml::SemiSupervisedForest::~SemiSupervisedForest() { + delete labelled; + delete unlabelled; + delete all; +} + +ForestPtr portcullis::ml::SemiSupervisedForest::train() { if (verbose) cout << "Initialising random forest" << endl; ForestPtr f = make_shared(); @@ -54,7 +92,7 @@ ForestPtr portcullis::ml::SSForest::train() { true, // predall 1.0); // Sample fraction - f->setVerboseOut(&cerr); + f->setVerboseOut(&cout); if (verbose) cout << "Training on labelled data" << endl; f->run(verbose); @@ -66,6 +104,8 @@ ForestPtr portcullis::ml::SSForest::train() { // Store out of box prediction error for first run on just labelled data oobe.push_back(f->getOverallPredictionError()); + cout << "OOBE: " << f->getOverallPredictionError() << endl; + // Loop until no improvement using deterministic annealing bool improved = true; @@ -83,16 +123,16 @@ ForestPtr portcullis::ml::SSForest::train() { // For the unlabelled set draw random labels using the actual probability // distributions of each tree in the forest and reassign genuine flag - for(size_t i = 0; i < unlabelled.size(); i++) { - double score = f->getPrediction(i); - double genuine = f->generateNewPrediction(i); - + for(size_t i = 0; i < unlabelled->getNumRows(); i++) { + // Note we are assuming field 0 contains the label + // Override field 0 with the new label + bool error = false; + all->set(labelled->getNumRows() + i, 0, f->makePrediction(i), error); } } - - if (verbose) cout << "Re-training using labelled and unlabelled data" << endl; + f->setData(all); f->setPredictionMode(false); f->run(verbose); @@ -108,7 +148,7 @@ ForestPtr portcullis::ml::SSForest::train() { oobe.push_back(f->getOverallPredictionError()); improved = true; repeat = 1; - forest = fm; + forest = f; cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; best_forest = outputPrefix+"/ssrf." 
+ std::to_string(it++) + ".forest"; f->saveToFile(best_forest); diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 757f86ad..e8af616b 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -53,21 +53,18 @@ namespace po = boost::program_options; #include #include -#include +#include +using portcullis::ml::SemiSupervisedForest; + #include #include #include #include -#include #include -#include using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; -using portcullis::Performance; using portcullis::eval; -using portcullis::bam::GenomeMapper; -using portcullis::KFold; #include "junction_filter.hpp" @@ -240,13 +237,13 @@ void portcullis::JunctionFilter::filter() { if (train) { // The initial positive and negative sets - JunctionList pos, unlabelled, neg; + JunctionList pos, unlabelled, neg, unlabelled2; cout << "Self training mode activated." << endl << endl; createPositiveSet(currentJuncs, pos, unlabelled, mf); - createNegativeSet(mf.L95, unlabelled, neg); + createNegativeSet(mf.L95, unlabelled, neg, unlabelled2); cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; @@ -270,7 +267,9 @@ void portcullis::JunctionFilter::filter() { cout << "Training Random Forest" << endl << "----------------------" << endl << endl; bool done = false; - shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + //shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true); + shared_ptr forest = ssf.train(); const vector importance = forest->getVariableImportance(); mf.resetActiveFeatureIndex(); @@ -284,108 +283,7 @@ void portcullis::JunctionFilter::filter() { forest->writeOutput(&cout); modelFile = output.string() + ".selftrain.forest"; - cout << endl; - - // Determine optimal threshold using 5-fold CV - const uint16_t FOLDS = 5; - KFold kf(FOLDS, trainingSystem.getJunctions().begin(), trainingSystem.getJunctions().end()); - - JunctionList test, train; - PerformanceList perfs; - - cout << endl << "Starting " << FOLDS << "-fold cross validation" << endl; - - std::ofstream resout(this->output.string() + ".cv_results"); - - cout << "Fold\tT\t" << Performance::longHeader() << endl; - resout << "Fold\tT\t" << Performance::longHeader() << endl; - vector best_thresholds; - for (uint16_t i = 1; i <= 5; i++) { - - cout << i << "\t"; - resout << i << "\t"; - cout.flush(); - resout.flush(); - - // Populate train and test for this step - kf.getFold(i, back_inserter(train), back_inserter(test)); - - // Train on this particular set - ForestPtr f = mf.trainInstance(train, output.string() + ".selftrain.cv-" + to_string(i), DEFAULT_SELFTRAIN_TREES, threads, true, false); - - // Convert testing set junctions into feature vector - Data* testingData = mf.juncs2FeatureVectors(test); - - f->setPredictionMode(true); - 
f->setData(testingData); - f->run(false); - - delete testingData; - - vector thresholds; - for(double i = 0.0; i <= 1.0; i += 0.01) { - thresholds.push_back(i); - } - double max_mcc = 0.0; - double max_f1 = 0.0; - double best_t_mcc = 0.0; - double best_t_f1 = 0.0; - - for(auto& t : thresholds) { - JunctionList pjl; - JunctionList fjl; - categorise(f, test, pjl, fjl, t); - shared_ptr perf = calcPerformance(pjl, fjl); - double mcc = perf->getMCC(); - double f1 = perf->getF1Score(); - //cout << t << "\t" << perf->toLongString() << endl; - if (mcc > max_mcc) { - max_mcc = mcc; - best_t_mcc = t; - } - if (f1 > max_f1) { - max_f1 = f1; - best_t_f1 = t; - } - } - - //cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; - //cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - //cout << "Actual threshold set at " << threshold << endl; - JunctionList pjl; - JunctionList fjl; - categorise(f, test, pjl, fjl, best_t_mcc); - best_thresholds.push_back(best_t_mcc); - shared_ptr perf = calcPerformance(pjl, fjl); - cout << best_t_mcc << "\t" << perf->toLongString() << endl; - resout << best_t_mcc << "\t" << perf->toLongString() << endl; - - perfs.add(perf); - - // Clear the train and test vectors in preparation for the next step - train.clear(); - test.clear(); - } - - cout << "Cross validation completed" << endl << endl; - - perfs.outputMeanPerformance(resout); - - double b_t = 0.0; - for(double t : best_thresholds) { - b_t += t; - } - - b_t /= best_thresholds.size(); - - cout << "Optimal threshold: " << b_t << endl << endl; - - threshold = b_t; - - cout << endl << "Saved cross validation results to file " << output.string() << ".cv_results" << endl; - - resout.close(); - + cout << endl; } // Manage a junction system of all discarded junctions @@ -630,7 +528,7 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc } } -void 
portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg) { +void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs) { JuncResultMap resultMap; @@ -685,7 +583,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL } cout << endl << "L95x5\t"; - JunctionList passJuncs, failJuncs; + JunctionList passJuncs; const uint32_t L95x5 = L95 * 5; for(auto& j : f7) { if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index d21b14e2..1da9379c 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -47,19 +47,21 @@ using boost::filesystem::create_directory; using boost::filesystem::symbolic_link_exists; #include +using portcullis::bam::GenomeMapper; + +#include +#include +using portcullis::ml::Performance; +using portcullis::ml::ModelFeatures; + #include #include #include -#include #include -#include -using portcullis::bam::GenomeMapper; using portcullis::PortcullisFS; using portcullis::Intron; using portcullis::IntronHasher; -using portcullis::Performance; using portcullis::JuncResultMap; -using portcullis::ModelFeatures; namespace portcullis { @@ -301,7 +303,7 @@ class JunctionFilter { void createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf); - void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg); + void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs); public: diff --git a/src/train.cc b/src/train.cc index 0f4b500d..f1c30c27 100644 --- a/src/train.cc +++ b/src/train.cc @@ -46,14 +46,15 @@ namespace po = boost::program_options; #include #include +#include +#include +using portcullis::ml::Performance; +using portcullis::ml::PerformanceList; +using portcullis::ml::KFold; + #include -#include -#include using 
portcullis::JunctionSystem; using portcullis::JunctionList; -using portcullis::Performance; -using portcullis::PerformanceList; -using portcullis::KFold; #include "train.hpp" using portcullis::Train; diff --git a/src/train.hpp b/src/train.hpp index 95f2d63b..0a1c3538 100644 --- a/src/train.hpp +++ b/src/train.hpp @@ -35,9 +35,10 @@ using std::unique_ptr; using boost::filesystem::path; #include -#include -#include -using portcullis::ModelFeatures; + +#include +using portcullis::ml::ModelFeatures; +using portcullis::ml::ForestPtr; namespace portcullis { @@ -174,7 +175,7 @@ class Train { protected: - void testInstance(shared_ptr f, const JunctionList& y); + void testInstance(ForestPtr f, const JunctionList& y); void getRandomSubset(const JunctionList& in, JunctionList& out); }; From 82e4d515eece867306f9983d5e7c99586e40426a Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 9 May 2016 10:08:03 +0100 Subject: [PATCH 07/21] Changed interface for filter tool to take in prep dir rather than the genome file. This should guarantee that the genome index is present. Fixed a bug preventing ssforest from outputting model to disk, however there is still a bug when making predictions on the unlabelled data. 
--- lib/src/ss_forest.cc | 2 +- src/junction_builder.cc | 5 +++++ src/junction_filter.cc | 25 +++++++++++++++---------- src/junction_filter.hpp | 23 ++++++++++------------- src/portcullis.cc | 4 ++-- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 1bfde874..4cdc0845 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -97,7 +97,7 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { if (verbose) cout << "Training on labelled data" << endl; f->run(verbose); - string best_forest = outputPrefix+"/ssrf.0.forest"; + string best_forest = outputPrefix+".ssrf.0.forest"; f->saveToFile(best_forest); vector oobe; diff --git a/src/junction_builder.cc b/src/junction_builder.cc index 9b44b641..1835797a 100644 --- a/src/junction_builder.cc +++ b/src/junction_builder.cc @@ -93,6 +93,11 @@ void portcullis::JunctionBuilder::process() { } } + if (!bfs::exists(prepData.getSortedBamFilePath())) { + BOOST_THROW_EXCEPTION(JunctionBuilderException() << JunctionBuilderErrorInfo(string( + "Could not find prepared BAM file at: ") + prepData.getSortedBamFilePath().string())); + } + // Test if we have all the required data if (!prepData.valid(useCsi)) { BOOST_THROW_EXCEPTION(JunctionBuilderException() << JunctionBuilderErrorInfo(string( diff --git a/src/junction_filter.cc b/src/junction_filter.cc index e8af616b..4dbeee40 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -67,11 +67,12 @@ using portcullis::IntronHasher; using portcullis::eval; #include "junction_filter.hpp" +#include "prepare.hpp" -portcullis::JunctionFilter::JunctionFilter( const path& _junctionFile, +portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _junctionFile, const path& _output) { junctionFile = _junctionFile; - genomeFile = ""; + prepData = PreparedFiles(_prepDir); modelFile = ""; genuineFile = ""; output = _output; @@ -103,6 +104,11 @@ void portcullis::JunctionFilter::filter() { 
BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( "Could not find junction file at: ") + junctionFile.string())); } + + if (!bfs::exists(prepData.getGenomeFilePath())) { + BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( + "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); + } // Test if provided filter config file exists if (!modelFile.empty() && !exists(modelFile)) { @@ -199,7 +205,7 @@ void portcullis::JunctionFilter::filter() { // To be overridden if we are training ModelFeatures mf; - mf.initGenomeMapper(this->genomeFile); + mf.initGenomeMapper(prepData.getGenomeFilePath()); mf.features[1].active=false; // NB USRS (BAD) mf.features[2].active=false; // NB DISTRS (BAD) @@ -834,7 +840,7 @@ void portcullis::JunctionFilter::categorise(shared_ptr f, const Junction int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; - path genomeFile; + path prepDir; path modelFile; path genuineFile; path filterFile; @@ -888,16 +894,16 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. 
po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("genome-file", po::value(&genomeFile), "Path to the genome file to process.") - ("junction-file", po::value(&junctionFile), "Path to the junction file to process.") + ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") + ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), "The threshold score at which we determine a junction to be genuine or not.") ; // Positional option for the input bam file po::positional_options_description p; - p.add("genome-file", 1); - p.add("junction-file", 1); + p.add("prep_data_dir", 1); + p.add("junction_file", 1); // Combine non-positional options @@ -923,7 +929,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { << "------------------------------------------" << endl << endl; // Create the prepare class - JunctionFilter filter(junctionFile, output); + JunctionFilter filter(prepDir, junctionFile, output); filter.setSaveBad(saveBad); filter.setSource(source); filter.setVerbose(verbose); @@ -944,7 +950,6 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } } filter.setReferenceFile(referenceFile); - filter.setGenomeFile(genomeFile); filter.setThreshold(threshold); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 1da9379c..8c088fd5 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -63,6 +63,9 @@ using portcullis::Intron; using portcullis::IntronHasher; using portcullis::JuncResultMap; +#include "prepare.hpp" +using portcullis::PreparedFiles; + namespace portcullis { @@ -86,7 +89,7 @@ class JunctionFilter { private: path junctionFile; - path genomeFile; + PreparedFiles prepData; path modelFile; path filterFile; path genuineFile; @@ -109,7 +112,8 @@ class JunctionFilter { static path scriptsDir; static path dataDir; - 
JunctionFilter( const path& _junctionFile, + JunctionFilter( const path& _prepDir, + const path& _junctionFile, const path& _output); virtual ~JunctionFilter() { @@ -135,14 +139,6 @@ class JunctionFilter { this->junctionFile = junctionFile; } - path getGenomeFile() const { - return genomeFile; - } - - void setGenomeFile(path genomeFile) { - this->genomeFile = genomeFile; - } - double getThreshold() const { return threshold; } @@ -311,11 +307,12 @@ class JunctionFilter { return string("\nPortcullis Filter Mode Help.\n\n") + "Filters out junctions that are unlikely to be genuine or that have too little\n" + "supporting evidence. The user can control three stages of the filtering\n" + - "process. First the user can perform filtering based on a pre-defined random\n" + - "forest model. Second the user can specify a configuration file described a\n" + + "process. First the user can perform filtering based on a random forest model\n" + + "self-trained on the provided data, alternatively the user can provide a pre-\n" + + "trained model. Second the user can specify a configuration file describing a\n" + "set of filtering rules to apply. 
Third, the user can directly through the\n" + "command line filter based on junction (intron) length, or the canonical label.\n\n" + - "Usage: portcullis filter [options] \n\n" + + "Usage: portcullis filter [options] \n\n" + "Options"; } diff --git a/src/portcullis.cc b/src/portcullis.cc index 74af944f..56cfd57c 100644 --- a/src/portcullis.cc +++ b/src/portcullis.cc @@ -264,7 +264,7 @@ int mainFull(int argc, char *argv[]) { path juncOut = juncDir.string() + "/portcullis_all"; // Identify junctions and calculate metrics - JunctionBuilder jb(prepDir.string(), juncOut); + JunctionBuilder jb(prepDir, juncOut); jb.setThreads(threads); jb.setExtra(false); // Run in fast mode jb.setSeparate(false); // Run in fast mode @@ -284,7 +284,7 @@ int mainFull(int argc, char *argv[]) { path filtOut = outputDir.string() + "/3-filt/portcullis_filtered"; path juncTab = juncDir.string() + "/portcullis_all.junctions.tab"; - JunctionFilter filter(juncTab, filtOut); + JunctionFilter filter(prepDir, juncTab, filtOut); filter.setVerbose(verbose); filter.setSource(source); filter.setMaxLength(max_length); From dc819967d5cc9839e16a5a67e957ddf36c9d40c5 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 9 May 2016 11:00:29 +0100 Subject: [PATCH 08/21] Got the semi-supervised approach working although it doesn't seem to improve performance. Will need to investigate why. 
--- deps/ranger-0.3.8/include/ranger/Forest.h | 18 +----- deps/ranger-0.3.8/src/Forest.cpp | 23 ++++++- lib/include/portcullis/ml/ss_forest.hpp | 3 + lib/src/ss_forest.cc | 74 +++++++++++++++++++---- src/junction_filter.cc | 6 +- 5 files changed, 89 insertions(+), 35 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index 70f2e9bb..ad9e165c 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -130,23 +130,7 @@ class Forest { return sum / predictions[i].size(); } - double makePrediction(int i) const { - double sum = std::accumulate(predictions[i].begin(), predictions[i].end(), 0.0); - double mean = sum / predictions[i].size(); - - double sq_sum = std::inner_product(predictions[i].begin(), predictions[i].end(), predictions[i].begin(), 0.0); - double stdev = std::sqrt(sq_sum / predictions[i].size() - mean * mean); - - std::random_device rd; - std::mt19937 gen(rd()); - - // values near the mean are the most likely - // standard deviation affects the dispersion of generated values from the mean - std::normal_distribution ndist(mean,stdev); - - // Return the new prediction based on the distribution - return std::round(ndist(gen)); - } + double makePrediction(int i) const; size_t getDependentVarId() const { return dependent_varID; diff --git a/deps/ranger-0.3.8/src/Forest.cpp b/deps/ranger-0.3.8/src/Forest.cpp index ab99bfd5..5f100281 100644 --- a/deps/ranger-0.3.8/src/Forest.cpp +++ b/deps/ranger-0.3.8/src/Forest.cpp @@ -438,11 +438,12 @@ void Forest::grow(bool verbose) { // Initailize importance per thread std::vector> variable_importance_threads(num_threads); - + for (uint i = 0; i < num_threads; ++i) { if (importance_mode == IMP_GINI) { variable_importance_threads[i].resize(num_independent_variables, 0); } + threads.push_back(std::thread(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i]))); } if (verbose) showProgress("Growing 
trees.."); @@ -455,7 +456,7 @@ void Forest::grow(bool verbose) { throw std::runtime_error("User interrupt."); } #endif - + // Sum thread importances if (importance_mode == IMP_GINI) { variable_importance.resize(num_independent_variables, 0); @@ -517,6 +518,24 @@ void Forest::predict() { predictInternal(); } +double Forest::makePrediction(int i) const { + double sum = std::accumulate(predictions[i].begin(), predictions[i].end(), 0.0); + double mean = sum / predictions[i].size(); + + double sq_sum = std::inner_product(predictions[i].begin(), predictions[i].end(), predictions[i].begin(), 0.0); + double stdev = std::sqrt(sq_sum / predictions[i].size() - mean * mean); + + std::random_device rd; + std::mt19937 gen(rd()); + + // values near the mean are the most likely + // standard deviation affects the dispersion of generated values from the mean + std::normal_distribution ndist(mean,stdev); + + // Return the new prediction based on the distribution + return std::abs(std::round(ndist(gen))); +} + void Forest::computePredictionError() { // Predict trees in multiple threads diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp index 7c699113..6465561b 100644 --- a/lib/include/portcullis/ml/ss_forest.hpp +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -42,6 +42,9 @@ using portcullis::SplicingScores; namespace portcullis { namespace ml { + +typedef boost::error_info SSRFErrorInfo; +struct SSRFException: virtual boost::exception, virtual std::exception {}; const uint16_t REPEAT_LIMIT = 3; diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 4cdc0845..730357dd 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -103,11 +103,13 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { vector oobe; // Store out of box prediction error for first run on just labelled data - oobe.push_back(f->getOverallPredictionError()); + //oobe.push_back(f->getOverallPredictionError()); + oobe.push_back(1.0); cout << "OOBE: " << 
f->getOverallPredictionError() << endl; // Loop until no improvement using deterministic annealing + bool initOnly = true; bool improved = true; uint16_t repeat = 1; uint16_t it = 1; @@ -120,20 +122,45 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { f->setData(unlabelled); f->setPredictionMode(true); f->run(verbose); - - // For the unlabelled set draw random labels using the actual probability - // distributions of each tree in the forest and reassign genuine flag - for(size_t i = 0; i < unlabelled->getNumRows(); i++) { - // Note we are assuming field 0 contains the label - // Override field 0 with the new label - bool error = false; - all->set(labelled->getNumRows() + i, 0, f->makePrediction(i), error); + if (verbose) cout << "Made predictions. Applying labels to initially unlabelled set using randomly using normal distribution acquired from scores from decision trees." << endl; + } + + // For the unlabelled set draw random labels using the actual probability + // distributions of each tree in the forest and reassign genuine flag + for(size_t i = 0; i < unlabelled->getNumRows(); i++) { + // Note we are assuming field 0 contains the label + // Override field 0 with the new label + bool error = false; + double pred = f->makePrediction(i); + //cout << pred << endl; + all->set(0, labelled->getNumRows() + i, pred, error); + if (error) { + BOOST_THROW_EXCEPTION(SSRFException() << SSRFErrorInfo(string( + "Error setting label for initially unlabelled junction ") + std::to_string(i))); } } - if (verbose) cout << "Re-training using labelled and unlabelled data" << endl; - f->setData(all); - f->setPredictionMode(false); + if (verbose) cout << "Re-training using initially labelled and newly labelled data" << endl; + f = make_shared(); + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + all, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to 
avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction f->run(verbose); cout << "OOBE: " << f->getOverallPredictionError() << endl; @@ -147,15 +174,36 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { else { oobe.push_back(f->getOverallPredictionError()); improved = true; + initOnly = false; repeat = 1; forest = f; cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; - best_forest = outputPrefix+"/ssrf." + std::to_string(it++) + ".forest"; + best_forest = outputPrefix+".ssrf." + std::to_string(it++) + ".forest"; f->saveToFile(best_forest); } } // Revert to the best forest + f = make_shared(); + f->init( + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + initOnly ? 
labelled : all, // Data object + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + "", // Status var name + false, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction f->loadFromFile(best_forest); return f; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 4dbeee40..d79fe6b7 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -276,17 +276,17 @@ void portcullis::JunctionFilter::filter() { //shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true); shared_ptr forest = ssf.train(); - + /* const vector importance = forest->getVariableImportance(); mf.resetActiveFeatureIndex(); cout << "Feature importance:" << endl; for(auto& i : importance) { int16_t j = mf.getNextActiveFeatureIndex(); cout << mf.features[j].name << " - " << i << endl; - } + }*/ forest->saveToFile(); - forest->writeOutput(&cout); + //forest->writeOutput(&cout); modelFile = output.string() + ".selftrain.forest"; cout << endl; From fb6d345173cd4f6c29bb8cc9e5a07339d24e8c01 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 11 May 2016 17:02:44 +0100 Subject: [PATCH 09/21] Modifications to improve the way we handle semi supervised learning. Unfortunately, it's still not completely working yet. Improved filters to be more precise. 
--- data/selftrain_initial_neg.layer3.json | 84 +-- data/selftrain_initial_neg.layer4.json | 84 +-- data/selftrain_initial_neg.layer5.json | 84 +-- data/selftrain_initial_neg.layer6.json | 86 +-- data/selftrain_initial_neg.layer7.json | 4 +- data/selftrain_initial_pos.layer1.json | 10 +- data/selftrain_initial_pos.layer2.json | 8 +- deps/ranger-0.3.8/include/ranger/Data.h | 22 +- deps/ranger-0.3.8/include/ranger/DataChar.h | 56 +- deps/ranger-0.3.8/include/ranger/DataDouble.h | 52 +- deps/ranger-0.3.8/include/ranger/DataFloat.h | 44 +- deps/ranger-0.3.8/include/ranger/Forest.h | 2 +- deps/ranger-0.3.8/src/Data.cpp | 10 +- deps/ranger-0.3.8/src/DataChar.cpp | 41 +- deps/ranger-0.3.8/src/DataDouble.cpp | 15 +- deps/ranger-0.3.8/src/DataFloat.cpp | 21 +- lib/include/portcullis/ml/ss_forest.hpp | 9 +- lib/src/junction.cc | 2 + lib/src/model_features.cc | 38 +- lib/src/ss_forest.cc | 148 ++-- src/junction_filter.cc | 642 +++++++++--------- 21 files changed, 631 insertions(+), 831 deletions(-) diff --git a/data/selftrain_initial_neg.layer3.json b/data/selftrain_initial_neg.layer3.json index ec5cbc5d..ec8c8b81 100644 --- a/data/selftrain_initial_neg.layer3.json +++ b/data/selftrain_initial_neg.layer3.json @@ -4,90 +4,14 @@ "operator": "in", "value": ["N", "S"] }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 + "value": 3.0 }, - "PFP": { + "Suspect": { "operator": "eq", "value": 1 - }, - 
"M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 - } + } }, - "expression": "( Suspect & M11-entropy.2 )" + "expression": "( M1-canonical_ss & Suspect & M11-entropy )" } diff --git a/data/selftrain_initial_neg.layer4.json b/data/selftrain_initial_neg.layer4.json index 44890918..bab2560d 100644 --- a/data/selftrain_initial_neg.layer4.json +++ b/data/selftrain_initial_neg.layer4.json @@ -1,93 +1,13 @@ { "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["N", "S"] - }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, "M12-maxmmes": { "operator": "lt", "value": 7 }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, - "PFP": { - "operator": "eq", - "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - 
"M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, "M22-rel2raw": { "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 + "value": 0.5 } }, - "expression": "( M12-maxmmes & M22-rel2raw )" + "expression": "( M12-maxmmes & M22-rel2raw )" } diff --git a/data/selftrain_initial_neg.layer5.json b/data/selftrain_initial_neg.layer5.json index 945bd289..ef96164c 100644 --- a/data/selftrain_initial_neg.layer5.json +++ b/data/selftrain_initial_neg.layer5.json @@ -2,92 +2,12 @@ "parameters": { "M1-canonical_ss": { "operator": "in", - "value": ["N", "S"] + "value": ["N"] }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, - "PFP": { - "operator": "eq", - "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { "operator": "eq", "value": 0.0 } }, - "expression": "( M1-canonical_ss & M22-rel2raw.2 )" + "expression": "( 
M1-canonical_ss & M22-rel2raw )" } diff --git a/data/selftrain_initial_neg.layer6.json b/data/selftrain_initial_neg.layer6.json index 0265d1eb..ea5b4298 100644 --- a/data/selftrain_initial_neg.layer6.json +++ b/data/selftrain_initial_neg.layer6.json @@ -3,91 +3,11 @@ "M1-canonical_ss": { "operator": "in", "value": ["N", "S"] - }, - "M2-nb_reads": { - "operator": "lte", - "value": 1 - }, - "M3-nb_dist_aln": { - "operator": "gte", - "value": 2 - }, - "M4-nb_rel_aln": { - "operator": "eq", - "value": 0 - }, - "M11-entropy": { - "operator": "lt", - "value": 1.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M11-entropy.3": { - "operator": "eq", - "value": 0.0 - }, - "M12-maxmmes": { - "operator": "lt", - "value": 7 - }, - "M12-maxmmes.2": { - "operator": "lt", - "value": 10 - }, - "M12-maxmmes.3": { - "operator": "lt", - "value": 20 - }, - "M8-max_min_anc": { - "operator": "lt", - "value": 16 - }, - "Suspect": { - "operator": "eq", - "value": 1 - }, + }, "PFP": { "operator": "eq", "value": 1 - }, - "M13-hamming5p": { - "operator": "lte", - "value": 2 - }, - "M14-hamming3p": { - "operator": "lte", - "value": 2 - }, - "M19-mean_mismatches": { - "operator": "gte", - "value": 5.0 - }, - "M19-mean_mismatches.2": { - "operator": "gte", - "value": 2.0 - }, - "M20-nb_usrs": { - "operator": "eq", - "value": 0 - }, - "M20-nb_usrs.2": { - "operator": "eq", - "value": 1 - }, - "M21-nb_msrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "lt", - "value": 0.9 - }, - "M22-rel2raw.2": { - "operator": "eq", - "value": 0.0 - } + } }, - "expression": "( PFP )" + "expression": "( M1-canonical_ss & PFP )" } diff --git a/data/selftrain_initial_neg.layer7.json b/data/selftrain_initial_neg.layer7.json index 5b6bde7c..fb0352f3 100644 --- a/data/selftrain_initial_neg.layer7.json +++ b/data/selftrain_initial_neg.layer7.json @@ -10,11 +10,11 @@ }, "M13-hamming5p": { "operator": "lte", - "value": 4 + "value": 3 }, "M14-hamming3p": { "operator": 
"lte", - "value": 4 + "value": 3 } }, "expression": "( M2-nb_reads & M22-rel2raw & M13-hamming5p & M14-hamming3p )" diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index 1bc58557..f128bacb 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -8,6 +8,10 @@ "operator": "gte", "value": 8 }, + "M11-entropy": { + "operator": "gt", + "value": 1.0 + }, "M13-hamming5p": { "operator": "gte", "value": 5 @@ -18,7 +22,7 @@ }, "M19-mean_mismatches": { "operator": "lte", - "value": 1.5 + "value": 0.5 }, "M20-nb_usrs": { "operator": "gte", @@ -26,8 +30,8 @@ }, "M22-rel2raw": { "operator": "gte", - "value": 0.25 + "value": 0.33 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M11-entropy & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json index 62c0273e..b03d88ac 100644 --- a/data/selftrain_initial_pos.layer2.json +++ b/data/selftrain_initial_pos.layer2.json @@ -2,11 +2,11 @@ "parameters": { "M4-nb_rel_aln": { "operator": "gte", - "value": 3 + "value": 5 }, "M4-nb_rel_aln.2": { "operator": "gte", - "value": 2 + "value": 3 }, "M12-maxmmes": { "operator": "gte", @@ -14,7 +14,7 @@ }, "M12-maxmmes.2": { "operator": "gt", - "value": 12 + "value": 15 }, "M13-hamming5p": { "operator": "gte", @@ -38,7 +38,7 @@ }, "M19-mean_mismatches.2": { "operator": "lt", - "value": 0.5 + "value": 0.25 } }, "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index 46e3aea3..e918c6d7 100644 --- 
a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -39,13 +39,19 @@ class Data { public: Data(); + Data(std::vector variable_names, size_t num_rows, size_t num_cols); virtual ~Data(); virtual double get(size_t row, size_t col) const = 0; - + size_t getVariableID(std::string variable_name); - virtual void reserveMemory() = 0; + virtual void reserveMemoryInternal() = 0; + + void reserveMemory() { + reserveMemoryInternal(); + } + virtual void set(size_t col, size_t row, double value, bool& error) = 0; void addSparseData(unsigned char* sparse_data, size_t num_cols_sparse); @@ -115,12 +121,8 @@ class Data { } } - void setExternalData(bool external) { - this->externalData = external; - } - void saveToTSV(const std::string& file); - + std::string getRow(size_t row) const { std::stringstream ss; ss << this->get(row, 0); @@ -129,7 +131,7 @@ class Data { } return ss.str(); } - + std::string getHeader() const { std::stringstream ss; ss << variable_names[0]; @@ -138,7 +140,7 @@ class Data { } return ss.str(); } - + protected: std::vector variable_names; size_t num_rows; @@ -148,8 +150,6 @@ class Data { unsigned char* sparse_data; size_t num_cols_no_sparse; - bool externalData; - size_t* index_data; std::vector> unique_data_values; size_t max_num_unique_values; diff --git a/deps/ranger-0.3.8/include/ranger/DataChar.h b/deps/ranger-0.3.8/include/ranger/DataChar.h index 4c0fc5f5..1af193ac 100644 --- a/deps/ranger-0.3.8/include/ranger/DataChar.h +++ b/deps/ranger-0.3.8/include/ranger/DataChar.h @@ -34,40 +34,44 @@ wright@imbs.uni-luebeck.de #include "globals.h" #include "Data.h" -class DataChar: public Data { +class DataChar : public Data { public: - DataChar(); - DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, bool& error); - virtual ~DataChar(); - - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { - // Get data out of 
sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - } - } + DataChar(); + + DataChar(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, bool& error); + virtual ~DataChar(); - void reserveMemory() { - data = new char[num_cols * num_rows]; - } + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + } + } - void set(size_t col, size_t row, double value, bool& error) { - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; + void reserveMemoryInternal() { + data = new char[num_cols * num_rows]; } - if (floor(value) != ceil(value)) { - error = true; + + void set(size_t col, size_t row, double value, bool& error) { + if (value > CHAR_MAX || value < CHAR_MIN) { + error = true; + } + if (floor(value) != ceil(value)) { + error = true; + } + data[col * num_rows + row] = (char) value; } - data[col * num_rows + row] = (char) value; - } private: - char* data; + char* data; - DISALLOW_COPY_AND_ASSIGN(DataChar); + DISALLOW_COPY_AND_ASSIGN(DataChar); }; #endif /* DATACHAR_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/DataDouble.h b/deps/ranger-0.3.8/include/ranger/DataDouble.h index f722de8e..27c6c967 100644 --- a/deps/ranger-0.3.8/include/ranger/DataDouble.h +++ b/deps/ranger-0.3.8/include/ranger/DataDouble.h @@ -33,41 +33,39 @@ wright@imbs.uni-luebeck.de #include "utility.h" #include "Data.h" -class DataDouble: public Data { +class DataDouble : public Data { public: - 
DataDouble(); - DataDouble(double* data, std::vector variable_names, size_t num_rows, size_t num_cols) : - data(data) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - } - virtual ~DataDouble(); + DataDouble(); - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. - size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - double result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); - return result; + DataDouble(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataDouble(double* data, std::vector variable_names, size_t num_rows, size_t num_cols); + virtual ~DataDouble(); + + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. 
+ size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + double result = (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + return result; + } } - } - void reserveMemory() { - data = new double[num_cols * num_rows]; - } + void reserveMemoryInternal() { + data = new double[num_cols * num_rows]; + } - void set(size_t col, size_t row, double value, bool& error) { - data[col * num_rows + row] = value; - } + void set(size_t col, size_t row, double value, bool& error) { + data[col * num_rows + row] = value; + } private: - double* data; + double* data; - DISALLOW_COPY_AND_ASSIGN(DataDouble); + DISALLOW_COPY_AND_ASSIGN(DataDouble); }; #endif /* DATADOUBLE_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/DataFloat.h b/deps/ranger-0.3.8/include/ranger/DataFloat.h index ed26f7da..b56c3595 100644 --- a/deps/ranger-0.3.8/include/ranger/DataFloat.h +++ b/deps/ranger-0.3.8/include/ranger/DataFloat.h @@ -32,34 +32,38 @@ wright@imbs.uni-luebeck.de #include "globals.h" #include "Data.h" -class DataFloat: public Data { +class DataFloat : public Data { public: - DataFloat(); - DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols); - virtual ~DataFloat(); + DataFloat(); - double get(size_t row, size_t col) const { - if (col < num_cols_no_sparse) { - return data[col * num_rows + row]; - } else { - // Get data out of sparse storage. -1 because of GenABEL coding. 
- size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; - return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + DataFloat(std::vector variable_names, size_t num_rows, size_t num_cols) : Data(variable_names, num_rows, num_cols) { + reserveMemory(); + }; + DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols); + virtual ~DataFloat(); + + double get(size_t row, size_t col) const { + if (col < num_cols_no_sparse) { + return data[col * num_rows + row]; + } else { + // Get data out of sparse storage. -1 because of GenABEL coding. + size_t idx = (col - num_cols_no_sparse) * num_rows_rounded + row; + return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); + } } - } - void reserveMemory() { - data = new float[num_cols * num_rows]; - } + void reserveMemoryInternal() { + data = new float[num_cols * num_rows]; + } - void set(size_t col, size_t row, double value, bool& error) { - data[col * num_rows + row] = (float) value; - } + void set(size_t col, size_t row, double value, bool& error) { + data[col * num_rows + row] = (float) value; + } private: - float* data; + float* data; - DISALLOW_COPY_AND_ASSIGN(DataFloat); + DISALLOW_COPY_AND_ASSIGN(DataFloat); }; #endif /* DATAFLOAT_H_ */ diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index ad9e165c..f15b9957 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -171,7 +171,7 @@ class Forest { void setData(Data* data) { this->data = data; // Assume we want to reset predictions too - this->predictions.clear(); + //this->predictions.clear(); } void setVerboseOut(std::ostream* verbose_out) { diff --git a/deps/ranger-0.3.8/src/Data.cpp b/deps/ranger-0.3.8/src/Data.cpp index 712afb81..220dfbad 100644 --- a/deps/ranger-0.3.8/src/Data.cpp +++ b/deps/ranger-0.3.8/src/Data.cpp @@ -36,10 +36,17 @@ #include Data::Data() : -num_rows(0), 
num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), externalData(true), index_data( +num_rows(0), num_rows_rounded(0), num_cols(0), sparse_data(0), num_cols_no_sparse(0), index_data( 0), max_num_unique_values(0) { } +Data::Data(std::vector variable_names, size_t num_rows, size_t num_cols) : Data() { + this->variable_names = variable_names; + this->num_rows = num_rows; + this->num_cols = num_cols; + this->num_cols_no_sparse = num_cols; +} + Data::~Data() { if (index_data != 0) { delete[] index_data; @@ -94,7 +101,6 @@ bool Data::loadFromFile(std::string filename) { result = loadFromFileWhitespace(input_file, header_line); } - externalData = false; input_file.close(); return result; } diff --git a/deps/ranger-0.3.8/src/DataChar.cpp b/deps/ranger-0.3.8/src/DataChar.cpp index 737527dd..40028b0c 100644 --- a/deps/ranger-0.3.8/src/DataChar.cpp +++ b/deps/ranger-0.3.8/src/DataChar.cpp @@ -34,35 +34,28 @@ wright@imbs.uni-luebeck.de #include DataChar::DataChar() : - data(0) { +data(0) { } -DataChar::DataChar(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols, - bool& error) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - - reserveMemory(); - - // Save data and report errors - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - double value = data_double[i * num_rows + j]; - if (value > CHAR_MAX || value < CHAR_MIN) { - error = true; - } - if (floor(value) != ceil(value)) { - error = true; - } - data[i * num_rows + j] = (char) value; +DataChar::DataChar(double* data_double, std::vector variable_names, + size_t num_rows, size_t num_cols, bool& error) : + DataChar(variable_names, num_rows, num_cols) { + + // Save data and report errors + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + double value = data_double[i * num_rows + j]; + if (value > CHAR_MAX || value < 
CHAR_MIN) { + error = true; + } + if (floor(value) != ceil(value)) { + error = true; + } + data[i * num_rows + j] = (char) value; + } } - } } DataChar::~DataChar() { - if (!externalData) { delete[] data; - } } diff --git a/deps/ranger-0.3.8/src/DataDouble.cpp b/deps/ranger-0.3.8/src/DataDouble.cpp index af657bf6..862a85fa 100644 --- a/deps/ranger-0.3.8/src/DataDouble.cpp +++ b/deps/ranger-0.3.8/src/DataDouble.cpp @@ -29,12 +29,19 @@ wright@imbs.uni-luebeck.de #include DataDouble::DataDouble() : - data(0) { +data(0) { +} + +DataDouble::DataDouble(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) : + DataDouble(variable_names, num_rows, num_cols) { + + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + data[i * num_rows + j] = data_double[i * num_rows + j]; + } + } } DataDouble::~DataDouble() { - if (!externalData) { delete[] data; - } } - diff --git a/deps/ranger-0.3.8/src/DataFloat.cpp b/deps/ranger-0.3.8/src/DataFloat.cpp index 9df28dc2..bcb64506 100644 --- a/deps/ranger-0.3.8/src/DataFloat.cpp +++ b/deps/ranger-0.3.8/src/DataFloat.cpp @@ -32,26 +32,19 @@ wright@imbs.uni-luebeck.de #include DataFloat::DataFloat() : - data(0) { +data(0) { } -DataFloat::DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) { - this->variable_names = variable_names; - this->num_rows = num_rows; - this->num_cols = num_cols; - this->num_cols_no_sparse = num_cols; - - reserveMemory(); - for (size_t i = 0; i < num_cols; ++i) { - for (size_t j = 0; j < num_rows; ++j) { - data[i * num_rows + j] = (float) data_double[i * num_rows + j]; +DataFloat::DataFloat(double* data_double, std::vector variable_names, size_t num_rows, size_t num_cols) : + DataFloat(variable_names, num_rows, num_cols) { + for (size_t i = 0; i < num_cols; ++i) { + for (size_t j = 0; j < num_rows; ++j) { + data[i * num_rows + j] = (float) data_double[i * num_rows + j]; + } } - } } DataFloat::~DataFloat() { - if 
(!externalData) { delete[] data; - } } diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp index 6465561b..3ff28478 100644 --- a/lib/include/portcullis/ml/ss_forest.hpp +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -59,10 +59,14 @@ class SemiSupervisedForest { bool verbose; uint16_t threads; string outputPrefix; + double contribution; // How much of a contribution unlabelled data should make to the decision making + + double makePrediction(ForestPtr l, ForestPtr u, int i) const; + public: SemiSupervisedForest(ModelFeatures& _mf, const JunctionList& labelled, const JunctionList& unlabelled, - string outputPrefix, uint16_t trees, uint16_t threads, bool verbose); + string outputPrefix, uint16_t trees, uint16_t threads, double contribution, bool verbose); virtual ~SemiSupervisedForest(); @@ -71,6 +75,9 @@ class SemiSupervisedForest { ForestPtr getForest() { return forest; }; static Data* juncs2FeatureVectors(const JunctionList& x); + + + }; } diff --git a/lib/src/junction.cc b/lib/src/junction.cc index d791b272..1dc1bf70 100644 --- a/lib/src/junction.cc +++ b/lib/src/junction.cc @@ -192,6 +192,7 @@ portcullis::Junction::Junction(shared_ptr _location, int32_t _leftAncSta meanQueryLength = 0; suspicious = false; pfp = false; + genuine = false; canonicalSpliceSites = NO; maxMinAnchor = intron->minAnchorLength(_leftAncStart, _rightAncEnd); diffAnchor = 0; @@ -247,6 +248,7 @@ portcullis::Junction::Junction(const Junction& j, bool withAlignments) { meanQueryLength = j.meanQueryLength; suspicious = j.suspicious; pfp = j.pfp; + genuine = j.genuine; canonicalSpliceSites = j.canonicalSpliceSites; maxMinAnchor = j.maxMinAnchor; diffAnchor = j.diffAnchor; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 621eee36..258b35da 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -177,71 +177,69 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) } // Convert 
junction list info to double* - double* d = new double[x.size() * headers.size()]; + Data* d = new DataDouble(headers, x.size(), headers.size()); uint32_t row = 0; for (const auto& j : x) { SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, donorPWModel, acceptorPWModel); - d[0 * x.size() + row] = j->isGenuine(); + bool error = false; + d->set(0, row, j->isGenuine(), error); uint16_t i = 1; if (features[1].active) { - d[i++ * x.size() + row] = j->getNbUniquelySplicedReads(); + d->set(i++, row, j->getNbUniquelySplicedReads(), error); } if (features[2].active) { - d[i++ * x.size() + row] = j->getNbDistinctAlignments(); + d->set(i++, row, j->getNbDistinctAlignments(), error); } if (features[3].active) { - d[i++ * x.size() + row] = j->getNbReliableAlignments(); + d->set(i++, row, j->getNbReliableAlignments(), error); } if (features[4].active) { - d[i++ * x.size() + row] = j->getEntropy(); + d->set(i++, row, j->getEntropy(), error); } if (features[5].active) { - d[i++ * x.size() + row] = j->getReliable2RawRatio(); + d->set(i++, row, j->getReliable2RawRatio(), error); } if (features[6].active) { - d[i++ * x.size() + row] = j->getMaxMinAnchor(); + d->set(i++, row, j->getMaxMinAnchor(), error); } if (features[7].active) { - d[i++ * x.size() + row] = j->getMaxMMES(); + d->set(i++, row, j->getMaxMMES(), error); } if (features[8].active) { - d[i++ * x.size() + row] = j->getMeanMismatches(); + d->set(i++, row, j->getMeanMismatches(), error); } if (features[9].active) { - d[i++ * x.size() + row] = L95 == 0 ? 0.0 : j->calcIntronScore(L95); + d->set(i++, row, L95 == 0 ? 0.0 : j->calcIntronScore(L95), error); } if (features[10].active) { - d[i++ * x.size() + row] = std::min(j->getHammingDistance5p(), j->getHammingDistance3p()); + d->set(i++, row, std::min(j->getHammingDistance5p(), j->getHammingDistance3p()), error); } if (features[11].active) { - d[i++ * x.size() + row] = isCodingPotentialModelEmpty() ? 
0.0 : j->calcCodingPotential(gmap, exonModel, intronModel); + d->set(i++, row, isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel), error); } if (features[12].active) { - d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.positionWeighting; + d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.positionWeighting, error); } if (features[13].active) { - d[i++ * x.size() + row] = isPWModelEmpty() ? 0.0 : ss.splicingSignal; + d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.splicingSignal, error); } - //Junction overhang values at each position are first converted into deviation from expected distributions for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { if (features[joi + 14].active) { - d[i++ * x.size() + row] = j->getJunctionOverhangLogDeviation(joi); + d->set(i++, row, j->getJunctionOverhangLogDeviation(joi), error); } } row++; } - Data* data = new DataDouble(d, headers, x.size(), headers.size()); - data->setExternalData(false); // This causes 'd' to be deleted when 'data' is deleted - return data; + return d; } diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 730357dd..4554771e 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -33,9 +33,10 @@ using portcullis::SeqUtils; portcullis::ml::SemiSupervisedForest::SemiSupervisedForest(ModelFeatures& mf, const JunctionList& _labelled, const JunctionList& _unlabelled, - string _outputPrefix, uint16_t _trees, uint16_t _threads, bool _verbose) { + string _outputPrefix, uint16_t _trees, uint16_t _threads, double _contribution, bool _verbose) { verbose = _verbose; + contribution = _contribution; labelled = mf.juncs2FeatureVectors(_labelled); if (verbose) cout << "Created labelled FV with " << labelled->getNumRows() << " entries." 
<< endl; @@ -68,11 +69,12 @@ portcullis::ml::SemiSupervisedForest::~SemiSupervisedForest() { ForestPtr portcullis::ml::SemiSupervisedForest::train() { if (verbose) cout << "Initialising random forest" << endl; - ForestPtr f = make_shared(); - + ForestPtr l = make_shared(); + ForestPtr u = make_shared(); + vector catVars; - f->init( + l->init( "Genuine", // Dependant variable name MEM_DOUBLE, // Memory mode labelled, // Data object @@ -85,44 +87,50 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size "", // Status var name false, // Prediction mode - true, // Replace + false, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving DEFAULT_SPLITRULE, // Split rule true, // predall 1.0); // Sample fraction - f->setVerboseOut(&cout); + l->setVerboseOut(&cout); if (verbose) cout << "Training on labelled data" << endl; - f->run(verbose); + l->run(verbose); - string best_forest = outputPrefix+".ssrf.0.forest"; - f->saveToFile(best_forest); + string orig_forest = outputPrefix+".ssrf.0.forest"; + l->saveToFile(orig_forest); vector oobe; // Store out of box prediction error for first run on just labelled data - //oobe.push_back(f->getOverallPredictionError()); - oobe.push_back(1.0); - cout << "OOBE: " << f->getOverallPredictionError() << endl; - + oobe.push_back(l->getOverallPredictionError()); + cout << "OOBE: " << l->getOverallPredictionError() << endl; + + l->setData(unlabelled); + l->setPredictionMode(true); + l->run(verbose); + + // Loop until no improvement using deterministic annealing - bool initOnly = true; + bool initOnly = false; + bool first = true; bool improved = true; uint16_t repeat = 1; uint16_t it = 1; + string best_forest; while(improved || repeat <= REPEAT_LIMIT) { // If OOBE improved during the last iteration make new predictions on // unlabelled data with the current model - if (improved) { + if (improved && !first) { if (verbose) cout << "Making predictions 
on the unlabelled set using current model" << endl; - f->setData(unlabelled); - f->setPredictionMode(true); - f->run(verbose); - if (verbose) cout << "Made predictions. Applying labels to initially unlabelled set using randomly using normal distribution acquired from scores from decision trees." << endl; + u->setData(all); + u->setPredictionMode(true); + u->run(verbose); + if (verbose) cout << "Made predictions." << endl; } // For the unlabelled set draw random labels using the actual probability @@ -131,7 +139,7 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { // Note we are assuming field 0 contains the label // Override field 0 with the new label bool error = false; - double pred = f->makePrediction(i); + double pred = first ? l->makePrediction(i) : u->makePrediction(labelled->getNumRows() + i); //cout << pred << endl; all->set(0, labelled->getNumRows() + i, pred, error); if (error) { @@ -140,14 +148,14 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { } } - if (verbose) cout << "Re-training using initially labelled and newly labelled data" << endl; - f = make_shared(); - f->init( + if (verbose) cout << "Re-training newly labelled data" << endl; + u = make_shared(); + u->init( "Genuine", // Dependant variable name MEM_DOUBLE, // Memory mode all, // Data object 0, // M Try (0 == use default) - outputPrefix, // Output prefix + outputPrefix + "_" + std::to_string(it), // Output prefix trees, // Number of trees 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible threads, // Number of threads @@ -155,37 +163,50 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size "", // Status var name false, // Prediction mode - true, // Replace + false, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving DEFAULT_SPLITRULE, // Split rule true, // predall 1.0); // Sample fraction - f->run(verbose); + u->run(verbose); - cout << 
"OOBE: " << f->getOverallPredictionError() << endl; + cout << "OOBE: " << u->getOverallPredictionError() << endl; - double error_delta = oobe.back() - f->getOverallPredictionError(); - if (error_delta <= 0.0) { + double error_delta = oobe.back() - u->getOverallPredictionError(); + if (error_delta <= 0.0 && !first) { improved = false; repeat++; cout << "No improvement with this permutation" << endl; } else { - oobe.push_back(f->getOverallPredictionError()); + oobe.push_back(u->getOverallPredictionError()); improved = true; - initOnly = false; repeat = 1; - forest = f; - cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; + forest = u; + if (!first) cout << "Improvement of " << error_delta << " to OOBE with this iteration" << endl; best_forest = outputPrefix+".ssrf." + std::to_string(it++) + ".forest"; - f->saveToFile(best_forest); + u->saveToFile(best_forest); } + + first = false; } + if (oobe.back() - oobe.front() >= 0.0) { + initOnly = true; + cout << "Using unlabelled data does not appear to have improved the model. Reverting to original model derived only from initally labelled data." << endl; + } + else { + for(size_t j = labelled->getNumRows(); j < labelled->getNumRows()+unlabelled->getNumRows(); j++) { + bool error = false; + all->set(0, j, unlabelled->get(j - labelled->getNumRows(), 0), error); + } + } + + // Revert to the best forest - f = make_shared(); - f->init( + ForestPtr b = make_shared(); + b->init( "Genuine", // Dependant variable name MEM_DOUBLE, // Memory mode initOnly ? 
labelled : all, // Data object @@ -204,7 +225,58 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { DEFAULT_SPLITRULE, // Split rule true, // predall 1.0); // Sample fraction - f->loadFromFile(best_forest); - return f; + b->run(verbose); + + return b; } + +double portcullis::ml::SemiSupervisedForest::makePrediction(ForestPtr lab, ForestPtr unlab, int i) const { + + const std::vector> lab_pred = lab->getPredictions(); + const std::vector> unlab_pred = unlab->getPredictions(); + + double lab_sum = std::accumulate(lab_pred[i].begin(), lab_pred[i].end(), 0.0); + double unlab_sum = std::accumulate(unlab_pred[i].begin(), unlab_pred[i].end(), 0.0); + + double weight_sum = (double)lab_pred[i].size() + ((double)unlab_pred[i].size() * contribution ); + + double lab_mean = lab_sum / (double)lab_pred[i].size(); + double unlab_mean = unlab_sum / (double)unlab_pred[i].size(); + double weighted_mean = (lab_sum + (unlab_sum * contribution)) / (weight_sum); + + cout << "Means: lab - " << lab_mean << "; unlab - " << unlab_mean << "; weighted - " << weighted_mean << endl; + + double contr2 = std::pow(contribution, 2); + + double lab_sq_sum = std::inner_product(lab_pred[i].begin(), lab_pred[i].end(), lab_pred[i].begin(), 0.0); + double lab_stdev = std::sqrt(lab_sq_sum / lab_pred[i].size() - lab_mean * lab_mean); + + double unlab_sq_sum = std::inner_product(unlab_pred[i].begin(), unlab_pred[i].end(), unlab_pred[i].begin(), 0.0); + double unlab_stdev = std::sqrt(unlab_sq_sum / unlab_pred[i].size() - unlab_mean * unlab_mean); + + double var = 0.0; + for(size_t j = 0; j < lab_pred[i].size(); j++) { + var += std::pow(lab_pred[i][j] - weighted_mean, 2); + } + for(size_t j = 0; j < unlab_pred[i].size(); j++) { + var += contribution * std::pow(unlab_pred[i][j] - weighted_mean, 2); + } + + var /= weight_sum; + double weighted_stdev = std::sqrt(var); + + cout << "STD dev: lab - " << lab_stdev << "; unlab - " << unlab_stdev << "; weighted - " << weighted_stdev << endl; + + + + 
std::random_device rd; + std::mt19937 gen(rd()); + + // values near the mean are the most likely + // standard deviation affects the dispersion of generated values from the mean + std::normal_distribution ndist(weighted_mean,weighted_stdev); + + // Return the new prediction based on the distribution + return std::abs(std::round(ndist(gen))); +} \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index d79fe6b7..cbf9934d 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -69,8 +69,8 @@ using portcullis::eval; #include "junction_filter.hpp" #include "prepare.hpp" -portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _junctionFile, - const path& _output) { +portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _junctionFile, + const path& _output) { junctionFile = _junctionFile; prepData = PreparedFiles(_prepDir); modelFile = ""; @@ -88,47 +88,46 @@ portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _ju verbose = false; threshold = DEFAULT_FILTER_THRESHOLD; } - - + void portcullis::JunctionFilter::filter() { path outputDir = output.parent_path(); string outputPrefix = output.leaf().string(); - + if (outputDir.empty()) { outputDir = "."; } - + // Test if provided genome exists if (!exists(junctionFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find junction file at: ") + junctionFile.string())); + "Could not find junction file at: ") + junctionFile.string())); } - - if (!bfs::exists(prepData.getGenomeFilePath())) { + + if (!bfs::exists(prepData.getGenomeFilePath())) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); + "Could not find prepared genome file at: ") + prepData.getGenomeFilePath().string())); } // Test if provided filter config file exists if (!modelFile.empty() && 
!exists(modelFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter model file at: ") + modelFile.string())); + "Could not find filter model file at: ") + modelFile.string())); } - + if (!filterFile.empty() && !exists(filterFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find filter configuration file at: ") + filterFile.string())); + "Could not find filter configuration file at: ") + filterFile.string())); } - + if (!genuineFile.empty() && !exists(genuineFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find file containing marked junction labels at: ") + genuineFile.string())); + "Could not find file containing marked junction labels at: ") + genuineFile.string())); } - + if (!referenceFile.empty() && !exists(referenceFile)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "Could not find reference BED file at: ") + referenceFile.string())); + "Could not find reference BED file at: ") + referenceFile.string())); } if (!exists(outputDir)) { @@ -136,12 +135,11 @@ void portcullis::JunctionFilter::filter() { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( "Could not create output directory at: ") + outputDir.string())); } - } - else if (!bfs::is_directory(outputDir)) { + } else if (!bfs::is_directory(outputDir)) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( - "File exists with name of suggested output directory: ") + outputDir.string())); + "File exists with name of suggested output directory: ") + outputDir.string())); } - + cout << "Loading junctions from " << junctionFile.string() << " ..."; cout.flush(); @@ -149,75 +147,75 @@ void portcullis::JunctionFilter::filter() { JunctionSystem originalJuncs(junctionFile); cout << " done." << endl - << "Found " << originalJuncs.getJunctions().size() << " junctions." 
<< endl << endl; + << "Found " << originalJuncs.getJunctions().size() << " junctions." << endl << endl; unordered_set ref; - if (!referenceFile.empty()) { + if (!referenceFile.empty()) { ifstream ifs(referenceFile.c_str()); string line; // Loop through until end of file or we move onto the next ref seq - while ( std::getline(ifs, line) ) { + while (std::getline(ifs, line)) { boost::trim(line); vector parts; // #2: Search for tokens - boost::split( parts, line, boost::is_any_of("\t"), boost::token_compress_on ); + boost::split(parts, line, boost::is_any_of("\t"), boost::token_compress_on); // Ignore any non-entry lines if (parts.size() == 12) { - int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos + int end = std::stoi(parts[7]) - 1; // -1 to get from BED to portcullis coords for end pos string key = parts[0] + "(" + parts[6] + "," + std::to_string(end) + ")" + parts[5]; ref.insert(key); } } } - + vector genuine; if (!genuineFile.empty()) { - + cout << "Loading list of correct predictions of performance analysis ..."; cout.flush(); - + Performance::loadGenuine(genuineFile, genuine); - + if (genuine.size() != originalJuncs.getJunctions().size()) { BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string( "Genuine file contains ") + lexical_cast(genuine.size()) + - " entries. Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + + " entries. Junction file contains " + lexical_cast(originalJuncs.getJunctions().size()) + " junctions. The number of entries in both files must be the same to assess performance.")); } - + // Copy over results into junction list - for(size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { + for (size_t i = 0; i < originalJuncs.getJunctions().size(); i++) { originalJuncs.getJunctionAt(i)->setGenuine(genuine[i]); } - + cout << " done." 
<< endl << endl; } - + // Also keep the current list of junctions JunctionList currentJuncs; - + // Copy everything into passJunc to begin with - for(auto& j : originalJuncs.getJunctions()) { + for (auto& j : originalJuncs.getJunctions()) { currentJuncs.push_back(j); } - + // To be overridden if we are training ModelFeatures mf; mf.initGenomeMapper(prepData.getGenomeFilePath()); - - mf.features[1].active=false; // NB USRS (BAD) - mf.features[2].active=false; // NB DISTRS (BAD) + + mf.features[1].active = false; // NB USRS (BAD) + mf.features[2].active = false; // NB DISTRS (BAD) //mf.features[3].active=false; // NB RELRS (GOOD) - mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) + mf.features[4].active = false; // ENTROPY (BAD - JO LOGDEV ARE BETTER) //mf.features[5].active = false; // REL2RAW (GOOD) - mf.features[6].active=false; // MAXMINANC (BAD - MAXMMES IS BETTER) + mf.features[6].active = false; // MAXMINANC (BAD - MAXMMES IS BETTER) //mf.features[7].active=false; // MAXMMES (GOOD) //mf.features[8].active=false; // MEAN MISMATCH (GOOD) //mf.features[9].active=false; // INTRON (GOOD) //mf.features[10].active=false; // MIN_HAMM (GOOD) - mf.features[11].active=false; // CODING POTENTIAL (BAD) + mf.features[11].active = false; // CODING POTENTIAL (BAD) //mf.features[12].active=false; // POS WEIGHTS (GOOD) //mf.features[13].active=false; // SPLICE SIGNAL (GOOD) /*mf.features[14].active=false; // JO LOGDEV FEATURES BETTER THAN ENTROPY @@ -236,45 +234,45 @@ void portcullis::JunctionFilter::filter() { mf.features[27].active=false; mf.features[28].active=false; mf.features[29].active=false;*/ - - - - + + + + if (train) { - + // The initial positive and negative sets JunctionList pos, unlabelled, neg, unlabelled2; - + cout << "Self training mode activated." 
<< endl << endl; - + createPositiveSet(currentJuncs, pos, unlabelled, mf); - + createNegativeSet(mf.L95, unlabelled, neg, unlabelled2); - + cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl; - + cout << "Training markov models ..."; cout.flush(); mf.trainCodingPotentialModel(pos); mf.trainSplicingModels(pos, neg); cout << " done." << endl << endl; - - - + + + // Build the training set by combining the positive and negative sets JunctionList training; training.reserve(pos.size() + neg.size()); training.insert(training.end(), pos.begin(), pos.end()); training.insert(training.end(), neg.begin(), neg.end()); - + JunctionSystem trainingSystem(training); - trainingSystem.sort(); - + trainingSystem.sort(); + cout << "Training Random Forest" << endl - << "----------------------" << endl << endl; + << "----------------------" << endl << endl; bool done = false; //shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); - SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true); + SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); shared_ptr forest = ssf.train(); /* const vector importance = forest->getVariableImportance(); @@ -284,80 +282,80 @@ void portcullis::JunctionFilter::filter() { int16_t j = mf.getNextActiveFeatureIndex(); cout << mf.features[j].name << " - " << i << endl; }*/ - + forest->saveToFile(); //forest->writeOutput(&cout); - + modelFile = output.string() + ".selftrain.forest"; - cout << endl; + cout << endl; } - + // Manage a junction system of all discarded junctions JunctionSystem discardedJuncs; - + // Do ML based filtering if requested - if(!modelFile.empty() && exists(modelFile)){ - cout << 
"Predicting valid junctions using random forest model" << endl - << "----------------------------------------------------" << endl << endl; - + if (!modelFile.empty() && exists(modelFile)) { + cout << "Predicting valid junctions using random forest model" << endl + << "----------------------------------------------------" << endl << endl; + JunctionList passJuncs; JunctionList failJuncs; - + forestPredict(currentJuncs, passJuncs, failJuncs, mf); - + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Random Forest filtering results")); - + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } - - for(auto& j : failJuncs) { + + for (auto& j : failJuncs) { discardedJuncs.addJunction(j); - } + } } - - + + // Do rule based filtering if requested - if (!filterFile.empty() && exists(filterFile)) { - + if (!filterFile.empty() && exists(filterFile)) { + JunctionList passJuncs; JunctionList failJuncs; JuncResultMap resultMap; - + doRuleBasedFiltering(filterFile, currentJuncs, passJuncs, failJuncs, "Rule-based filtering", resultMap); - + RuleFilter::saveResults(path(output.string() + ".rule_filtering.results"), originalJuncs, resultMap); - - printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Rule-based filtering")); - + + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Rule-based filtering")); + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } - + // Add to discarded - for(auto& j : failJuncs) { + for (auto& j : failJuncs) { discardedJuncs.addJunction(j); - } + } } - + if (maxLength > 0 || this->doCanonicalFiltering()) { - + JunctionList passJuncs; JunctionList failJuncs; - - for(auto& j : currentJuncs) { - + + for (auto& j : currentJuncs) { + bool pass = true; if (maxLength > 0) { if (j->getIntronSize() > maxLength) { pass = false; } } - + if (pass && 
this->doCanonicalFiltering()) { if (this->filterNovel && j->getSpliceSiteType() == CanonicalSS::NO) { pass = false; @@ -372,41 +370,39 @@ void portcullis::JunctionFilter::filter() { if (pass) { passJuncs.push_back(j); - } - else { + } else { failJuncs.push_back(j); discardedJuncs.addJunction(j); } } - + printFilteringResults(currentJuncs, passJuncs, failJuncs, string("Post filtering (length and/or canonical) results")); - + // Reset currentJuncs currentJuncs.clear(); - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { currentJuncs.push_back(j); } } - + cout << endl; - + JunctionSystem filteredJuncs; JunctionSystem refKeptJuncs; - + if (currentJuncs.empty()) { cout << "WARNING: Filters discarded all junctions from input." << endl; - } - else { - + } else { + cout << "Recalculating junction grouping and distance stats based on new junction list that passed filters ..."; cout.flush(); - for(auto& j : currentJuncs) { + for (auto& j : currentJuncs) { filteredJuncs.addJunction(j); } - + if (!referenceFile.empty()) { - for(auto& j : discardedJuncs.getJunctions()) { + for (auto& j : discardedJuncs.getJunctions()) { if (ref.count(j->locationAsString()) > 0) { filteredJuncs.addJunction(j); refKeptJuncs.addJunction(j); @@ -417,26 +413,26 @@ void portcullis::JunctionFilter::filter() { filteredJuncs.calcJunctionStats(); cout << " done." << endl << endl; - + if (!referenceFile.empty()) { cout << "Brought back " << refKeptJuncs.size() << " junctions that were discarded by filters but were present in reference file." 
<< endl << endl; } } - - printFilteringResults( originalJuncs.getJunctions(), - filteredJuncs.getJunctions(), - discardedJuncs.getJunctions(), - string("Overall results")); + + printFilteringResults(originalJuncs.getJunctions(), + filteredJuncs.getJunctions(), + discardedJuncs.getJunctions(), + string("Overall results")); cout << endl << "Saving junctions passing filter to disk:" << endl; filteredJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".pass", source + "_pass"); - + if (saveBad) { cout << "Saving junctions failing filter to disk:" << endl; discardedJuncs.saveAll(outputDir.string() + "/" + outputPrefix + ".fail", source + "_fail"); - + if (!referenceFile.empty()) { cout << "Saving junctions failing filters but present in reference:" << endl; @@ -447,80 +443,93 @@ void portcullis::JunctionFilter::filter() { void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& pos, JunctionList& unlabelled, ModelFeatures& mf) { JuncResultMap resultMap; - + cout << "Creating initial positive set for training" << endl - << "------------------------------------------" << endl << endl - << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial positive set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each positive filter layer (Low FPs is important):" << endl; - } + cout << "Performance of each positive filter layer (Low FPs is important):" << endl; + } cout << "LAYER\t"; if (!genuineFile.empty()) { cout << Performance::longHeader(); + } else { + cout << "PASS\tFAIL"; } - + JunctionList p1, p2, p3; - + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalPosRulesFile(1), all, p1, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p1, unlabelled)->toLongString(); + } else { + cout << p1.size() << "\t" << unlabelled.size(); } - cout << endl << "2\t"; + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalPosRulesFile(2), p1, p2, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p2, unlabelled)->toLongString(); + } else { + cout << p2.size() << "\t" << unlabelled.size(); } - cout << endl << "3\t"; + cout << endl << "3\t"; RuleFilter::filter(this->getIntitalPosRulesFile(3), p2, p3, unlabelled, "Creating initial positive set for training", resultMap); if (!genuineFile.empty()) { cout << calcPerformance(p3, unlabelled)->toLongString(); + } else { + cout << p3.size() << "\t" << unlabelled.size(); } + cout << endl << "L95x1.5\t"; const uint32_t L95 = mf.calcIntronThreshold(p2); const uint32_t pos_length_limit = L95 * 1.5; - + JunctionList passJuncs; - for(auto& j : p3) { + for (auto& j : p3) { if (j->getIntronSize() <= pos_length_limit) { passJuncs.push_back(j); - } - else { + } else { unlabelled.push_back(j); } } if (!genuineFile.empty()) { cout << calcPerformance(passJuncs, unlabelled)->toLongString(); + } else { + cout << passJuncs.size() << "\t" << unlabelled.size(); } - - cout << endl << endl << "Found " << passJuncs.size() << " junctions for the positive set" << endl << endl; - // Analyse positive set to get L0.05 of intron size - cout << 
"Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(passJuncs) << endl << endl; - + JunctionSystem isp(passJuncs); + isp.sort(); + cout << endl << endl << "Found " << isp.size() << " junctions for the positive set" << endl << endl; + + // Analyse positive set to get L0.05 of intron size + cout << "Length of intron at 95th percentile of positive set (L95): " << mf.calcIntronThreshold(isp.getJunctions()) << endl << endl; + + cout << "Saving initial positive set:" << endl; - JunctionSystem isp(passJuncs); + isp.saveAll(output.string() + ".selftrain.initialset.pos", "portcullis_isp"); - for(auto& j : passJuncs) { + for (auto& j : isp.getJunctions()) { JunctionPtr copy = make_shared(*j); copy->setGenuine(true); - pos.push_back(copy); + pos.push_back(copy); } if (!genuineFile.empty()) { - + JunctionSystem invalidPos; - for(auto& j : passJuncs) { + for (auto& j : passJuncs) { if (!j->isGenuine()) { invalidPos.addJunction(j); } } JunctionSystem missedPos; - for(auto& j : unlabelled) { + for (auto& j : unlabelled) { if (j->isGenuine()) { missedPos.addJunction(j); } @@ -535,76 +544,93 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc } void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs) { - + JuncResultMap resultMap; - + cout << "Creating initial negative set for training" << endl - << "------------------------------------------" << endl << endl - << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." << endl << endl; + << "------------------------------------------" << endl << endl + << "Applying a set of rule-based filters in " << dataDir.string() << " to create initial negative set." 
<< endl << endl; if (!genuineFile.empty()) { - cout << "Performance of each negative filter layer (Low FNs is important):" << endl; + cout << "Performance of each negative filter layer (Low FNs is important):" << endl; } cout << "LAYER\t"; if (!genuineFile.empty()) { cout << Performance::longHeader(); + } else { + cout << "PASS\tFAIL"; } - + JunctionList p1, p2, p3, p4, p5, p6, p7, p8, f1, f2, f3, f4, f5, f6, f7, f8; - + cout << endl << "1\t"; RuleFilter::filter(this->getIntitalNegRulesFile(1), all, p1, f1, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p1, f1, true)->toLongString(); + cout << calcPerformance(p1, f1, true)->toLongString(); + } else { + cout << p1.size() << "\t" << f1.size(); } - cout << endl << "2\t"; + cout << endl << "2\t"; RuleFilter::filter(this->getIntitalNegRulesFile(2), f1, p2, f2, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p2, f2, true)->toLongString(); + cout << calcPerformance(p2, f2, true)->toLongString(); + } else { + cout << p2.size() << "\t" << f2.size(); } cout << endl << "3\t"; RuleFilter::filter(this->getIntitalNegRulesFile(3), f2, p3, f3, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p3, f3, true)->toLongString(); + cout << calcPerformance(p3, f3, true)->toLongString(); + } else { + cout << p3.size() << "\t" << f3.size(); } cout << endl << "4\t"; RuleFilter::filter(this->getIntitalNegRulesFile(4), f3, p4, f4, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p4, f4, true)->toLongString(); + cout << calcPerformance(p4, f4, true)->toLongString(); + } else { + cout << p4.size() << "\t" << f4.size(); } cout << endl << "5\t"; RuleFilter::filter(this->getIntitalNegRulesFile(5), f4, p5, f5, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - 
cout << calcPerformance(p5, f5, true)->toLongString(); + cout << calcPerformance(p5, f5, true)->toLongString(); + } else { + cout << p5.size() << "\t" << f5.size(); } cout << endl << "6\t"; RuleFilter::filter(this->getIntitalNegRulesFile(6), f5, p6, f6, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p6, f6, true)->toLongString(); + cout << calcPerformance(p6, f6, true)->toLongString(); + } else { + cout << p6.size() << "\t" << f6.size(); } cout << endl << "7\t"; RuleFilter::filter(this->getIntitalNegRulesFile(7), f6, p7, f7, "Creating initial negative set for training", resultMap); if (!genuineFile.empty()) { - cout << calcPerformance(p7, f7, true)->toLongString(); + cout << calcPerformance(p7, f7, true)->toLongString(); + } else { + cout << p7.size() << "\t" << f7.size(); } - cout << endl << "L95x5\t"; - + cout << endl << "L95x10\t"; + JunctionList passJuncs; - const uint32_t L95x5 = L95 * 5; - for(auto& j : f7) { - if (j->getIntronSize() > L95x5 && j->getMaxMMES() < 10 ) { - p7.push_back(j); - } - else { - failJuncs.push_back(j); - } + const uint32_t L95x10 = L95 * 10; + for (auto& j : f7) { + if (j->getIntronSize() > L95x10 && j->getMaxMMES() < 10) { + p8.push_back(j); + } else { + failJuncs.push_back(j); + } } if (!genuineFile.empty()) { - cout << calcPerformance(p8, failJuncs, true)->toLongString(); + cout << calcPerformance(p8, failJuncs, true)->toLongString(); + } else { + cout << p8.size() << "\t" << failJuncs.size(); } - + cout << endl << endl << "Concatenating negatives from each layer to create negative set" << endl << endl; - + passJuncs.insert(passJuncs.end(), p1.begin(), p1.end()); passJuncs.insert(passJuncs.end(), p2.begin(), p2.end()); passJuncs.insert(passJuncs.end(), p3.begin(), p3.end()); @@ -616,183 +642,188 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL // This will remove any duplicates JunctionSystem isn(passJuncs); - + isn.sort(); + + cout 
<< "Final\t"; if (!genuineFile.empty()) { - cout << "Final\t" << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString() << endl << endl; + cout << calcPerformance(isn.getJunctions(), failJuncs, true)->toLongString(); + } + else { + cout << isn.getJunctions().size() << "\t" << failJuncs.size(); } - cout << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; + cout << endl << endl << "Found " << isn.getJunctions().size() << " junctions for the negative set" << endl << endl; cout << endl << "Saving initial negative set:" << endl; isn.saveAll(output.string() + ".selftrain.initialset.neg", "portcullis_isn"); - for(auto& j : isn.getJunctions()) { - JunctionPtr copy = make_shared(*j); - copy->setGenuine(false); - neg.push_back(copy); + for (auto& j : isn.getJunctions()) { + JunctionPtr copy = make_shared(*j); + copy->setGenuine(false); + neg.push_back(copy); } if (!genuineFile.empty()) { - - JunctionSystem invalidNeg; - JunctionSystem missedNeg; - - for(auto& j : isn.getJunctions()) { - if (j->isGenuine()) { - invalidNeg.addJunction(j); - } - } - for(auto& j : failJuncs) { - if (!j->isGenuine()) { - missedNeg.addJunction(j); - } - } - - cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; - invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); - - cout << "Saving missed negative junctions to disk:" << endl; - missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); + + JunctionSystem invalidNeg; + JunctionSystem missedNeg; + + for (auto& j : isn.getJunctions()) { + if (j->isGenuine()) { + invalidNeg.addJunction(j); + } + } + for (auto& j : failJuncs) { + if (!j->isGenuine()) { + missedNeg.addJunction(j); + } + } + + cout << "Saving genuine valid junctions in initial negative set to disk:" << endl; + invalidNeg.saveAll(output.string() + ".selftrain.initialset.invalidneg", "portcullis_invalid_isn"); + + 
cout << "Saving missed negative junctions to disk:" << endl; + missedNeg.saveAll(output.string() + ".selftrain.initialset.missedneg", "portcullis_missed_isn"); } } - void portcullis::JunctionFilter::printFilteringResults(const JunctionList& in, const JunctionList& pass, const JunctionList& fail, const string& prefix) { // Output stats size_t diff = in.size() - pass.size(); cout << endl << prefix << endl - << "-------------------------" << endl - << "Input contained " << in.size() << " junctions." << endl - << "Output contains " << pass.size() << " junctions." << endl - << "Filtered out " << diff << " junctions." << endl; - + << "-------------------------" << endl + << "Input contained " << in.size() << " junctions." << endl + << "Output contains " << pass.size() << " junctions." << endl + << "Filtered out " << diff << " junctions." << endl; + if (!genuineFile.empty() && exists(genuineFile)) { shared_ptr p = calcPerformance(pass, fail); cout << Performance::longHeader() << endl; cout << p->toLongString() << endl << endl; - } + } } shared_ptr portcullis::JunctionFilter::calcPerformance(const JunctionList& pass, const JunctionList& fail, bool invert) { - + uint32_t tp = 0, tn = 0, fp = 0, fn = 0; if (invert) { - for(auto& j : pass) { - if (!j->isGenuine()) tn++; else fn++; + for (auto& j : pass) { + if (!j->isGenuine()) tn++; + else fn++; } - for(auto& j : fail) { - if (j->isGenuine()) tp++; else fp++; - } - } - else { - for(auto& j : pass) { - if (j->isGenuine()) tp++; else fp++; + for (auto& j : fail) { + if (j->isGenuine()) tp++; + else fp++; + } + } else { + for (auto& j : pass) { + if (j->isGenuine()) tp++; + else fp++; } - for(auto& j : fail) { - if (!j->isGenuine()) tn++; else fn++; + for (auto& j : fail) { + if (!j->isGenuine()) tn++; + else fn++; } } - + return make_shared(tp, tn, fp, fn); } - void portcullis::JunctionFilter::doRuleBasedFiltering(const path& ruleFile, const JunctionList& all, JunctionList& pass, JunctionList& fail, const string& prefix, 
JuncResultMap& resultMap) { cout << "Loading JSON rule-based filtering config file: " << ruleFile.string() << endl; cout << "Filtering junctions ..."; cout.flush(); - map filterCounts = RuleFilter::filter(ruleFile, all, pass, fail, prefix, resultMap); + map filterCounts = RuleFilter::filter(ruleFile, all, pass, fail, prefix, resultMap); cout << " done." << endl << endl - << "Number of junctions failing for each filter: " << endl; + << "Number of junctions failing for each filter: " << endl; - for(map::iterator iterator = filterCounts.begin(); iterator != filterCounts.end(); iterator++) { + for (map::iterator iterator = filterCounts.begin(); iterator != filterCounts.end(); iterator++) { cout << iterator->first << ": " << iterator->second << endl; } } - + void portcullis::JunctionFilter::forestPredict(const JunctionList& all, JunctionList& pass, JunctionList& fail, ModelFeatures& mf) { - + cout << "Creating feature vector" << endl; - + Data* testingData = mf.juncs2FeatureVectors(all); - + cout << "Initialising random forest" << endl; - + shared_ptr f = nullptr; if (train) { f = make_shared(); - } - else { + } else { f = make_shared(); } - + vector catVars; - + f->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - testingData, // Data object - 0, // M Try (0 == use default) - "", // Output prefix - 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) - 1234567890, // Seed for random generator - threads, // Number of threads - IMP_GINI, // Importance measure - train ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size - //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, - "", // Status var name - true, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - false, // predall - 1.0); // Sample fraction - + "Genuine", // Dependant variable name + MEM_DOUBLE, // Memory mode + testingData, // Data object + 0, // M Try (0 == use default) + "", // Output prefix + 250, //DEFAULT_SELFTRAIN_TREES, // Number of trees (will be overwritten when loading the model) + 1234567890, // Seed for random generator + threads, // Number of threads + IMP_GINI, // Importance measure + train ? DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size + //DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, + "", // Status var name + true, // Prediction mode + true, // Replace + catVars, // Unordered categorical variable names (vector) + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + false, // predall + 1.0); // Sample fraction + f->setVerboseOut(&cerr); - + // Load trees from saved model f->loadFromFile(modelFile.string()); - + cout << "Making predictions" << endl; f->run(verbose); - + path scorepath = output.string() + ".scores"; ofstream scoreStream(scorepath.c_str()); - + scoreStream << "Score\t" << Intron::locationOutputHeader() << "\tStrand\tSS\t" << testingData->getHeader() << endl; - - for(size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) - << "\t" << strandToChar(all[i]->getConsensusStrand()) - << "\t" << cssToChar(all[i]->getSpliceSiteType()) - << "\t" << testingData->getRow(i) << endl; + for (size_t i = 0; i < all.size(); i++) { + + scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) + << "\t" << strandToChar(all[i]->getConsensusStrand()) + << "\t" << cssToChar(all[i]->getSpliceSiteType()) + << 
"\t" << testingData->getRow(i) << endl; } - + scoreStream.close(); - - + + if (!genuineFile.empty() && exists(genuineFile)) { vector thresholds; - for(double i = 0.0; i <= 1.0; i += 0.01) { + for (double i = 0.0; i <= 1.0; i += 0.01) { thresholds.push_back(i); } double max_mcc = 0.0; double max_f1 = 0.0; double best_t_mcc = 0.0; double best_t_f1 = 0.0; - + cout << "Threshold\t" << Performance::longHeader() << endl; - for(auto& t : thresholds) { + for (auto& t : thresholds) { JunctionList pjl; JunctionList fjl; categorise(f, all, pjl, fjl, t); @@ -809,29 +840,27 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction best_t_f1 = t; } } - + cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; cout << "Actual threshold set at " << threshold << endl; categorise(f, all, pass, fail, threshold); - } - else { + } else { categorise(f, all, pass, fail, threshold); } - + cout << "Saved junction scores to: " << scorepath << endl; - - + + delete testingData; } void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { - - for(size_t i = 0; i < all.size(); i++) { + + for (size_t i = 0; i < all.size(); i++) { if (f->getPredictions()[i][0] <= t) { pass.push_back(all[i]); - } - else { + } else { fail.push_back(all[i]); } } @@ -855,38 +884,38 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { double threshold; bool verbose; bool help; - + struct winsize w; ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); // Declare the supported options. 
- po::options_description generic_options(helpMessage(), w.ws_col, w.ws_col/1.7); + po::options_description generic_options(helpMessage(), w.ws_col, w.ws_col / 1.7); generic_options.add_options() - ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), - "Output prefix for files generated by this program.") - ("filter_file,f", po::value(&filterFile), - "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") - ("model_file,m", po::value(&modelFile), - "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. See manual for more details.") + ("output,o", po::value(&output)->default_value(DEFAULT_FILTER_OUTPUT), + "Output prefix for files generated by this program.") + ("filter_file,f", po::value(&filterFile), + "If you wish to custom rule-based filter the junctions file, use this option to provide a list of the rules you wish to use. By default we don't filter using a rule-based method, we instead filter via a self-trained random forest model. See manual for more details.") + ("model_file,m", po::value(&modelFile), + "If you wish to use a custom random forest model to filter the junctions file, rather than self-training on the input dataset use this option to. 
See manual for more details.") ("genuine,g", po::value(&genuineFile), - "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") + "If you have a list of line separated boolean values in a file, indicating whether each junction in your input is genuine or not, then we can use that information here to gauge the accuracy of the predictions.") ("reference,r", po::value(&referenceFile), - "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") + "Reference annotation of junctions in BED format. Any junctions found by the junction analysis tool will be preserved if found in this reference file regardless of any other filtering criteria. If you need to convert a reference annotation from GTF or GFF to BED format portcullis contains scripts for this.") ("no_ml,n", po::bool_switch(&no_ml)->default_value(false), - "Disables machine learning filtering") + "Disables machine learning filtering") ("save_bad,b", po::bool_switch(&saveBad)->default_value(false), - "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") + "Saves bad junctions (i.e. junctions that fail the filter), as well as good junctions (those that pass)") ("source", po::value(&source)->default_value(DEFAULT_FILTER_SOURCE), - "The value to enter into the \"source\" field in GFF files.") + "The value to enter into the \"source\" field in GFF files.") ("max_length,l", po::value(&max_length)->default_value(0), - "Filter junctions longer than this value. Default (0) is to not filter based on length.") + "Filter junctions longer than this value. 
Default (0) is to not filter based on length.") ("canonical,c", po::value(&canonical)->default_value("OFF"), - "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GU-AG), S = Semi-canonical junctions (AT-AC, or GT-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") - ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), - "The number of threads to use during testing (only applies if using forest model).") - ("verbose,v", po::bool_switch(&verbose)->default_value(false), - "Print extra information") + "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GU-AG), S = Semi-canonical junctions (AT-AC, or GT-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") + ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), + "The number of threads to use during testing (only applies if using forest model).") + ("verbose,v", po::bool_switch(&verbose)->default_value(false), + "Print extra information") ("help", po::bool_switch(&help)->default_value(false), "Produce help message") ; @@ -894,10 +923,10 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { // in config file, but will not be shown to the user. 
po::options_description hidden_options("Hidden options"); hidden_options.add_options() - ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") - ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") - ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), - "The threshold score at which we determine a junction to be genuine or not.") + ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") + ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not.") ; // Positional option for the input bam file @@ -923,10 +952,10 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { - auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); + auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); cout << "Running portcullis in junction filter mode" << endl - << "------------------------------------------" << endl << endl; + << "------------------------------------------" << endl << endl; // Create the prepare class JunctionFilter filter(prepDir, junctionFile, output); @@ -936,14 +965,13 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { filter.setThreads(threads); filter.setMaxLength(max_length); filter.setCanonical(canonical); - + // Only set the filter rules if specified. 
filter.setFilterFile(filterFile); filter.setGenuineFile(genuineFile); if (modelFile.empty() && !no_ml) { - filter.setTrain(true); - } - else { + filter.setTrain(true); + } else { filter.setTrain(false); if (!no_ml) { filter.setModelFile(modelFile); @@ -951,7 +979,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } filter.setReferenceFile(referenceFile); filter.setThreshold(threshold); - + filter.filter(); return 0; From 9129e169aa9d9e9659a608075134a94baeff005f Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 11 May 2016 21:16:32 +0100 Subject: [PATCH 10/21] Reverting to sample with replacement as OOBE scores don't appear to work correctly without it... need to figure out why at some point. Fixed SSL approach but deactivated airbag feature to revert to inital model if OOBE doesn't fall below that of the initial model after incorperation of unlabelled data. Results seem to show what I would would expect to see so far... more investigation required. --- deps/ranger-0.3.8/include/ranger/Forest.h | 9 +++++++++ lib/src/ss_forest.cc | 12 ++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index f15b9957..a04b33b6 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -182,6 +182,15 @@ class Forest { void grow(bool verbose); void predict(); + + const std::vector& getCaseWeights() const { + return case_weights; + } + + void setCaseWeights(std::vector case_weights) { + this->case_weights = case_weights; + } + protected: diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 4554771e..9934bfca 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -85,15 +85,17 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { threads, // Number of threads IMP_GINI, // Importance measure DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size - "", // Status var name + "", // Status var name (not 
required unless using survival) false, // Prediction mode - false, // Replace + true, // Sample with replacement catVars, // Unordered categorical variable names (vector) false, // Memory saving DEFAULT_SPLITRULE, // Split rule - true, // predall + true, // Retain results for all trees (predict all) 1.0); // Sample fraction + vector case_weights = {0.7, 0.3}; + //l->setCaseWeights(case_weights); l->setVerboseOut(&cout); if (verbose) cout << "Training on labelled data" << endl; @@ -163,12 +165,13 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size "", // Status var name false, // Prediction mode - false, // Replace + true, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving DEFAULT_SPLITRULE, // Split rule true, // predall 1.0); // Sample fraction + //u->setCaseWeights(case_weights); u->run(verbose); cout << "OOBE: " << u->getOverallPredictionError() << endl; @@ -203,6 +206,7 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { } } + initOnly = false; // Revert to the best forest ForestPtr b = make_shared(); From 46e33ee7c564c43364b54f105ccff50712faaf54 Mon Sep 17 00:00:00 2001 From: Dan Date: Wed, 11 May 2016 21:54:34 +0100 Subject: [PATCH 11/21] Created a nicer interface for reinitialising random forests. 
--- deps/ranger-0.3.8/include/ranger/Forest.h | 35 +- deps/ranger-0.3.8/src/Forest.cpp | 1290 +++++++++++---------- lib/src/ss_forest.cc | 4 +- src/train.cc | 6 +- 4 files changed, 675 insertions(+), 660 deletions(-) diff --git a/deps/ranger-0.3.8/include/ranger/Forest.h b/deps/ranger-0.3.8/include/ranger/Forest.h index a04b33b6..6497d3f1 100644 --- a/deps/ranger-0.3.8/include/ranger/Forest.h +++ b/deps/ranger-0.3.8/include/ranger/Forest.h @@ -63,6 +63,10 @@ class Forest { std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction); + void init(MemoryMode memory_mode, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, bool prediction_mode, bool sample_with_replacement, bool memory_saving_splitting, + SplitRule splitrule, bool predict_all, double sample_fraction); void init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, @@ -120,16 +124,16 @@ class Forest { const std::vector >& getPredictions() const { return predictions; } - + const double getPrediction(int i) const { double sum = 0.0; - for(size_t j = 0; j < predictions[i].size(); j++) { + for (size_t j = 0; j < predictions[i].size(); j++) { sum += predictions[i][j]; } - + return sum / predictions[i].size(); } - + double makePrediction(int i) const; size_t getDependentVarId() const { @@ -156,23 +160,16 @@ class Forest { return is_ordered_variable; } - std::vector> getInbagCounts() const { - std::vector> result; - for (auto& tree : trees) { - result.push_back(tree->getInbagCounts()); 
- } - return result; - } - void setPredictionMode(bool prediction_mode) { this->prediction_mode = prediction_mode; } - void setData(Data* data) { - this->data = data; - // Assume we want to reset predictions too - //this->predictions.clear(); - } + /** + * Resets the data to a new dataset. Probably shouldn't use this unless you + * know what you are doing. Best to use "init" method instead. + * @param data + */ + void setData(Data* data, std::string dependent_variable_name, std::string status_variable_name, std::vector& unordered_variable_names); void setVerboseOut(std::ostream* verbose_out) { this->verbose_out = verbose_out; @@ -182,7 +179,7 @@ class Forest { void grow(bool verbose); void predict(); - + const std::vector& getCaseWeights() const { return case_weights; } @@ -274,7 +271,7 @@ class Forest { // Random number generator std::mt19937_64 random_number_generator; - + std::string output_prefix; ImportanceMode importance_mode; diff --git a/deps/ranger-0.3.8/src/Forest.cpp b/deps/ranger-0.3.8/src/Forest.cpp index 5f100281..2682c571 100644 --- a/deps/ranger-0.3.8/src/Forest.cpp +++ b/deps/ranger-0.3.8/src/Forest.cpp @@ -45,307 +45,322 @@ Forest::Forest() : verbose_out(0), num_trees(DEFAULT_NUM_TREE), mtry(0), min_node_size(0), num_variables(0), num_independent_variables( - 0), seed(0), dependent_varID(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement( - true), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction(1), num_threads( - DEFAULT_NUM_THREADS), data(0), overall_prediction_error(0), importance_mode(DEFAULT_IMPORTANCE_MODE), progress( - 0) { +0), seed(0), dependent_varID(0), num_samples(0), prediction_mode(false), memory_mode(MEM_DOUBLE), sample_with_replacement( +true), memory_saving_splitting(false), splitrule(DEFAULT_SPLITRULE), predict_all(false), keep_inbag(false), sample_fraction(1), num_threads( +DEFAULT_NUM_THREADS), data(0), 
overall_prediction_error(0), importance_mode(DEFAULT_IMPORTANCE_MODE), progress( +0) { } Forest::~Forest() { - for (auto& tree : trees) { - delete tree; - } + for (auto& tree : trees) { + delete tree; + } } - void Forest::initCpp(std::string dependent_variable_name, MemoryMode memory_mode, std::string input_file, uint mtry, - std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, - std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, - std::string split_select_weights_file, std::vector& always_split_variable_names, - std::string status_variable_name, bool sample_with_replacement, std::vector& unordered_variable_names, - bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, - double sample_fraction) { - - this->verbose_out = verbose_out; - - // Initialize data with memmode - switch (memory_mode) { - case MEM_DOUBLE: - data = new DataDouble(); - break; - case MEM_FLOAT: - data = new DataFloat(); - break; - case MEM_CHAR: - data = new DataChar(); - break; - } - - // Load data - if (verbose_out) *verbose_out << "Loading input file: " << input_file << "." << std::endl; - bool rounding_error = data->loadFromFile(input_file); - if (rounding_error && verbose_out) { - *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." 
- << std::endl; - } - - // Set prediction mode - bool prediction_mode = false; - if (!load_forest_filename.empty()) { - prediction_mode = true; - } - - // Call other init function - init(dependent_variable_name, memory_mode, data, mtry, output_prefix, num_trees, seed, num_threads, importance_mode, - min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, - memory_saving_splitting, splitrule, predict_all, sample_fraction); - - if (prediction_mode) { - loadFromFile(load_forest_filename); - } - // Set variables to be always considered for splitting - if (!always_split_variable_names.empty()) { - setAlwaysSplitVariables(always_split_variable_names); - } - - // TODO: Read 2d weights for tree-wise split select weights - // Load split select weights from file - if (!split_select_weights_file.empty()) { - std::vector> split_select_weights; - split_select_weights.resize(1); - loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); - if (split_select_weights[0].size() != num_variables - 1) { - throw std::runtime_error("Number of split select weights is not equal to number of independent variables."); - } - setSplitWeightVector(split_select_weights); - } - - // Load case weights from file - if (!case_weights_file.empty()) { - loadDoubleVectorFromFile(case_weights, case_weights_file); - if (case_weights.size() != num_samples - 1) { - throw std::runtime_error("Number of case weights is not equal to number of samples."); - } - } - - // Check if all catvars are coded in integers starting at 1 - if (!unordered_variable_names.empty()) { - std::string error_message = checkUnorderedVariables(data, unordered_variable_names); - if (!error_message.empty()) { - throw std::runtime_error(error_message); - } - } + std::string output_prefix, uint num_trees, std::ostream* verbose_out, uint seed, uint num_threads, + std::string load_forest_filename, ImportanceMode importance_mode, uint min_node_size, + std::string 
split_select_weights_file, std::vector& always_split_variable_names, + std::string status_variable_name, bool sample_with_replacement, std::vector& unordered_variable_names, + bool memory_saving_splitting, SplitRule splitrule, std::string case_weights_file, bool predict_all, + double sample_fraction) { + + this->verbose_out = verbose_out; + + // Initialize data with memmode + switch (memory_mode) { + case MEM_DOUBLE: + data = new DataDouble(); + break; + case MEM_FLOAT: + data = new DataFloat(); + break; + case MEM_CHAR: + data = new DataChar(); + break; + } + + // Load data + if (verbose_out) *verbose_out << "Loading input file: " << input_file << "." << std::endl; + bool rounding_error = data->loadFromFile(input_file); + if (rounding_error && verbose_out) { + *verbose_out << "Warning: Rounding or Integer overflow occurred. Use FLOAT or DOUBLE precision to avoid this." + << std::endl; + } + + // Set prediction mode + bool prediction_mode = false; + if (!load_forest_filename.empty()) { + prediction_mode = true; + } + + // Call other init function + init(dependent_variable_name, memory_mode, data, mtry, output_prefix, num_trees, seed, num_threads, importance_mode, + min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + if (prediction_mode) { + loadFromFile(load_forest_filename); + } + // Set variables to be always considered for splitting + if (!always_split_variable_names.empty()) { + setAlwaysSplitVariables(always_split_variable_names); + } + + // TODO: Read 2d weights for tree-wise split select weights + // Load split select weights from file + if (!split_select_weights_file.empty()) { + std::vector> split_select_weights; + split_select_weights.resize(1); + loadDoubleVectorFromFile(split_select_weights[0], split_select_weights_file); + if (split_select_weights[0].size() != num_variables - 1) { + throw std::runtime_error("Number of split 
select weights is not equal to number of independent variables."); + } + setSplitWeightVector(split_select_weights); + } + + // Load case weights from file + if (!case_weights_file.empty()) { + loadDoubleVectorFromFile(case_weights, case_weights_file); + if (case_weights.size() != num_samples - 1) { + throw std::runtime_error("Number of case weights is not equal to number of samples."); + } + } + + // Check if all catvars are coded in integers starting at 1 + if (!unordered_variable_names.empty()) { + std::string error_message = checkUnorderedVariables(data, unordered_variable_names); + if (!error_message.empty()) { + throw std::runtime_error(error_message); + } + } } void Forest::initR(std::string dependent_variable_name, Data* input_data, uint mtry, uint num_trees, - std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, - std::vector>& split_select_weights, std::vector& always_split_variable_names, - std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction) { - - this->verbose_out = verbose_out; - - // Call other init function - init(dependent_variable_name, MEM_DOUBLE, input_data, mtry, "", num_trees, seed, num_threads, importance_mode, - min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, - memory_saving_splitting, splitrule, predict_all, sample_fraction); - - // Set variables to be always considered for splitting - if (!always_split_variable_names.empty()) { - setAlwaysSplitVariables(always_split_variable_names); - } - - // Set split select weights - if (!split_select_weights.empty()) { - setSplitWeightVector(split_select_weights); - } - - // Set case weights - if (!case_weights.empty()) { - if (case_weights.size() != num_samples) { - throw 
std::runtime_error("Number of case weights not equal to number of samples."); - } - this->case_weights = case_weights; - } - - // Keep inbag counts - this->keep_inbag = keep_inbag; + std::ostream* verbose_out, uint seed, uint num_threads, ImportanceMode importance_mode, uint min_node_size, + std::vector>&split_select_weights, std::vector& always_split_variable_names, + std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + std::vector& case_weights, bool predict_all, bool keep_inbag, double sample_fraction) { + + this->verbose_out = verbose_out; + + // Call other init function + init(dependent_variable_name, MEM_DOUBLE, input_data, mtry, "", num_trees, seed, num_threads, importance_mode, + min_node_size, status_variable_name, prediction_mode, sample_with_replacement, unordered_variable_names, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + // Set variables to be always considered for splitting + if (!always_split_variable_names.empty()) { + setAlwaysSplitVariables(always_split_variable_names); + } + + // Set split select weights + if (!split_select_weights.empty()) { + setSplitWeightVector(split_select_weights); + } + + // Set case weights + if (!case_weights.empty()) { + if (case_weights.size() != num_samples) { + throw std::runtime_error("Number of case weights not equal to number of samples."); + } + this->case_weights = case_weights; + } + + // Keep inbag counts + this->keep_inbag = keep_inbag; } -void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, - std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, - uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, - std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, - bool 
predict_all, double sample_fraction) { - - // Initialize data with memmode - this->data = input_data; - - // Initialize random number generator and set seed - if (seed == 0) { - std::random_device random_device; - random_number_generator.seed(random_device()); - } else { - random_number_generator.seed(seed); - } - - // Set number of threads - if (num_threads == DEFAULT_NUM_THREADS) { +void Forest::init(MemoryMode memory_mode, uint mtry, std::string output_prefix, + uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, bool prediction_mode, bool sample_with_replacement, bool memory_saving_splitting, + SplitRule splitrule, bool predict_all, double sample_fraction) { + + // Initialize random number generator and set seed + if (seed == 0) { + std::random_device random_device; + random_number_generator.seed(random_device()); + } else { + random_number_generator.seed(seed); + } + + // Set number of threads + if (num_threads == DEFAULT_NUM_THREADS) { #ifdef WIN_R_BUILD - this->num_threads = 1; + this->num_threads = 1; #else - this->num_threads = std::thread::hardware_concurrency(); + this->num_threads = std::thread::hardware_concurrency(); #endif - } else { - this->num_threads = num_threads; - } - - // Set member variables - this->num_trees = num_trees; - this->mtry = mtry; - this->seed = seed; - this->output_prefix = output_prefix; - this->importance_mode = importance_mode; - this->min_node_size = min_node_size; - this->memory_mode = memory_mode; - this->prediction_mode = prediction_mode; - this->sample_with_replacement = sample_with_replacement; - this->memory_saving_splitting = memory_saving_splitting; - this->splitrule = splitrule; - this->predict_all = predict_all; - this->sample_fraction = sample_fraction; - - // Set number of samples and variables - num_samples = data->getNumRows(); - num_variables = data->getNumCols(); - - // Convert dependent variable name to ID - if (!prediction_mode && 
!dependent_variable_name.empty()) { - dependent_varID = data->getVariableID(dependent_variable_name); - } - - // Set unordered factor variables - if (!prediction_mode) { - is_ordered_variable.resize(num_variables, true); - for (auto& variable_name : unordered_variable_names) { - size_t varID = data->getVariableID(variable_name); - is_ordered_variable[varID] = false; - } - } - - no_split_variables.push_back(dependent_varID); - - initInternal(status_variable_name); - - num_independent_variables = num_variables - no_split_variables.size(); - - // Sort no split variables in ascending order - std::sort(no_split_variables.begin(), no_split_variables.end()); - - // Init split select weights - split_select_weights.push_back(std::vector()); - - // Check if mtry is in valid range - if (this->mtry > num_variables - 1) { - throw std::runtime_error("mtry can not be larger than number of variables in data."); - } - - // Check if any observations samples - if ((size_t) num_samples * sample_fraction < 1) { - throw std::runtime_error("sample_fraction too small, no observations sampled."); - } + } else { + this->num_threads = num_threads; + } + + // Set member variables + this->num_trees = num_trees; + this->mtry = mtry; + this->seed = seed; + this->output_prefix = output_prefix; + this->importance_mode = importance_mode; + this->min_node_size = min_node_size; + this->memory_mode = memory_mode; + this->prediction_mode = prediction_mode; + this->sample_with_replacement = sample_with_replacement; + this->memory_saving_splitting = memory_saving_splitting; + this->splitrule = splitrule; + this->predict_all = predict_all; + this->sample_fraction = sample_fraction; +} + +void Forest::init(std::string dependent_variable_name, MemoryMode memory_mode, Data* input_data, uint mtry, + std::string output_prefix, uint num_trees, uint seed, uint num_threads, ImportanceMode importance_mode, + uint min_node_size, std::string status_variable_name, bool prediction_mode, bool sample_with_replacement, + 
std::vector& unordered_variable_names, bool memory_saving_splitting, SplitRule splitrule, + bool predict_all, double sample_fraction) { + + this->init(memory_mode, mtry, output_prefix, num_trees, seed, num_threads, + importance_mode, min_node_size, prediction_mode, sample_with_replacement, + memory_saving_splitting, splitrule, predict_all, sample_fraction); + + this->setData(input_data, dependent_variable_name, status_variable_name, unordered_variable_names); } void Forest::run(bool verbose) { - if (prediction_mode) { - if (verbose && verbose_out) { - *verbose_out << "Predicting .." << std::endl; - } - predict(); - } else { - if (verbose && verbose_out) { - *verbose_out << "Growing trees .." << std::endl; - } + if (prediction_mode) { + if (verbose && verbose_out) { + *verbose_out << "Predicting .." << std::endl; + } + predict(); + } else { + if (verbose && verbose_out) { + *verbose_out << "Growing trees .." << std::endl; + } - grow(verbose); + grow(verbose); - if (verbose && verbose_out) { - *verbose_out << "Computing prediction error .." << std::endl; - } - computePredictionError(); + if (verbose && verbose_out) { + *verbose_out << "Computing prediction error .." << std::endl; + } + computePredictionError(); - if (importance_mode > IMP_GINI) { - if (verbose && verbose_out) { - *verbose_out << "Computing permutation variable importance .." << std::endl; - } - computePermutationImportance(); + if (importance_mode > IMP_GINI) { + if (verbose && verbose_out) { + *verbose_out << "Computing permutation variable importance .." << std::endl; + } + computePermutationImportance(); + } } - } +} + +/** +* Resets the data to a new dataset. Probably shouldn't use this unless you +* know what you are doing. Best to use "init" method instead. 
+* @param data +*/ +void Forest::setData(Data* data, std::string dependent_variable_name, std::string status_variable_name, std::vector& unordered_variable_names) { + + // Record the pointer to the data object + this->data = data; + + // Set number of samples and variables + this->num_samples = data->getNumRows(); + this->num_variables = data->getNumCols(); + + // Convert dependent variable name to ID + dependent_varID = data->getVariableID(dependent_variable_name); + + // Set unordered factor variables + is_ordered_variable.resize(num_variables, true); + for (auto& variable_name : unordered_variable_names) { + size_t varID = data->getVariableID(variable_name); + is_ordered_variable[varID] = false; + } + + no_split_variables.push_back(dependent_varID); + + initInternal(status_variable_name); + + num_independent_variables = num_variables - no_split_variables.size(); + + // Sort no split variables in ascending order + std::sort(no_split_variables.begin(), no_split_variables.end()); + + // Init split select weights + split_select_weights.push_back(std::vector()); + + // Check if mtry is in valid range + if (this->mtry > num_variables - 1) { + throw std::runtime_error("mtry can not be larger than number of variables in data."); + } + + // Check if any observations samples + if ((size_t) num_samples * sample_fraction < 1) { + throw std::runtime_error("sample_fraction too small, no observations sampled."); + } } void Forest::writeOutput(std::ostream* out) { - *out << std::endl; - writeOutputInternal(); - *out << "Dependent variable name: " << data->getVariableNames()[dependent_varID] << std::endl; - *out << "Dependent variable ID: " << dependent_varID << std::endl; - *out << "Number of trees: " << num_trees << std::endl; - *out << "Sample size: " << num_samples << std::endl; - *out << "Number of independent variables: " << num_independent_variables << std::endl; - *out << "Mtry: " << mtry << std::endl; - *out << "Target node size: " << min_node_size << std::endl; - *out 
<< "Variable importance mode: " << importance_mode << std::endl; - *out << "Memory mode: " << memory_mode << std::endl; - *out << "Seed: " << seed << std::endl; - *out << "Number of threads: " << num_threads << std::endl; - *out << std::endl; - - if (prediction_mode) { - writePredictionFile(); - } else { - *out << "Overall OOB prediction error: " << overall_prediction_error << std::endl; + *out << std::endl; + writeOutputInternal(); + *out << "Dependent variable name: " << data->getVariableNames()[dependent_varID] << std::endl; + *out << "Dependent variable ID: " << dependent_varID << std::endl; + *out << "Number of trees: " << num_trees << std::endl; + *out << "Sample size: " << num_samples << std::endl; + *out << "Number of independent variables: " << num_independent_variables << std::endl; + *out << "Mtry: " << mtry << std::endl; + *out << "Target node size: " << min_node_size << std::endl; + *out << "Variable importance mode: " << importance_mode << std::endl; + *out << "Memory mode: " << memory_mode << std::endl; + *out << "Seed: " << seed << std::endl; + *out << "Number of threads: " << num_threads << std::endl; *out << std::endl; - if (!split_select_weights.empty() & !split_select_weights[0].empty()) { - *out - << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." - << std::endl; - } + if (prediction_mode) { + writePredictionFile(); + } else { + *out << "Overall OOB prediction error: " << overall_prediction_error << std::endl; + *out << std::endl; - if (importance_mode != IMP_NONE) { - writeImportanceFile(); - } + if (!split_select_weights.empty() & !split_select_weights[0].empty()) { + *out + << "Warning: Split select weights used. Variable importance measures are only comparable for variables with equal weights." 
+ << std::endl; + } + + if (importance_mode != IMP_NONE) { + writeImportanceFile(); + } - writeConfusionFile(); - } + writeConfusionFile(); + } } void Forest::writeImportanceFile() { - // Open importance file for writing - std::string filename = output_prefix + ".importance"; - std::ofstream importance_file; - importance_file.open(filename, std::ios::out); - if (!importance_file.good()) { - throw std::runtime_error("Could not write to importance file: " + filename + "."); - } - - // Write importance to file - for (size_t i = 0; i < variable_importance.size(); ++i) { - size_t varID = i; - for (auto& skip : no_split_variables) { - if (varID >= skip) { - ++varID; - } - } - std::string variable_name = data->getVariableNames()[varID]; - importance_file << variable_name << ": " << variable_importance[i] << std::endl; - } - - importance_file.close(); - if (verbose_out) *verbose_out << "Saved variable importance to file " << filename << "." << std::endl; + // Open importance file for writing + std::string filename = output_prefix + ".importance"; + std::ofstream importance_file; + importance_file.open(filename, std::ios::out); + if (!importance_file.good()) { + throw std::runtime_error("Could not write to importance file: " + filename + "."); + } + + // Write importance to file + for (size_t i = 0; i < variable_importance.size(); ++i) { + size_t varID = i; + for (auto& skip : no_split_variables) { + if (varID >= skip) { + ++varID; + } + } + std::string variable_name = data->getVariableNames()[varID]; + importance_file << variable_name << ": " << variable_importance[i] << std::endl; + } + + importance_file.close(); + if (verbose_out) *verbose_out << "Saved variable importance to file " << filename << "." 
<< std::endl; } void Forest::saveToFile() { @@ -354,168 +369,168 @@ void Forest::saveToFile() { void Forest::saveToFile(std::string filename) { - // Open file for writing - std::ofstream outfile; - outfile.open(filename, std::ios::binary); - if (!outfile.good()) { - throw std::runtime_error("Could not write to output file: " + filename + "."); - } + // Open file for writing + std::ofstream outfile; + outfile.open(filename, std::ios::binary); + if (!outfile.good()) { + throw std::runtime_error("Could not write to output file: " + filename + "."); + } - // Write dependent_varID - outfile.write((char*) &dependent_varID, sizeof(dependent_varID)); + // Write dependent_varID + outfile.write((char*) &dependent_varID, sizeof (dependent_varID)); - // Write num_trees - outfile.write((char*) &num_trees, sizeof(num_trees)); + // Write num_trees + outfile.write((char*) &num_trees, sizeof (num_trees)); - // Write is_ordered_variable - saveVector1D(is_ordered_variable, outfile); + // Write is_ordered_variable + saveVector1D(is_ordered_variable, outfile); - saveToFileInternal(outfile); + saveToFileInternal(outfile); - // Write tree data for each tree - for (auto& tree : trees) { - tree->appendToFile(outfile); - } + // Write tree data for each tree + for (auto& tree : trees) { + tree->appendToFile(outfile); + } - // Close file - outfile.close(); - if (verbose_out) *verbose_out << "Saved forest to file " << filename << "." << std::endl; + // Close file + outfile.close(); + if (verbose_out) *verbose_out << "Saved forest to file " << filename << "." << std::endl; } void Forest::grow(bool verbose) { - // Create thread ranges - equalSplit(thread_ranges, 0, num_trees - 1, num_threads); + // Create thread ranges + equalSplit(thread_ranges, 0, num_trees - 1, num_threads); - // Call special grow functions of subclasses. There trees must be created. - growInternal(); + // Call special grow functions of subclasses. There trees must be created. 
+ growInternal(); - // Init trees, create a seed for each tree, based on main seed - std::uniform_int_distribution udist; - for (size_t i = 0; i < num_trees; ++i) { - uint tree_seed; - if (seed == 0) { - tree_seed = udist(random_number_generator); - } else { - tree_seed = (i + 1) * seed; - } + // Init trees, create a seed for each tree, based on main seed + std::uniform_int_distribution udist; + for (size_t i = 0; i < num_trees; ++i) { + uint tree_seed; + if (seed == 0) { + tree_seed = udist(random_number_generator); + } else { + tree_seed = (i + 1) * seed; + } - // Get split select weights for tree - std::vector* tree_split_select_weights; - if (split_select_weights.size() > 1) { - tree_split_select_weights = &split_select_weights[i]; - } else { - tree_split_select_weights = &split_select_weights[0]; - } + // Get split select weights for tree + std::vector* tree_split_select_weights; + if (split_select_weights.size() > 1) { + tree_split_select_weights = &split_select_weights[i]; + } else { + tree_split_select_weights = &split_select_weights[0]; + } - trees[i]->init(data, mtry, dependent_varID, num_samples, tree_seed, &deterministic_varIDs, &split_select_varIDs, - tree_split_select_weights, importance_mode, min_node_size, &no_split_variables, sample_with_replacement, - &is_ordered_variable, memory_saving_splitting, splitrule, &case_weights, keep_inbag, sample_fraction); - } + trees[i]->init(data, mtry, dependent_varID, num_samples, tree_seed, &deterministic_varIDs, &split_select_varIDs, + tree_split_select_weights, importance_mode, min_node_size, &no_split_variables, sample_with_replacement, + &is_ordered_variable, memory_saving_splitting, splitrule, &case_weights, keep_inbag, sample_fraction); + } - // Init variable importance - variable_importance.resize(num_independent_variables, 0); + // Init variable importance + variable_importance.resize(num_independent_variables, 0); - // Grow trees in multiple threads + // Grow trees in multiple threads #ifdef WIN_R_BUILD 
- progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->grow(&variable_importance); - progress++; - showProgress("Growing trees..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->grow(&variable_importance); + progress++; + showProgress("Growing trees..", start_time, lap_time); + } #else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); + std::vector threads; + threads.reserve(num_threads); - // Initailize importance per thread - std::vector> variable_importance_threads(num_threads); - - for (uint i = 0; i < num_threads; ++i) { - if (importance_mode == IMP_GINI) { - variable_importance_threads[i].resize(num_independent_variables, 0); + // Initailize importance per thread + std::vector> variable_importance_threads(num_threads); + + for (uint i = 0; i < num_threads; ++i) { + if (importance_mode == IMP_GINI) { + variable_importance_threads[i].resize(num_independent_variables, 0); + } + + threads.push_back(std::thread(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i]))); + } + if (verbose) showProgress("Growing trees.."); + for (auto &thread : threads) { + thread.join(); } - - threads.push_back(std::thread(&Forest::growTreesInThread, this, i, &(variable_importance_threads[i]))); - } - if (verbose) showProgress("Growing trees.."); - for (auto &thread : threads) { - thread.join(); - } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif - - // Sum thread importances - if (importance_mode == IMP_GINI) { - variable_importance.resize(num_independent_variables, 0); - for (size_t i = 0; i < 
num_independent_variables; ++i) { - for (uint j = 0; j < num_threads; ++j) { - variable_importance[i] += variable_importance_threads[j][i]; - } + + // Sum thread importances + if (importance_mode == IMP_GINI) { + variable_importance.resize(num_independent_variables, 0); + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variable_importance[i] += variable_importance_threads[j][i]; + } + } + variable_importance_threads.clear(); } - variable_importance_threads.clear(); - } #endif - // Divide importance by number of trees - if (importance_mode == IMP_GINI) { - for (auto& v : variable_importance) { - v /= num_trees; + // Divide importance by number of trees + if (importance_mode == IMP_GINI) { + for (auto& v : variable_importance) { + v /= num_trees; + } } - } } void Forest::predict() { - // Predict trees in multiple threads and join the threads with the main thread + // Predict trees in multiple threads and join the threads with the main thread #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->predict(data, false); - progress++; - showProgress("Predicting..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->predict(data, false); + progress++; + showProgress("Predicting..", start_time, lap_time); + } #else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); - for (uint i = 0; i < num_threads; ++i) { - threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, false)); - } - showProgress("Predicting.."); - for (auto &thread : threads) { - thread.join(); - } + std::vector threads; + threads.reserve(num_threads); + for (uint i = 0; i < num_threads; ++i) { + 
threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, false)); + } + showProgress("Predicting.."); + for (auto &thread : threads) { + thread.join(); + } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif #endif - // Call special functions for subclasses - predictInternal(); + // Call special functions for subclasses + predictInternal(); } double Forest::makePrediction(int i) const { @@ -526,11 +541,11 @@ double Forest::makePrediction(int i) const { double stdev = std::sqrt(sq_sum / predictions[i].size() - mean * mean); std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(rd()); // values near the mean are the most likely // standard deviation affects the dispersion of generated values from the mean - std::normal_distribution ndist(mean,stdev); + std::normal_distribution ndist(mean, stdev); // Return the new prediction based on the distribution return std::abs(std::round(ndist(gen))); @@ -538,355 +553,358 @@ double Forest::makePrediction(int i) const { void Forest::computePredictionError() { - // Predict trees in multiple threads + // Predict trees in multiple threads #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->predict(data, true); - progress++; - showProgress("Predicting..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->predict(data, true); + progress++; + showProgress("Predicting..", start_time, lap_time); + } #else - std::vector threads; - threads.reserve(num_threads); - for (uint i = 0; i < num_threads; ++i) { - threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, true)); - } - showProgress("Computing prediction error.."); - for (auto &thread : threads) { - 
thread.join(); - } + std::vector threads; + threads.reserve(num_threads); + for (uint i = 0; i < num_threads; ++i) { + threads.push_back(std::thread(&Forest::predictTreesInThread, this, i, data, true)); + } + showProgress("Computing prediction error.."); + for (auto &thread : threads) { + thread.join(); + } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif #endif - // Call special function for subclasses - computePredictionErrorInternal(); + // Call special function for subclasses + computePredictionErrorInternal(); } void Forest::computePermutationImportance() { - // Compute tree permutation importance in multiple threads + // Compute tree permutation importance in multiple threads #ifdef WIN_R_BUILD - progress = 0; - clock_t start_time = clock(); - clock_t lap_time = clock(); - - // Initailize importance and variance - variable_importance.resize(num_independent_variables, 0); - std::vector variance; - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - variance.resize(num_independent_variables, 0); - } - - // Compute importance - for (size_t i = 0; i < num_trees; ++i) { - trees[i]->computePermutationImportance(&variable_importance, &variance); - progress++; - showProgress("Computing permutation importance..", start_time, lap_time); - } + progress = 0; + clock_t start_time = clock(); + clock_t lap_time = clock(); + + // Initailize importance and variance + variable_importance.resize(num_independent_variables, 0); + std::vector variance; + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + variance.resize(num_independent_variables, 0); + } + + // Compute importance + for (size_t i = 0; i < num_trees; ++i) { + trees[i]->computePermutationImportance(&variable_importance, &variance); + progress++; + showProgress("Computing permutation importance..", start_time, lap_time); + } 
#else - progress = 0; + progress = 0; #ifdef R_BUILD - aborted = false; - aborted_threads = 0; + aborted = false; + aborted_threads = 0; #endif - std::vector threads; - threads.reserve(num_threads); + std::vector threads; + threads.reserve(num_threads); - // Initailize importance and variance - std::vector> variable_importance_threads(num_threads); - std::vector> variance_threads(num_threads); + // Initailize importance and variance + std::vector> variable_importance_threads(num_threads); + std::vector> variance_threads(num_threads); - // Compute importance - for (uint i = 0; i < num_threads; ++i) { - variable_importance_threads[i].resize(num_independent_variables, 0); - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - variance_threads[i].resize(num_independent_variables, 0); + // Compute importance + for (uint i = 0; i < num_threads; ++i) { + variable_importance_threads[i].resize(num_independent_variables, 0); + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + variance_threads[i].resize(num_independent_variables, 0); + } + threads.push_back( + std::thread(&Forest::computeTreePermutationImportanceInThread, this, i, &(variable_importance_threads[i]), + &(variance_threads[i]))); + } + showProgress("Computing permutation importance.."); + for (auto &thread : threads) { + thread.join(); } - threads.push_back( - std::thread(&Forest::computeTreePermutationImportanceInThread, this, i, &(variable_importance_threads[i]), - &(variance_threads[i]))); - } - showProgress("Computing permutation importance.."); - for (auto &thread : threads) { - thread.join(); - } #ifdef R_BUILD - if (aborted_threads > 0) { - throw std::runtime_error("User interrupt."); - } + if (aborted_threads > 0) { + throw std::runtime_error("User interrupt."); + } #endif - // Sum thread importances - variable_importance.resize(num_independent_variables, 0); - for (size_t i = 0; i < num_independent_variables; ++i) { - for (uint j = 0; j < 
num_threads; ++j) { - variable_importance[i] += variable_importance_threads[j][i]; + // Sum thread importances + variable_importance.resize(num_independent_variables, 0); + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variable_importance[i] += variable_importance_threads[j][i]; + } } - } - variable_importance_threads.clear(); + variable_importance_threads.clear(); - // Sum thread variances - std::vector variance(num_independent_variables, 0); - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - for (size_t i = 0; i < num_independent_variables; ++i) { - for (uint j = 0; j < num_threads; ++j) { - variance[i] += variance_threads[j][i]; - } + // Sum thread variances + std::vector variance(num_independent_variables, 0); + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + for (size_t i = 0; i < num_independent_variables; ++i) { + for (uint j = 0; j < num_threads; ++j) { + variance[i] += variance_threads[j][i]; + } + } + variance_threads.clear(); } - variance_threads.clear(); - } #endif - for (size_t i = 0; i < variable_importance.size(); ++i) { - variable_importance[i] /= num_trees; + for (size_t i = 0; i < variable_importance.size(); ++i) { + variable_importance[i] /= num_trees; - // Normalize by variance for scaled permutation importance - if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { - if (variance[i] != 0) { - variance[i] = variance[i] / num_trees - variable_importance[i] * variable_importance[i]; - variable_importance[i] /= sqrt(variance[i] / num_trees); - } + // Normalize by variance for scaled permutation importance + if (importance_mode == IMP_PERM_BREIMAN || importance_mode == IMP_PERM_LIAW) { + if (variance[i] != 0) { + variance[i] = variance[i] / num_trees - variable_importance[i] * variable_importance[i]; + variable_importance[i] /= sqrt(variance[i] / num_trees); + } + } } - } } #ifndef WIN_R_BUILD + void 
Forest::growTreesInThread(uint thread_idx, std::vector* variable_importance) { - if (thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->grow(variable_importance); + if (thread_ranges.size() > thread_idx + 1) { + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->grow(variable_importance); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + condition_variable.notify_one(); + } } - } } void Forest::predictTreesInThread(uint thread_idx, const Data* prediction_data, bool oob_prediction) { if (thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->predict(prediction_data, oob_prediction); + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->predict(prediction_data, oob_prediction); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + condition_variable.notify_one(); + } } - } } void 
Forest::computeTreePermutationImportanceInThread(uint thread_idx, std::vector* importance, - std::vector* variance) { - if (thread_ranges.size() > thread_idx + 1) { - for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { - trees[i]->computePermutationImportance(importance, variance); + std::vector* variance) { + if (thread_ranges.size() > thread_idx + 1) { + for (size_t i = thread_ranges[thread_idx]; i < thread_ranges[thread_idx + 1]; ++i) { + trees[i]->computePermutationImportance(importance, variance); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (aborted) { - std::unique_lock lock(mutex); - ++aborted_threads; - condition_variable.notify_one(); - return; - } + if (aborted) { + std::unique_lock lock(mutex); + ++aborted_threads; + condition_variable.notify_one(); + return; + } #endif - // Increase progress by 1 tree - std::unique_lock lock(mutex); - ++progress; - condition_variable.notify_one(); + // Increase progress by 1 tree + std::unique_lock lock(mutex); + ++progress; + condition_variable.notify_one(); + } } - } } #endif void Forest::loadFromFile(std::string filename) { - if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." << std::endl; + if (verbose_out) *verbose_out << "Loading forest from file " << filename << "." 
<< std::endl; - // Open file for reading - std::ifstream infile; - infile.open(filename, std::ios::binary); - if (!infile.good()) { - throw std::runtime_error("Could not read from input file: " + filename + "."); - } + // Open file for reading + std::ifstream infile; + infile.open(filename, std::ios::binary); + if (!infile.good()) { + throw std::runtime_error("Could not read from input file: " + filename + "."); + } - // Read dependent_varID and num_trees - infile.read((char*) &dependent_varID, sizeof(dependent_varID)); - infile.read((char*) &num_trees, sizeof(num_trees)); + // Read dependent_varID and num_trees + infile.read((char*) &dependent_varID, sizeof (dependent_varID)); + infile.read((char*) &num_trees, sizeof (num_trees)); - // Read is_ordered_variable - readVector1D(is_ordered_variable, infile); + // Read is_ordered_variable + readVector1D(is_ordered_variable, infile); - // Read tree data. This is different for tree types -> virtual function - loadFromFileInternal(infile); + // Read tree data. 
This is different for tree types -> virtual function + loadFromFileInternal(infile); - infile.close(); + infile.close(); - // Create thread ranges - equalSplit(thread_ranges, 0, num_trees - 1, num_threads); + // Create thread ranges + equalSplit(thread_ranges, 0, num_trees - 1, num_threads); } -void Forest::setSplitWeightVector(std::vector>& split_select_weights) { - - // Size should be 1 x num_independent_variables or num_trees x num_independent_variables - if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) { - throw std::runtime_error("Size of split select weights not equal to 1 or number of trees."); - } - - // Reserve space - if (split_select_weights.size() == 1) { - this->split_select_weights[0].resize(num_independent_variables); - } else { - this->split_select_weights.clear(); - this->split_select_weights.resize(num_trees, std::vector(num_independent_variables)); - } - this->split_select_varIDs.resize(num_independent_variables); - deterministic_varIDs.reserve(num_independent_variables); - - // Split up in deterministic and weighted variables, ignore zero weights - for (size_t i = 0; i < split_select_weights.size(); ++i) { +void Forest::setSplitWeightVector(std::vector>&split_select_weights) { // Size should be 1 x num_independent_variables or num_trees x num_independent_variables - if (split_select_weights[i].size() != num_independent_variables) { - throw std::runtime_error("Number of split select weights not equal to number of independent variables."); + if (split_select_weights.size() != 1 && split_select_weights.size() != num_trees) { + throw std::runtime_error("Size of split select weights not equal to 1 or number of trees."); } - for (size_t j = 0; j < split_select_weights[i].size(); ++j) { - double weight = split_select_weights[i][j]; + // Reserve space + if (split_select_weights.size() == 1) { + this->split_select_weights[0].resize(num_independent_variables); + } else { + this->split_select_weights.clear(); + 
this->split_select_weights.resize(num_trees, std::vector(num_independent_variables)); + } + this->split_select_varIDs.resize(num_independent_variables); + deterministic_varIDs.reserve(num_independent_variables); - if (i == 0) { - size_t varID = j; - for (auto& skip : no_split_variables) { - if (varID >= skip) { - ++varID; - } - } + // Split up in deterministic and weighted variables, ignore zero weights + for (size_t i = 0; i < split_select_weights.size(); ++i) { - if (weight == 1) { - deterministic_varIDs.push_back(varID); - } else if (weight < 1 && weight > 0) { - this->split_select_varIDs[j] = varID; - this->split_select_weights[i][j] = weight; - } else if (weight < 0 || weight > 1) { - throw std::runtime_error("One or more split select weights not in range [0,1]."); + // Size should be 1 x num_independent_variables or num_trees x num_independent_variables + if (split_select_weights[i].size() != num_independent_variables) { + throw std::runtime_error("Number of split select weights not equal to number of independent variables."); } - } else { - if (weight < 1 && weight > 0) { - this->split_select_weights[i][j] = weight; - } else if (weight < 0 || weight > 1) { - throw std::runtime_error("One or more split select weights not in range [0,1]."); + for (size_t j = 0; j < split_select_weights[i].size(); ++j) { + double weight = split_select_weights[i][j]; + + if (i == 0) { + size_t varID = j; + for (auto& skip : no_split_variables) { + if (varID >= skip) { + ++varID; + } + } + + if (weight == 1) { + deterministic_varIDs.push_back(varID); + } else if (weight < 1 && weight > 0) { + this->split_select_varIDs[j] = varID; + this->split_select_weights[i][j] = weight; + } else if (weight < 0 || weight > 1) { + throw std::runtime_error("One or more split select weights not in range [0,1]."); + } + + } else { + if (weight < 1 && weight > 0) { + this->split_select_weights[i][j] = weight; + } else if (weight < 0 || weight > 1) { + throw std::runtime_error("One or more split 
select weights not in range [0,1]."); + } + } } - } } - } - if (deterministic_varIDs.size() > this->mtry) { - throw std::runtime_error("Number of ones in split select weights cannot be larger than mtry."); - } - if (deterministic_varIDs.size() + split_select_varIDs.size() < mtry) { - throw std::runtime_error("Too many zeros in split select weights. Need at least mtry variables to split at."); - } + if (deterministic_varIDs.size() > this->mtry) { + throw std::runtime_error("Number of ones in split select weights cannot be larger than mtry."); + } + if (deterministic_varIDs.size() + split_select_varIDs.size() < mtry) { + throw std::runtime_error("Too many zeros in split select weights. Need at least mtry variables to split at."); + } } void Forest::setAlwaysSplitVariables(std::vector& always_split_variable_names) { - deterministic_varIDs.reserve(num_independent_variables); + deterministic_varIDs.reserve(num_independent_variables); - for (auto& variable_name : always_split_variable_names) { - size_t varID = data->getVariableID(variable_name); - deterministic_varIDs.push_back(varID); - } + for (auto& variable_name : always_split_variable_names) { + size_t varID = data->getVariableID(variable_name); + deterministic_varIDs.push_back(varID); + } - if (deterministic_varIDs.size() + this->mtry > num_independent_variables) { - throw std::runtime_error( - "Number of variables to be always considered for splitting plus mtry cannot be larger than number of independent variables."); - } + if (deterministic_varIDs.size() + this->mtry > num_independent_variables) { + throw std::runtime_error( + "Number of variables to be always considered for splitting plus mtry cannot be larger than number of independent variables."); + } } #ifdef WIN_R_BUILD + void Forest::showProgress(std::string operation, clock_t start_time, clock_t& lap_time) { - // Check for user interrupt - if (checkInterrupt()) { - throw std::runtime_error("User interrupt."); - } - - double elapsed_time = (clock() - 
lap_time) / CLOCKS_PER_SEC; - if (elapsed_time > STATUS_INTERVAL) { - double relative_progress = (double) progress / (double) num_trees; - double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC; - uint remaining_time = (1 / relative_progress - 1) * time_from_start; - *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " - << beautifyTime(remaining_time) << "." << std::endl; - lap_time = clock(); - } + // Check for user interrupt + if (checkInterrupt()) { + throw std::runtime_error("User interrupt."); + } + + double elapsed_time = (clock() - lap_time) / CLOCKS_PER_SEC; + if (elapsed_time > STATUS_INTERVAL) { + double relative_progress = (double) progress / (double) num_trees; + double time_from_start = (clock() - start_time) / CLOCKS_PER_SEC; + uint remaining_time = (1 / relative_progress - 1) * time_from_start; + *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " + << beautifyTime(remaining_time) << "." 
<< std::endl; + lap_time = clock(); + } } #else + void Forest::showProgress(std::string operation) { - using std::chrono::steady_clock; - using std::chrono::duration_cast; - using std::chrono::seconds; + using std::chrono::steady_clock; + using std::chrono::duration_cast; + using std::chrono::seconds; - steady_clock::time_point start_time = steady_clock::now(); - steady_clock::time_point last_time = steady_clock::now(); - std::unique_lock lock(mutex); + steady_clock::time_point start_time = steady_clock::now(); + steady_clock::time_point last_time = steady_clock::now(); + std::unique_lock lock(mutex); - // Wait for message from threads and show output if enough time elapsed - while (progress < num_trees) { - condition_variable.wait(lock); - seconds elapsed_time = duration_cast(steady_clock::now() - last_time); + // Wait for message from threads and show output if enough time elapsed + while (progress < num_trees) { + condition_variable.wait(lock); + seconds elapsed_time = duration_cast(steady_clock::now() - last_time); - // Check for user interrupt + // Check for user interrupt #ifdef R_BUILD - if (!aborted && checkInterrupt()) { - aborted = true; - } - if (aborted && aborted_threads >= num_threads) { - return; - } + if (!aborted && checkInterrupt()) { + aborted = true; + } + if (aborted && aborted_threads >= num_threads) { + return; + } #endif - if (progress > 0 && elapsed_time.count() > STATUS_INTERVAL) { - double relative_progress = (double) progress / (double) num_trees; - seconds time_from_start = duration_cast(steady_clock::now() - start_time); - uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " - << beautifyTime(remaining_time) << "." 
<< std::endl; - last_time = steady_clock::now(); + if (progress > 0 && elapsed_time.count() > STATUS_INTERVAL) { + double relative_progress = (double) progress / (double) num_trees; + seconds time_from_start = duration_cast(steady_clock::now() - start_time); + uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); + *verbose_out << operation << " Progress: " << round(100 * relative_progress) << "%. Estimated remaining time: " + << beautifyTime(remaining_time) << "." << std::endl; + last_time = steady_clock::now(); + } } - } } #endif diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 9934bfca..00be2846 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -110,8 +110,8 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { oobe.push_back(l->getOverallPredictionError()); cout << "OOBE: " << l->getOverallPredictionError() << endl; - l->setData(unlabelled); l->setPredictionMode(true); + l->setData(unlabelled, "Genuine", "", catVars); l->run(verbose); @@ -129,8 +129,8 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { // unlabelled data with the current model if (improved && !first) { if (verbose) cout << "Making predictions on the unlabelled set using current model" << endl; - u->setData(all); u->setPredictionMode(true); + u->setData(all, "Genuine", "", catVars); u->run(verbose); if (verbose) cout << "Made predictions." 
<< endl; } diff --git a/src/train.cc b/src/train.cc index f1c30c27..a2db9a94 100644 --- a/src/train.cc +++ b/src/train.cc @@ -79,9 +79,9 @@ void portcullis::Train::testInstance(shared_ptr f, const JunctionList& x // Convert testing set junctions into feature vector ModelFeatures mf; Data* testingData = mf.juncs2FeatureVectors(x); - - f->setPredictionMode(true); - f->setData(testingData); + vector catvars; + f->setPredictionMode(true); + f->setData(testingData, "Genuine", "", catvars); f->run(false); delete testingData; From faff5cfa7d757767f38f8ffc58caf323f3e4a054 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 13 May 2016 18:18:32 +0100 Subject: [PATCH 12/21] Implemented undersampling and ENN routines to help try to balance data. Still need to implement oversamplers such as SMOTE or ICOTE. --- deps/ranger-0.3.8/include/ranger/Data.h | 5 + deps/ranger-0.3.8/include/ranger/DataChar.h | 2 + deps/ranger-0.3.8/include/ranger/DataDouble.h | 2 +- lib/Makefile.am | 4 +- lib/include/portcullis/ml/balancing.hpp | 51 +++++ lib/include/portcullis/ml/enn.hpp | 95 +++++++++ lib/include/portcullis/ml/model_features.hpp | 5 + lib/include/portcullis/ml/ss_forest.hpp | 2 - lib/src/enn.cc | 143 ++++++++++++++ lib/src/model_features.cc | 187 ++++++++++++------ lib/src/ss_forest.cc | 101 ++-------- src/junction_filter.cc | 69 +++++-- src/junction_filter.hpp | 4 + 13 files changed, 511 insertions(+), 159 deletions(-) create mode 100644 lib/include/portcullis/ml/balancing.hpp create mode 100644 lib/include/portcullis/ml/enn.hpp create mode 100644 lib/src/enn.cc diff --git a/deps/ranger-0.3.8/include/ranger/Data.h b/deps/ranger-0.3.8/include/ranger/Data.h index e918c6d7..cc6d8c64 100644 --- a/deps/ranger-0.3.8/include/ranger/Data.h +++ b/deps/ranger-0.3.8/include/ranger/Data.h @@ -43,6 +43,11 @@ class Data { virtual ~Data(); virtual double get(size_t row, size_t col) const = 0; + void getRow(size_t row, std::vector& output) const { + for(size_t col = 0; col < num_cols; col++) { 
+ output.push_back(get(row, col)); + } + } size_t getVariableID(std::string variable_name); diff --git a/deps/ranger-0.3.8/include/ranger/DataChar.h b/deps/ranger-0.3.8/include/ranger/DataChar.h index 1af193ac..be428870 100644 --- a/deps/ranger-0.3.8/include/ranger/DataChar.h +++ b/deps/ranger-0.3.8/include/ranger/DataChar.h @@ -53,6 +53,8 @@ class DataChar : public Data { return (((sparse_data[idx / 4] & mask[idx % 4]) >> offset[idx % 4]) - 1); } } + + void reserveMemoryInternal() { data = new char[num_cols * num_rows]; diff --git a/deps/ranger-0.3.8/include/ranger/DataDouble.h b/deps/ranger-0.3.8/include/ranger/DataDouble.h index 27c6c967..eaed6376 100644 --- a/deps/ranger-0.3.8/include/ranger/DataDouble.h +++ b/deps/ranger-0.3.8/include/ranger/DataDouble.h @@ -53,7 +53,7 @@ class DataDouble : public Data { return result; } } - + void reserveMemoryInternal() { data = new double[num_cols * num_rows]; } diff --git a/lib/Makefile.am b/lib/Makefile.am index 1172ae1c..66ba7c6a 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -23,7 +23,8 @@ libportcullis_la_SOURCES = \ src/junction_system.cc \ src/rule_parser.cc \ src/performance.cc \ - src/ss_forest.cc + src/ss_forest.cc \ + src/enn.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis PI = include/portcullis @@ -38,6 +39,7 @@ library_include_HEADERS = \ $(PI)/ml/model_features.hpp \ $(PI)/ml/performance.hpp \ $(PI)/ml/k_fold.hpp \ + $(PI)/ml/enn.hpp \ $(PI)/ml/ss_forest.hpp \ $(PI)/kmer.hpp \ $(PI)/python_exe.hpp \ diff --git a/lib/include/portcullis/ml/balancing.hpp b/lib/include/portcullis/ml/balancing.hpp new file mode 100644 index 00000000..e9c03ab7 --- /dev/null +++ b/lib/include/portcullis/ml/balancing.hpp @@ -0,0 +1,51 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info BalancingErrorInfo; +struct BalacingException: virtual boost::exception, virtual std::exception { }; + +class UnderSampler : UnbalancedDataset { + +}; + +/** + * Object to perform classification on balanced ensembled selected from random sampling. + * + * It is based on the idea presented in the paper "Exploratory Undersampling + * Class-Imbalance Learning" by Liu et al. + */ +class EasyEnsemble : public UnderSampler { +private: + uint16_t nb_subsets; + + + +public: + + EasyEnsemble() { + + } +}; +} +} \ No newline at end of file diff --git a/lib/include/portcullis/ml/enn.hpp b/lib/include/portcullis/ml/enn.hpp new file mode 100644 index 00000000..b82dec25 --- /dev/null +++ b/lib/include/portcullis/ml/enn.hpp @@ -0,0 +1,95 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + + +#pragma once + +#include +#include +#include +using std::string; +using std::thread; +using std::vector; + +#include +#include +using boost::timer::auto_cpu_timer; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info ENNErrorInfo; +struct ENNException: virtual boost::exception, virtual std::exception { }; + +/** + * An parallel implementation of Wilson's Edited nearest neighbour algorithm. + * Uses KNN to identify nearest neighbours for each sample, then marks entries + * to be removed or to be kept if more than half of the KNNs come from a different + * class. 
+ * KNN logic originally derived from OpenCV, modified to work with ranger data + * types + */ +class ENN { +protected: + uint16_t k; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + vector labels; + + void doSlice( uint16_t slice, vector& result ) const; + +public: + + ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels); + + int getK() const { + return k; + } + + void setK(int k) { + this->k = k; + } + + int getThreads() const { + return threads; + } + + void setThreads(int threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + + uint32_t execute( vector& results ) const; +}; + +} +} diff --git a/lib/include/portcullis/ml/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp index 4efe224c..89edd12f 100644 --- a/lib/include/portcullis/ml/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -67,6 +67,9 @@ struct Feature { class ModelFeatures { private: size_t fi; +protected: + void setRow(Data* d, size_t row, JunctionPtr j, bool labelled); + public: uint32_t L95; KmerMarkovModel exonModel; @@ -114,6 +117,8 @@ class ModelFeatures { void trainSplicingModels(const JunctionList& pass, const JunctionList& fail); Data* juncs2FeatureVectors(const JunctionList& x); + Data* juncs2FeatureVectors(const JunctionList& xl, const JunctionList& xu); + ForestPtr trainInstance(const JunctionList& x, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); diff --git a/lib/include/portcullis/ml/ss_forest.hpp b/lib/include/portcullis/ml/ss_forest.hpp index 3ff28478..14d3d444 100644 --- a/lib/include/portcullis/ml/ss_forest.hpp +++ b/lib/include/portcullis/ml/ss_forest.hpp @@ -61,8 +61,6 @@ class SemiSupervisedForest { string outputPrefix; double contribution; // How much of a contribution unlabelled data should make to the 
decision making - double makePrediction(ForestPtr l, ForestPtr u, int i) const; - public: SemiSupervisedForest(ModelFeatures& _mf, const JunctionList& labelled, const JunctionList& unlabelled, diff --git a/lib/src/enn.cc b/lib/src/enn.cc new file mode 100644 index 00000000..0d95045a --- /dev/null +++ b/lib/src/enn.cc @@ -0,0 +1,143 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include + +portcullis::ml::ENN::ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels) { + if (_rows != _labels.size()) { + BOOST_THROW_EXCEPTION(ENNException() << ENNErrorInfo(string( + "The supplied number of rows does not match the number of labels"))); + } + + data = _data; + rows = _rows; + cols = _cols; + labels = _labels; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + threads = _threads; + verbose = false; +} + +void portcullis::ml::ENN::doSlice(uint16_t slice, vector& result) const { + + // Get coordinates of entries to search through in this slice + uint32_t slice_size = rows / threads; + uint32_t start = slice_size * slice; + uint32_t end = (slice_size * slice) + slice_size - 1; + if (slice == threads - 1 && end <= rows) { + end = rows - 1; // Make sure we get the last few entries + } + uint32_t testcount = end - start + 1; + + vector dbuf(testcount*k, std::numeric_limits::max()); + vector rbuf(testcount*k, 0); + + // Get feature space values for KNN for each sample in slice + for (uint32_t baseidx = 0; baseidx < rows; baseidx++) { + for (uint32_t testidx = start; testidx <= end; testidx++) { + + const uint32_t ri = (testidx - start) * k; + + // Get sum of squared differences (no need to do the sqrt to get + // the Euclidean distance... this saves about 20% runtime) + double s = 0.0; + for (uint16_t i = 0; i < cols; i++) { + s += std::pow(data[baseidx * cols + i] - data[testidx * cols + i], 2.0); + } + + // Find position to add entry... + int i = k; + for (; i > 0; i--) + if (s >= dbuf[ri + i - 1]) + break; + + // ... 
or skip if not if this pair is not in the current set of KNN + if (i >= k) + continue; + + // Shift back entries after i + for (int j = k - 2; j >= i; j--) { + dbuf[ri + j + 1] = dbuf[ri + j]; + rbuf[ri + j + 1] = rbuf[ri + j]; + } + + // Record KNN and index + dbuf[ri + i] = s; + rbuf[ri + i] = baseidx; + } + } + + // Mark for keeping or removal + const uint16_t threshold = k / 2; + for (uint32_t testidx = start; testidx <= end; testidx++) { + + bool pos = labels[testidx]; + uint16_t pos_count = 0; + uint16_t neg_count = 0; + for (size_t i = 0; i < k; i++) { + uint32_t index = rbuf[((testidx - start) * k) + i]; + if (labels[index]) { + pos_count++; + } else { + neg_count++; + } + } + + result[testidx] = ((pos && pos_count > threshold) || (!pos && neg_count > threshold)); + } +} + +uint32_t portcullis::ml::ENN::execute(vector& results) const { + + auto_cpu_timer timer(1, " Time taken: %ws\n\n"); + + if (verbose) { + cout << "Performing Wilson's Edited Nearest Neighbour (ENN) ..."; + cout.flush(); + } + + results.clear(); + results.resize(rows); + std::fill(results.begin(), results.end(), false); + + vector t(threads); + for (uint16_t i = 0; i < threads; i++) { + t[i] = thread(&ENN::doSlice, this, i, std::ref(results)); + } + for (uint16_t i = 0; i < threads; i++) { + t[i].join(); + } + + uint32_t discard = 0; + for (auto r : results) { + if (!r) discard++; + } + uint32_t keep = results.size() - discard; + + if (verbose) { + cout << "done." << endl << "Marked " << keep << " to be kept and " << discard << " to be discarded." 
<< endl; + } + + return discard; +} diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 258b35da..112ccbc4 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -28,6 +28,9 @@ using std::make_shared; #include #include +#include +using portcullis::ml::ENN; + #include using portcullis::Junction; @@ -167,6 +170,63 @@ void portcullis::ml::ModelFeatures::trainSplicingModels(const JunctionList& pass acceptorFModel.train(acceptors, 5); } +void portcullis::ml::ModelFeatures::setRow(Data* d, size_t row, JunctionPtr j, bool labelled) { + SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, + donorPWModel, acceptorPWModel); + + bool error = false; + d->set(0, row, j->isGenuine(), error); + + uint16_t i = 1; + + if (features[1].active) { + d->set(i++, row, j->getNbUniquelySplicedReads(), error); + } + if (features[2].active) { + d->set(i++, row, j->getNbDistinctAlignments(), error); + } + if (features[3].active) { + d->set(i++, row, j->getNbReliableAlignments(), error); + } + if (features[4].active) { + d->set(i++, row, j->getEntropy(), error); + } + if (features[5].active) { + d->set(i++, row, j->getReliable2RawRatio(), error); + } + if (features[6].active) { + d->set(i++, row, j->getMaxMinAnchor(), error); + } + if (features[7].active) { + d->set(i++, row, j->getMaxMMES(), error); + } + if (features[8].active) { + d->set(i++, row, j->getMeanMismatches(), error); + } + if (features[9].active) { + d->set(i++, row, L95 == 0 ? 0.0 : j->calcIntronScore(L95), error); + } + if (features[10].active) { + d->set(i++, row, std::min(j->getHammingDistance5p(), j->getHammingDistance3p()), error); + } + if (features[11].active) { + d->set(i++, row, isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel), error); + } + if (features[12].active) { + d->set(i++, row, isPWModelEmpty() ? 
0.0 : ss.positionWeighting, error); + } + if (features[13].active) { + d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.splicingSignal, error); + } + + //Junction overhang values at each position are first converted into deviation from expected distributions + for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { + if (features[joi + 14].active) { + d->set(i++, row, j->getJunctionOverhangLogDeviation(joi), error); + } + } +} + Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) { vector headers; @@ -180,65 +240,37 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& x) Data* d = new DataDouble(headers, x.size(), headers.size()); uint32_t row = 0; - for (const auto& j : x) { - SplicingScores ss = j->calcSplicingScores(gmap, donorTModel, donorFModel, acceptorTModel, acceptorFModel, - donorPWModel, acceptorPWModel); - - bool error = false; - d->set(0, row, j->isGenuine(), error); - - uint16_t i = 1; + for (const auto& j : x) { + setRow(d, row, j, true); + row++; + } + + return d; +} + +Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& xl, const JunctionList& xu) { - if (features[1].active) { - d->set(i++, row, j->getNbUniquelySplicedReads(), error); - } - if (features[2].active) { - d->set(i++, row, j->getNbDistinctAlignments(), error); - } - if (features[3].active) { - d->set(i++, row, j->getNbReliableAlignments(), error); - } - if (features[4].active) { - d->set(i++, row, j->getEntropy(), error); - } - if (features[5].active) { - d->set(i++, row, j->getReliable2RawRatio(), error); - } - if (features[6].active) { - d->set(i++, row, j->getMaxMinAnchor(), error); - } - if (features[7].active) { - d->set(i++, row, j->getMaxMMES(), error); - } - if (features[8].active) { - d->set(i++, row, j->getMeanMismatches(), error); - } - if (features[9].active) { - d->set(i++, row, L95 == 0 ? 
0.0 : j->calcIntronScore(L95), error); - } - if (features[10].active) { - d->set(i++, row, std::min(j->getHammingDistance5p(), j->getHammingDistance3p()), error); - } - if (features[11].active) { - d->set(i++, row, isCodingPotentialModelEmpty() ? 0.0 : j->calcCodingPotential(gmap, exonModel, intronModel), error); - } - if (features[12].active) { - d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.positionWeighting, error); - } - if (features[13].active) { - d->set(i++, row, isPWModelEmpty() ? 0.0 : ss.splicingSignal, error); - } - - //Junction overhang values at each position are first converted into deviation from expected distributions - for(size_t joi = 0; joi < JO_NAMES.size(); joi++) { - if (features[joi + 14].active) { - d->set(i++, row, j->getJunctionOverhangLogDeviation(joi), error); - } + vector headers; + for(auto& f : features) { + if (f.active) { + headers.push_back(f.name); } - + } + + // Convert junction list info to double* + Data* d = new DataDouble(headers, xl.size() + xu.size(), headers.size()); + + uint32_t row = 0; + for (const auto& j : xl) { + setRow(d, row, j, true); row++; } + for (const auto& j : xu) { + setRow(d, row, j, false); + row++; + } + return d; } @@ -250,6 +282,47 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun if (verbose) cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); + size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); + double* m = new double[elements]; + for( uint32_t baseidx = 0; baseidx < trainingData->getNumRows(); baseidx++ ) { + double* r = &m[baseidx * (trainingData->getNumCols()-1)]; + for( size_t c = 1; c < trainingData->getNumCols(); c++) { + r[c - 1] = trainingData->get(baseidx, c); + //cout << r[c-1]; + } + //cout << endl; + } + vector labels; + for(auto& j : x) { + labels.push_back(j->isGenuine()); + } + cout << "Created data matrix" << endl; + + ENN enn(10, threads, m, trainingData->getNumRows(), 
trainingData->getNumCols()-1, labels); + enn.setVerbose(true); + vector results; + uint32_t count = enn.execute(results); + + delete[] m; + + uint32_t pcount = 0, ncount = 0; + JunctionList x2; + for(size_t i = 0; i < x.size(); i++) { + if (x[i]->isGenuine() && !results[i]) { + pcount++; + } + else if (!x[i]->isGenuine() && !results[i]) { + ncount++; + } + else { + x2.push_back(x[i]); + } + } + + Data* trainingData2 = juncs2FeatureVectors(x2); + + cout << "Should discard " << pcount << " + entries and " << ncount << " - entries (" << count << ")" << endl << endl; + path feature_file = outputPrefix + ".features"; if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; @@ -276,10 +349,10 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun f->init( "Genuine", // Dependant variable name MEM_DOUBLE, // Memory mode - trainingData, // Data object + trainingData2, // Data object 0, // M Try (0 == use default) outputPrefix, // Output prefix - 250, //trees, // Number of trees + trees, // Number of trees 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible threads, // Number of threads IMP_GINI, // Importance measure @@ -289,7 +362,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun true, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving - DEFAULT_SPLITRULE, // Split rule + AUC, //DEFAULT_SPLITRULE, // Split rule false, // predall 1.0); // Sample fraction diff --git a/lib/src/ss_forest.cc b/lib/src/ss_forest.cc index 00be2846..b5f0a0ba 100644 --- a/lib/src/ss_forest.cc +++ b/lib/src/ss_forest.cc @@ -47,10 +47,7 @@ portcullis::ml::SemiSupervisedForest::SemiSupervisedForest(ModelFeatures& mf, // Combine labelled and unlabelled // This is inefficient but it makes life easier... 
hopefully it doesn't take too // much memory - JunctionList _all; - _all.insert(_all.end(), _labelled.begin(), _labelled.end()); - _all.insert(_all.end(), _unlabelled.begin(), _unlabelled.end()); - all = mf.juncs2FeatureVectors(_all); + all = mf.juncs2FeatureVectors(_labelled, _unlabelled); if (verbose) cout << "Created combined FV with " << all->getNumRows() << " entries." << endl; @@ -70,7 +67,6 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { if (verbose) cout << "Initialising random forest" << endl; ForestPtr l = make_shared(); - ForestPtr u = make_shared(); vector catVars; @@ -115,6 +111,24 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { l->run(verbose); + ForestPtr u = make_shared(); + u->init( + MEM_DOUBLE, // Memory mode + 0, // M Try (0 == use default) + outputPrefix, // Output prefix + trees, // Number of trees + 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible + threads, // Number of threads + IMP_GINI, // Importance measure + DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size + false, // Prediction mode + true, // Replace + false, // Memory saving + DEFAULT_SPLITRULE, // Split rule + true, // predall + 1.0); // Sample fraction + u->setData(all, "Genuine", "", catVars); + // Loop until no improvement using deterministic annealing bool initOnly = false; @@ -130,7 +144,6 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { if (improved && !first) { if (verbose) cout << "Making predictions on the unlabelled set using current model" << endl; u->setPredictionMode(true); - u->setData(all, "Genuine", "", catVars); u->run(verbose); if (verbose) cout << "Made predictions." 
<< endl; } @@ -151,27 +164,7 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { } if (verbose) cout << "Re-training newly labelled data" << endl; - u = make_shared(); - u->init( - "Genuine", // Dependant variable name - MEM_DOUBLE, // Memory mode - all, // Data object - 0, // M Try (0 == use default) - outputPrefix + "_" + std::to_string(it), // Output prefix - trees, // Number of trees - 1236456789, // Use fixed seed to avoid non-deterministic behaviour as much as possible - threads, // Number of threads - IMP_GINI, // Importance measure - DEFAULT_MIN_NODE_SIZE_PROBABILITY, // Min node size - "", // Status var name - false, // Prediction mode - true, // Replace - catVars, // Unordered categorical variable names (vector) - false, // Memory saving - DEFAULT_SPLITRULE, // Split rule - true, // predall - 1.0); // Sample fraction - //u->setCaseWeights(case_weights); + u->setPredictionMode(false); u->run(verbose); cout << "OOBE: " << u->getOverallPredictionError() << endl; @@ -202,12 +195,10 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { else { for(size_t j = labelled->getNumRows(); j < labelled->getNumRows()+unlabelled->getNumRows(); j++) { bool error = false; - all->set(0, j, unlabelled->get(j - labelled->getNumRows(), 0), error); + all->set(0, j, all->get(j, 0), error); } } - initOnly = false; - // Revert to the best forest ForestPtr b = make_shared(); b->init( @@ -234,53 +225,3 @@ ForestPtr portcullis::ml::SemiSupervisedForest::train() { return b; } - -double portcullis::ml::SemiSupervisedForest::makePrediction(ForestPtr lab, ForestPtr unlab, int i) const { - - const std::vector> lab_pred = lab->getPredictions(); - const std::vector> unlab_pred = unlab->getPredictions(); - - double lab_sum = std::accumulate(lab_pred[i].begin(), lab_pred[i].end(), 0.0); - double unlab_sum = std::accumulate(unlab_pred[i].begin(), unlab_pred[i].end(), 0.0); - - double weight_sum = (double)lab_pred[i].size() + ((double)unlab_pred[i].size() * contribution ); - - 
double lab_mean = lab_sum / (double)lab_pred[i].size(); - double unlab_mean = unlab_sum / (double)unlab_pred[i].size(); - double weighted_mean = (lab_sum + (unlab_sum * contribution)) / (weight_sum); - - cout << "Means: lab - " << lab_mean << "; unlab - " << unlab_mean << "; weighted - " << weighted_mean << endl; - - double contr2 = std::pow(contribution, 2); - - double lab_sq_sum = std::inner_product(lab_pred[i].begin(), lab_pred[i].end(), lab_pred[i].begin(), 0.0); - double lab_stdev = std::sqrt(lab_sq_sum / lab_pred[i].size() - lab_mean * lab_mean); - - double unlab_sq_sum = std::inner_product(unlab_pred[i].begin(), unlab_pred[i].end(), unlab_pred[i].begin(), 0.0); - double unlab_stdev = std::sqrt(unlab_sq_sum / unlab_pred[i].size() - unlab_mean * unlab_mean); - - double var = 0.0; - for(size_t j = 0; j < lab_pred[i].size(); j++) { - var += std::pow(lab_pred[i][j] - weighted_mean, 2); - } - for(size_t j = 0; j < unlab_pred[i].size(); j++) { - var += contribution * std::pow(unlab_pred[i][j] - weighted_mean, 2); - } - - var /= weight_sum; - double weighted_stdev = std::sqrt(var); - - cout << "STD dev: lab - " << lab_stdev << "; unlab - " << unlab_stdev << "; weighted - " << weighted_stdev << endl; - - - - std::random_device rd; - std::mt19937 gen(rd()); - - // values near the mean are the most likely - // standard deviation affects the dispersion of generated values from the mean - std::normal_distribution ndist(weighted_mean,weighted_stdev); - - // Return the new prediction based on the distribution - return std::abs(std::round(ndist(gen))); -} \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index cbf9934d..ab8a1018 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include using std::boolalpha; @@ -233,7 +234,8 @@ void portcullis::JunctionFilter::filter() { mf.features[26].active=false; mf.features[27].active=false; 
mf.features[28].active=false; - mf.features[29].active=false;*/ + mf.features[29].active=false; + */ @@ -256,8 +258,15 @@ void portcullis::JunctionFilter::filter() { mf.trainCodingPotentialModel(pos); mf.trainSplicingModels(pos, neg); cout << " done." << endl << endl; - - + + // Balance models for training + /*if (pos.size() > neg.size()) { + undersample(pos, neg.size()); + } + else if (pos.size() < neg.size()) { + undersample(neg, pos.size()); + } + cout << "Balanced datasets to size of smallest set: " << pos.size() << endl << endl;*/ // Build the training set by combining the positive and negative sets JunctionList training; @@ -271,9 +280,9 @@ void portcullis::JunctionFilter::filter() { cout << "Training Random Forest" << endl << "----------------------" << endl << endl; bool done = false; - //shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); - SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); - shared_ptr forest = ssf.train(); + shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + /*SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); + shared_ptr forest = ssf.train();*/ /* const vector importance = forest->getVariableImportance(); mf.resetActiveFeatureIndex(); @@ -441,6 +450,17 @@ void portcullis::JunctionFilter::filter() { } } +void portcullis::JunctionFilter::undersample(JunctionList& jl, size_t size) { + + std::mt19937 rng(12345); + + while(jl.size() > size) { + std::uniform_int_distribution gen(0, jl.size()); // uniform, unbiased + int i = gen(rng); + jl.erase(jl.begin()+i); + } +} + void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, JunctionList& 
pos, JunctionList& unlabelled, ModelFeatures& mf) { JuncResultMap resultMap; @@ -758,13 +778,8 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cout << "Initialising random forest" << endl; - shared_ptr f = nullptr; - if (train) { - f = make_shared(); - } else { - f = make_shared(); - } - + shared_ptr f = make_shared(); + vector catVars; f->init( @@ -843,12 +858,14 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction cout << "The best F1 score of " << max_f1 << " is achieved with threshold set at " << best_t_f1 << endl; cout << "The best MCC score of " << max_mcc << " is achieved with threshold set at " << best_t_mcc << endl; - cout << "Actual threshold set at " << threshold << endl; - categorise(f, all, pass, fail, threshold); - } else { - categorise(f, all, pass, fail, threshold); + //threshold = best_t_mcc; } - + + //threshold = calcGoodThreshold(f, all); + + cout << "Threshold set at " << threshold << endl; + categorise(f, all, pass, fail, threshold); + cout << "Saved junction scores to: " << scorepath << endl; @@ -866,6 +883,22 @@ void portcullis::JunctionFilter::categorise(shared_ptr f, const Junction } } +double portcullis::JunctionFilter::calcGoodThreshold(shared_ptr f, const JunctionList& all) { + + uint32_t pos = 0; + uint32_t neg = 0; + + for (auto& p : f->getPredictions()) { + if (p[0] <= 0.5) { + pos++; + } else { + neg++; + } + } + + return (double)pos / (double)(pos+neg); +} + int portcullis::JunctionFilter::main(int argc, char *argv[]) { path junctionFile; diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 8c088fd5..f6373bed 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -301,6 +301,10 @@ class JunctionFilter { void createNegativeSet(uint32_t L95, const JunctionList& all, JunctionList& neg, JunctionList& failJuncs); + double calcGoodThreshold(shared_ptr f, const JunctionList& all); + + void undersample(JunctionList& jl, size_t size); + public: 
static string helpMessage() { From 695f386a8f1b7b0dc9ff9b7caa001dc7b4c6a32c Mon Sep 17 00:00:00 2001 From: Dan Date: Sun, 15 May 2016 15:57:59 +0100 Subject: [PATCH 13/21] Will need a separate KNN implementation, so taking out the KNN logic from ENN and putting into seperate files. --- lib/Makefile.am | 2 + lib/include/portcullis/ml/knn.hpp | 100 +++++++++++++++++++++++++ lib/src/knn.cc | 119 ++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 lib/include/portcullis/ml/knn.hpp create mode 100644 lib/src/knn.cc diff --git a/lib/Makefile.am b/lib/Makefile.am index 66ba7c6a..b27bbb62 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -24,6 +24,7 @@ libportcullis_la_SOURCES = \ src/rule_parser.cc \ src/performance.cc \ src/ss_forest.cc \ + src/knn.cc \ src/enn.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis @@ -39,6 +40,7 @@ library_include_HEADERS = \ $(PI)/ml/model_features.hpp \ $(PI)/ml/performance.hpp \ $(PI)/ml/k_fold.hpp \ + $(PI)/ml/knn.hpp \ $(PI)/ml/enn.hpp \ $(PI)/ml/ss_forest.hpp \ $(PI)/kmer.hpp \ diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp new file mode 100644 index 00000000..187760ce --- /dev/null +++ b/lib/include/portcullis/ml/knn.hpp @@ -0,0 +1,100 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + + +#pragma once + +#include +#include +#include +#include +using std::string; +using std::thread; +using std::vector; +using std::shared_ptr; + +#include +#include +using boost::timer::auto_cpu_timer; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info KNNErrorInfo; +struct KNNException: virtual boost::exception, virtual std::exception { }; + +/** + * An parallel implementation of K Nearest Neighbour. + * Logic originally derived from OpenCV + */ +class KNN { +protected: + uint16_t k; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + vector>> results; + + void doSlice( uint16_t slice ); + +public: + + KNN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + int getK() const { + return k; + } + + void setK(int k) { + this->k = k; + } + + int getThreads() const { + return threads; + } + + void setThreads(int threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + const vector>>& getResults() const { + return results; + } + + const vector& getNNs(size_t index) const { + return *results[index]; + } + + void execute(); +}; + +} +} diff --git a/lib/src/knn.cc b/lib/src/knn.cc new file mode 100644 index 00000000..c645c983 --- /dev/null +++ b/lib/src/knn.cc @@ -0,0 +1,119 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +#include +using std::cout; +using std::endl; +using std::make_shared; + +#include + +portcullis::ml::KNN::KNN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + threads = _threads; + verbose = false; + results.resize(_rows); +} + +void portcullis::ml::KNN::doSlice(uint16_t slice) { + + // Get coordinates of entries to search through in this slice + uint32_t slice_size = rows / threads; + uint32_t start = slice_size * slice; + uint32_t end = (slice_size * slice) + slice_size - 1; + if (slice == threads - 1 && end <= rows) { + end = rows - 1; // Make sure we get the last few entries + } + uint32_t testcount = end - start + 1; + + vector dbuf(testcount*k, std::numeric_limits::max()); + vector rbuf(testcount*k, 0); + + // Get feature space values for KNN for each sample in slice + for (uint32_t baseidx = 0; baseidx < rows; baseidx++) { + for (uint32_t testidx = start; testidx <= end; testidx++) { + + const uint32_t ri = (testidx - start) * k; + + // Get sum of squared differences (no need to do the sqrt to get + // the Euclidean distance... this saves about 20% runtime) + double s = 0.0; + for (uint16_t i = 0; i < cols; i++) { + s += std::pow(data[baseidx * cols + i] - data[testidx * cols + i], 2.0); + } + + // Find position to add entry... + int i = k; + for (; i > 0; i--) + if (s >= dbuf[ri + i - 1]) + break; + + // ... 
or skip if not if this pair is not in the current set of KNN + if (i >= k) + continue; + + // Shift back entries after i + for (int j = k - 2; j >= i; j--) { + dbuf[ri + j + 1] = dbuf[ri + j]; + rbuf[ri + j + 1] = rbuf[ri + j]; + } + + // Record KNN and index + dbuf[ri + i] = s; + rbuf[ri + i] = baseidx; + } + } + + // Store final results + const uint16_t threshold = k / 2; + for (uint32_t testidx = start; testidx <= end; testidx++) { + shared_ptr> knn = make_shared>(k); + + for(size_t i = 0; i < k; k++) { + (*knn)[i] = rbuf[((testidx - start) * k) + i]; + } + + results[testidx] = knn; + } +} + +void portcullis::ml::KNN::execute() { + + auto_cpu_timer timer(1, " Time taken: %ws\n\n"); + + if (verbose) { + cout << "Performing K Nearest Neighbour (KNN) ..."; + cout.flush(); + } + + vector t(threads); + for (uint16_t i = 0; i < threads; i++) { + t[i] = thread(&KNN::doSlice, this, i); + } + for (uint16_t i = 0; i < threads; i++) { + t[i].join(); + } +} + From cc885d43ec3c0a60f1cef1b1dab66d036a21f427 Mon Sep 17 00:00:00 2001 From: Dan Date: Sun, 15 May 2016 20:32:33 +0100 Subject: [PATCH 14/21] Fixed KNN and ENN implementations. --- lib/include/portcullis/ml/enn.hpp | 10 ++- lib/include/portcullis/ml/knn.hpp | 4 ++ lib/src/enn.cc | 110 ++++++++---------------------- lib/src/knn.cc | 21 ++++-- 4 files changed, 55 insertions(+), 90 deletions(-) diff --git a/lib/include/portcullis/ml/enn.hpp b/lib/include/portcullis/ml/enn.hpp index b82dec25..97bc13e1 100644 --- a/lib/include/portcullis/ml/enn.hpp +++ b/lib/include/portcullis/ml/enn.hpp @@ -37,6 +37,8 @@ namespace ml { typedef boost::error_info ENNErrorInfo; struct ENNException: virtual boost::exception, virtual std::exception { }; +const uint16_t ENN_THRESHOLD = 5; + /** * An parallel implementation of Wilson's Edited nearest neighbour algorithm. 
* Uses KNN to identify nearest neighbours for each sample, then marks entries @@ -55,9 +57,7 @@ class ENN { size_t rows; size_t cols; - vector labels; - - void doSlice( uint16_t slice, vector& result ) const; + vector labels; public: @@ -86,6 +86,10 @@ class ENN { void setVerbose(bool verbose) { this->verbose = verbose; } + + uint16_t getThreshold() const { + return k / 2; + } uint32_t execute( vector& results ) const; diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp index 187760ce..e5d0922d 100644 --- a/lib/include/portcullis/ml/knn.hpp +++ b/lib/include/portcullis/ml/knn.hpp @@ -18,10 +18,12 @@ #pragma once +#include #include #include #include #include +using std::ostream; using std::string; using std::thread; using std::vector; @@ -94,6 +96,8 @@ class KNN { } void execute(); + + void print(ostream& out); }; } diff --git a/lib/src/enn.cc b/lib/src/enn.cc index 0d95045a..99b9912e 100644 --- a/lib/src/enn.cc +++ b/lib/src/enn.cc @@ -19,6 +19,9 @@ using std::cout; using std::endl; +#include +using portcullis::ml::KNN; + #include portcullis::ml::ENN::ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels) { @@ -39,104 +42,49 @@ portcullis::ml::ENN::ENN(uint16_t defaultK, uint16_t _threads, double* _data, si verbose = false; } -void portcullis::ml::ENN::doSlice(uint16_t slice, vector& result) const { - - // Get coordinates of entries to search through in this slice - uint32_t slice_size = rows / threads; - uint32_t start = slice_size * slice; - uint32_t end = (slice_size * slice) + slice_size - 1; - if (slice == threads - 1 && end <= rows) { - end = rows - 1; // Make sure we get the last few entries - } - uint32_t testcount = end - start + 1; - - vector dbuf(testcount*k, std::numeric_limits::max()); - vector rbuf(testcount*k, 0); - - // Get feature space values for KNN for each sample in slice - for (uint32_t baseidx = 0; baseidx < rows; baseidx++) { - for (uint32_t testidx = start; 
testidx <= end; testidx++) { - - const uint32_t ri = (testidx - start) * k; - - // Get sum of squared differences (no need to do the sqrt to get - // the Euclidean distance... this saves about 20% runtime) - double s = 0.0; - for (uint16_t i = 0; i < cols; i++) { - s += std::pow(data[baseidx * cols + i] - data[testidx * cols + i], 2.0); - } - - // Find position to add entry... - int i = k; - for (; i > 0; i--) - if (s >= dbuf[ri + i - 1]) - break; - - // ... or skip if not if this pair is not in the current set of KNN - if (i >= k) - continue; +uint32_t portcullis::ml::ENN::execute(vector& results) const { - // Shift back entries after i - for (int j = k - 2; j >= i; j--) { - dbuf[ri + j + 1] = dbuf[ri + j]; - rbuf[ri + j + 1] = rbuf[ri + j]; - } + auto_cpu_timer timer(1, "ENN Time taken: %ws\n\n"); - // Record KNN and index - dbuf[ri + i] = s; - rbuf[ri + i] = baseidx; - } + if (verbose) { + cout << "Starting Wilson's Edited Nearest Neighbour (ENN)" << endl; } - // Mark for keeping or removal - const uint16_t threshold = k / 2; - for (uint32_t testidx = start; testidx <= end; testidx++) { - - bool pos = labels[testidx]; + KNN knn(k, threads, data, rows, cols); + knn.setVerbose(verbose); + knn.execute(); + + if (verbose) { + cout << "Finding outliers" << endl; + } + + results.clear(); + results.resize(rows); + uint32_t discard = 0; + + const uint16_t threshold = this->getThreshold(); + + for(size_t i = 0; i < rows; i++) { uint16_t pos_count = 0; uint16_t neg_count = 0; - for (size_t i = 0; i < k; i++) { - uint32_t index = rbuf[((testidx - start) * k) + i]; + bool pos = labels[i]; + const vector nn = knn.getNNs(i); + for (size_t j = 0; j < k; j++) { + uint32_t index = nn[j]; if (labels[index]) { pos_count++; } else { neg_count++; } } - - result[testidx] = ((pos && pos_count > threshold) || (!pos && neg_count > threshold)); - } -} - -uint32_t portcullis::ml::ENN::execute(vector& results) const { - - auto_cpu_timer timer(1, " Time taken: %ws\n\n"); - - if (verbose) { 
- cout << "Performing Wilson's Edited Nearest Neighbour (ENN) ..."; - cout.flush(); + results[i] = ((pos && pos_count > threshold) || (!pos && neg_count > threshold)); + if (!results[i]) discard++; } - results.clear(); - results.resize(rows); - std::fill(results.begin(), results.end(), false); - - vector t(threads); - for (uint16_t i = 0; i < threads; i++) { - t[i] = thread(&ENN::doSlice, this, i, std::ref(results)); - } - for (uint16_t i = 0; i < threads; i++) { - t[i].join(); - } - - uint32_t discard = 0; - for (auto r : results) { - if (!r) discard++; - } uint32_t keep = results.size() - discard; if (verbose) { - cout << "done." << endl << "Marked " << keep << " to be kept and " << discard << " to be discarded." << endl; + cout << "Marked " << keep << " to be kept and " << discard << " to be discarded." << endl; } return discard; diff --git a/lib/src/knn.cc b/lib/src/knn.cc index c645c983..2c2fd43a 100644 --- a/lib/src/knn.cc +++ b/lib/src/knn.cc @@ -17,6 +17,7 @@ #include #include +using std::ostream; using std::cout; using std::endl; using std::make_shared; @@ -87,12 +88,12 @@ void portcullis::ml::KNN::doSlice(uint16_t slice) { } // Store final results - const uint16_t threshold = k / 2; for (uint32_t testidx = start; testidx <= end; testidx++) { - shared_ptr> knn = make_shared>(k); + shared_ptr> knn = make_shared>(); - for(size_t i = 0; i < k; k++) { - (*knn)[i] = rbuf[((testidx - start) * k) + i]; + for(size_t i = 0; i < k; i++) { + uint32_t index = rbuf[((testidx - start) * k) + i]; + knn->push_back(index); } results[testidx] = knn; @@ -101,7 +102,7 @@ void portcullis::ml::KNN::doSlice(uint16_t slice) { void portcullis::ml::KNN::execute() { - auto_cpu_timer timer(1, " Time taken: %ws\n\n"); + auto_cpu_timer timer(1, " Time taken: %ws\n"); if (verbose) { cout << "Performing K Nearest Neighbour (KNN) ..."; @@ -114,6 +115,14 @@ void portcullis::ml::KNN::execute() { } for (uint16_t i = 0; i < threads; i++) { t[i].join(); - } + } } +void 
portcullis::ml::KNN::print(ostream& out) { + for(auto& i : results) { + for(auto j : *i) { + out << j << " "; + } + out << endl; + } +} \ No newline at end of file From 4b9ee954de691fe0f9cc507fcf8a81cecf3852ac Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 16 May 2016 20:50:14 +0100 Subject: [PATCH 15/21] Implemented SMOTE. --- lib/Makefile.am | 4 +- lib/include/portcullis/ml/knn.hpp | 8 +- lib/include/portcullis/ml/model_features.hpp | 2 +- .../ml/{balancing.hpp => smote.hpp} | 63 ++++++++++++--- lib/src/model_features.cc | 36 ++++++++- lib/src/smote.cc | 79 +++++++++++++++++++ src/junction_filter.cc | 15 ++-- src/train.cc | 4 +- 8 files changed, 184 insertions(+), 27 deletions(-) rename lib/include/portcullis/ml/{balancing.hpp => smote.hpp} (53%) create mode 100644 lib/src/smote.cc diff --git a/lib/Makefile.am b/lib/Makefile.am index b27bbb62..bf45e80c 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -25,7 +25,8 @@ libportcullis_la_SOURCES = \ src/performance.cc \ src/ss_forest.cc \ src/knn.cc \ - src/enn.cc + src/enn.cc \ + src/smote.cc library_includedir=$(includedir)/portcullis-@PACKAGE_VERSION@/portcullis PI = include/portcullis @@ -42,6 +43,7 @@ library_include_HEADERS = \ $(PI)/ml/k_fold.hpp \ $(PI)/ml/knn.hpp \ $(PI)/ml/enn.hpp \ + $(PI)/ml/smote.hpp \ $(PI)/ml/ss_forest.hpp \ $(PI)/kmer.hpp \ $(PI)/python_exe.hpp \ diff --git a/lib/include/portcullis/ml/knn.hpp b/lib/include/portcullis/ml/knn.hpp index e5d0922d..2b16e2f5 100644 --- a/lib/include/portcullis/ml/knn.hpp +++ b/lib/include/portcullis/ml/knn.hpp @@ -63,19 +63,19 @@ class KNN { KNN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols); - int getK() const { + uint16_t getK() const { return k; } - void setK(int k) { + void setK(uint16_t k) { this->k = k; } - int getThreads() const { + uint16_t getThreads() const { return threads; } - void setThreads(int threads) { + void setThreads(uint16_t threads) { this->threads = threads; } diff --git 
a/lib/include/portcullis/ml/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp index 89edd12f..035ce6f4 100644 --- a/lib/include/portcullis/ml/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -120,7 +120,7 @@ class ModelFeatures { Data* juncs2FeatureVectors(const JunctionList& xl, const JunctionList& xu); - ForestPtr trainInstance(const JunctionList& x, string outputPrefix, + ForestPtr trainInstance(const JunctionList& pos, const JunctionList& neg, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); void resetActiveFeatureIndex() { diff --git a/lib/include/portcullis/ml/balancing.hpp b/lib/include/portcullis/ml/smote.hpp similarity index 53% rename from lib/include/portcullis/ml/balancing.hpp rename to lib/include/portcullis/ml/smote.hpp index e9c03ab7..f758fac0 100644 --- a/lib/include/portcullis/ml/balancing.hpp +++ b/lib/include/portcullis/ml/smote.hpp @@ -22,12 +22,9 @@ namespace portcullis { namespace ml { -typedef boost::error_info BalancingErrorInfo; -struct BalacingException: virtual boost::exception, virtual std::exception { }; +typedef boost::error_info SmoteErrorInfo; +struct SmoteException: virtual boost::exception, virtual std::exception { }; -class UnderSampler : UnbalancedDataset { - -}; /** * Object to perform classification on balanced ensembled selected from random sampling. @@ -35,17 +32,63 @@ class UnderSampler : UnbalancedDataset { * It is based on the idea presented in the paper "Exploratory Undersampling * Class-Imbalance Learning" by Liu et al. 
*/ -class EasyEnsemble : public UnderSampler { +class Smote { private: - uint16_t nb_subsets; - + uint16_t k; + uint16_t smoteness; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + double* synthetic; + size_t s_rows; public: - EasyEnsemble() { - + Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + ~Smote() { + delete[] synthetic; } + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + double* getSynthetic() { + return synthetic; + } + + size_t getNbSynthRows() const { + return s_rows; + } + + void execute(); + + }; } } \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 112ccbc4..0ed5084b 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -29,13 +29,17 @@ using std::make_shared; #include #include +#include using portcullis::ml::ENN; +using portcullis::ml::Smote; #include using portcullis::Junction; #include +#include "portcullis/junction_system.hpp" + void portcullis::ml::ModelFeatures::initGenomeMapper(const path& genomeFile) { @@ -276,9 +280,39 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& xl -portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& x, +portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& pos, const JunctionList& neg, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { + if (verbose) cout << "Creating feature vector" << endl; + Data* negData = juncs2FeatureVectors(neg); + + size_t nelements = negData->getNumRows()*(negData->getNumCols()-1); + double* nm 
= new double[nelements]; + for( uint32_t baseidx = 0; baseidx < negData->getNumRows(); baseidx++ ) { + double* r = &nm[baseidx * (negData->getNumCols()-1)]; + for( size_t c = 1; c < negData->getNumCols(); c++) { + r[c - 1] = negData->get(baseidx, c); + //cout << r[c-1]; + } + //cout << endl; + } + + uint16_t N = pos.size() / neg.size(); + + Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols()-1); + smote.execute(); + cout << "Number of synthesized entries: " << smote.getNbSynthRows() << endl; + + JunctionList training; + training.reserve(pos.size() + neg.size()); + training.insert(training.end(), pos.begin(), pos.end()); + training.insert(training.end(), neg.begin(), neg.end()); + + JunctionSystem trainingSystem(training); + trainingSystem.sort(); + JunctionList x = trainingSystem.getJunctions(); + + if (verbose) cout << "Creating feature vector" << endl; Data* trainingData = juncs2FeatureVectors(x); diff --git a/lib/src/smote.cc b/lib/src/smote.cc new file mode 100644 index 00000000..d8250a8a --- /dev/null +++ b/lib/src/smote.cc @@ -0,0 +1,79 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::ml::KNN; + +#include + +portcullis::ml::Smote::Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + smoteness = _smoteness < 1 ? 1 : _smoteness; + threads = _threads; + verbose = false; + + s_rows = smoteness * rows; + synthetic = new double[s_rows * cols]; +} + +void portcullis::ml::Smote::execute() { + + auto_cpu_timer timer(1, "SMOTE Time taken: %ws\n\n"); + + if (verbose) { + cout << "Starting Synthetic Minority Oversampling Technique (SMOTE)" << endl; + } + + uint32_t new_index = 0; + + KNN knn(k, threads, data, rows, cols); + knn.setVerbose(verbose); + knn.execute(); + + std::mt19937 rng(12345); + std::uniform_int_distribution igen(0, k); + std::uniform_real_distribution dgen(0, 1); + + + for(size_t i = 0; i < rows; i++) { + uint16_t N = smoteness; + while(N > 0) { + const vector nns = knn.getNNs(i); + uint32_t nn = nns[igen(rng)]; // Nearest neighbour row index + + for(size_t j = 0; j < cols; j++) { + double dif = data[(nn * cols) + j] - data[(i * cols) + j]; + double gap = dgen(rng); + synthetic[(new_index * cols) + j] = data[(i * cols) + j] + gap * dif; + } + + new_index++; + N--; + } + } +} diff --git a/src/junction_filter.cc b/src/junction_filter.cc index ab8a1018..37b0fe91 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -269,18 +269,17 @@ void portcullis::JunctionFilter::filter() { cout << "Balanced datasets to size of smallest set: " << pos.size() << endl << endl;*/ // Build the training set by combining the positive and negative sets - JunctionList training; - training.reserve(pos.size() + neg.size()); - training.insert(training.end(), pos.begin(), pos.end()); - training.insert(training.end(), 
neg.begin(), neg.end()); - - JunctionSystem trainingSystem(training); - trainingSystem.sort(); + + JunctionSystem posSystem(pos); + posSystem.sort(); + + JunctionSystem negSystem(neg); + negSystem.sort(); cout << "Training Random Forest" << endl << "----------------------" << endl << endl; bool done = false; - shared_ptr forest = mf.trainInstance(trainingSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + shared_ptr forest = mf.trainInstance(posSystem.getJunctions(), negSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); /*SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); shared_ptr forest = ssf.train();*/ /* diff --git a/src/train.cc b/src/train.cc index a2db9a94..025aad8c 100644 --- a/src/train.cc +++ b/src/train.cc @@ -159,7 +159,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - ForestPtr f = ModelFeatures().trainInstance(junctions, outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, JunctionList(), outputPrefix.string(), trees, threads, false, true); f->saveToFile(); f->writeOutput(&cout); @@ -194,7 +194,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - ForestPtr f = ModelFeatures().trainInstance(train, outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, JunctionList(), outputPrefix.string(), trees, threads, false, false); // Test model instance testInstance(f, test); From ad1ca288ae3c5da495ba9fddfa316d83e21fe5fc Mon Sep 17 00:00:00 2001 From: Dan Date: Tue, 17 May 2016 20:31:24 +0100 Subject: [PATCH 16/21] Smote seems to process data... need to check if it's having an impact on the results. 
--- lib/include/portcullis/ml/smote.hpp | 4 ++++ lib/src/model_features.cc | 25 ++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/include/portcullis/ml/smote.hpp b/lib/include/portcullis/ml/smote.hpp index f758fac0..61532160 100644 --- a/lib/include/portcullis/ml/smote.hpp +++ b/lib/include/portcullis/ml/smote.hpp @@ -86,6 +86,10 @@ class Smote { return s_rows; } + double getSynth(size_t row, size_t col) const { + return synthetic[(row * cols) + col]; + } + void execute(); diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 0ed5084b..6944ca92 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -314,7 +314,26 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun if (verbose) cout << "Creating feature vector" << endl; - Data* trainingData = juncs2FeatureVectors(x); + Data* otd = juncs2FeatureVectors(x); + + Data* trainingData = new DataDouble(otd->getVariableNames(), otd->getNumRows() + smote.getNbSynthRows(), otd->getNumCols()); + bool error = false; + cout << "0" << endl; + for(size_t i = 0; i < otd->getNumRows(); i++) { + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData->set(j, i, otd->get(i, j), error); + } + } + cout << "1" << endl; + size_t k = otd->getNumRows(); + + for(size_t i = 0; i < smote.getNbSynthRows(); i++) { + trainingData->set(0, i, 0, error); // Set genuine (i.e. 
not genuine) flag + for(size_t j = 1; j < trainingData->getNumCols(); j++) { + trainingData->set(j, k++, smote.getSynth(i, j-1), error); + } + } + cout << "2" << endl; size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); double* m = new double[elements]; @@ -327,8 +346,8 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun //cout << endl; } vector labels; - for(auto& j : x) { - labels.push_back(j->isGenuine()); + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + labels.push_back(trainingData->get(i, 0) == 1.0); } cout << "Created data matrix" << endl; From b432693bc2acb72726c4d1ceeeba100930ef43cb Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Wed, 18 May 2016 17:12:54 +0100 Subject: [PATCH 17/21] Implemented SMOTE correctly. However, there appears to be an issue with this commit around creation of the actual training set for the random forest model... need to investigate... --- data/selftrain_initial_pos.layer1.json | 6 +- lib/include/portcullis/ml/smote.hpp | 8 +- lib/src/enn.cc | 4 - lib/src/model_features.cc | 158 ++++++++++++++++--------- lib/src/smote.cc | 20 ++++ tests/Makefile.am | 1 + tests/smote_tests.cpp | 60 ++++++++++ 7 files changed, 195 insertions(+), 62 deletions(-) create mode 100644 tests/smote_tests.cpp diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index f128bacb..154ae9f8 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -22,7 +22,7 @@ }, "M19-mean_mismatches": { "operator": "lte", - "value": 0.5 + "value": 1.0 }, "M20-nb_usrs": { "operator": "gte", @@ -30,8 +30,8 @@ }, "M22-rel2raw": { "operator": "gte", - "value": 0.33 + "value": 0.25 } }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M11-entropy & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" + "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & 
M19-mean_mismatches & M22-rel2raw" } diff --git a/lib/include/portcullis/ml/smote.hpp b/lib/include/portcullis/ml/smote.hpp index 61532160..86929675 100644 --- a/lib/include/portcullis/ml/smote.hpp +++ b/lib/include/portcullis/ml/smote.hpp @@ -16,6 +16,10 @@ // ******************************************************************* #pragma once +#include +#include +using std::ostream; +using std::string; #include @@ -89,10 +93,10 @@ class Smote { double getSynth(size_t row, size_t col) const { return synthetic[(row * cols) + col]; } - + void execute(); - + void print(ostream& out) const; }; } } \ No newline at end of file diff --git a/lib/src/enn.cc b/lib/src/enn.cc index 99b9912e..8abb7bef 100644 --- a/lib/src/enn.cc +++ b/lib/src/enn.cc @@ -46,10 +46,6 @@ uint32_t portcullis::ml::ENN::execute(vector& results) const { auto_cpu_timer timer(1, "ENN Time taken: %ws\n\n"); - if (verbose) { - cout << "Starting Wilson's Edited Nearest Neighbour (ENN)" << endl; - } - KNN knn(k, threads, data, rows, cols); knn.setVerbose(verbose); knn.execute(); diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 6944ca92..464216e4 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -283,58 +283,97 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& xl portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& pos, const JunctionList& neg, string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { - if (verbose) cout << "Creating feature vector" << endl; - Data* negData = juncs2FeatureVectors(neg); - - size_t nelements = negData->getNumRows()*(negData->getNumCols()-1); - double* nm = new double[nelements]; - for( uint32_t baseidx = 0; baseidx < negData->getNumRows(); baseidx++ ) { - double* r = &nm[baseidx * (negData->getNumCols()-1)]; - for( size_t c = 1; c < negData->getNumCols(); c++) { - r[c - 1] = negData->get(baseidx, c); - //cout << r[c-1]; + // Work 
out number of times to duplicate negative set + int N = (pos.size() / neg.size()) - 1; + + // Duplicate pointers to negative set + JunctionList neg2; + neg2.reserve(neg.size()); + neg2.insert(neg2.end(), neg.begin(), neg.end()); + + uint32_t smote_rows = 0; + double* smote_data = 0; + if (N > 0) { + + cout << "Oversampling negative set to balance with positive set using SMOTE" << endl; + Data* negData = juncs2FeatureVectors(neg); + + size_t nelements = negData->getNumRows()*(negData->getNumCols()-1); + double* nm = new double[nelements]; + for( uint32_t baseidx = 0; baseidx < negData->getNumRows(); baseidx++ ) { + double* r = &nm[baseidx * (negData->getNumCols()-1)]; + for( size_t c = 1; c < negData->getNumCols(); c++) { + r[c - 1] = negData->get(baseidx, c); + //cout << r[c-1]; + } + //cout << endl; + } + + uint16_t N = pos.size() / neg.size(); + + Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols()-1); + smote.execute(); + smote_rows = smote.getNbSynthRows(); + smote_data = new double[smote_rows * (negData->getNumCols()-1)]; + double* sd = smote.getSynthetic(); + for(size_t i = 0; i < smote_rows * (negData->getNumCols()-1); i++) { + smote_data[i] = sd[i]; + } + cout << "Number of synthesized entries: " << smote.getNbSynthRows() << endl; + } + else { + + cout << "Undersampling negative set to balance with positive set" << endl; + std::mt19937 rng(12345); + while(neg2.size() > pos.size()) { + std::uniform_int_distribution gen(0, neg2.size()); // uniform, unbiased + int i = gen(rng); + neg2.erase(neg2.begin()+i); } - //cout << endl; } - uint16_t N = pos.size() / neg.size(); - Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols()-1); - smote.execute(); - cout << "Number of synthesized entries: " << smote.getNbSynthRows() << endl; + if (verbose) cout << endl << "Combining positive, negative " << (N > 0 ? "and synthetic negative " : "") << "datasets." 
<< endl; JunctionList training; - training.reserve(pos.size() + neg.size()); + training.reserve(pos.size() + neg2.size()); training.insert(training.end(), pos.begin(), pos.end()); - training.insert(training.end(), neg.begin(), neg.end()); + training.insert(training.end(), neg2.begin(), neg2.end()); JunctionSystem trainingSystem(training); trainingSystem.sort(); JunctionList x = trainingSystem.getJunctions(); - if (verbose) cout << "Creating feature vector" << endl; Data* otd = juncs2FeatureVectors(x); - Data* trainingData = new DataDouble(otd->getVariableNames(), otd->getNumRows() + smote.getNbSynthRows(), otd->getNumCols()); - bool error = false; - cout << "0" << endl; - for(size_t i = 0; i < otd->getNumRows(); i++) { - for(size_t j = 0; j < trainingData->getNumCols(); j++) { - trainingData->set(j, i, otd->get(i, j), error); - } - } - cout << "1" << endl; - size_t k = otd->getNumRows(); + // Create data to correct size + Data* trainingData = N > 0 ? + new DataDouble( + otd->getVariableNames(), + otd->getNumRows() + smote_rows, + otd->getNumCols()) + : otd; - for(size_t i = 0; i < smote.getNbSynthRows(); i++) { - trainingData->set(0, i, 0, error); // Set genuine (i.e. not genuine) flag - for(size_t j = 1; j < trainingData->getNumCols(); j++) { - trainingData->set(j, k++, smote.getSynth(i, j-1), error); + if (N > 0) { + bool error = false; + for(size_t i = 0; i < otd->getNumRows(); i++) { + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData->set(j, i, otd->get(i, j), error); + } + } + + size_t k = otd->getNumRows(); + for(size_t i = 0; i < smote_rows; i++) { + trainingData->set(0, i, 0, error); // Set genuine (i.e. 
not genuine) flag + for(size_t j = 1; j < trainingData->getNumCols(); j++) { + trainingData->set(j, k++, smote_data[(i * (trainingData->getNumCols() - 1)) + j-1], error); + } } + delete[] smote_data; } - cout << "2" << endl; - + + if (verbose) cout << endl << "Converting training data for ENN" << endl; size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); double* m = new double[elements]; for( uint32_t baseidx = 0; baseidx < trainingData->getNumRows(); baseidx++ ) { @@ -344,14 +383,16 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun //cout << r[c-1]; } //cout << endl; - } + } + + if (verbose) cout << endl << "Extracting labels for ENN" << endl; vector labels; for(size_t i = 0; i < trainingData->getNumRows(); i++) { labels.push_back(trainingData->get(i, 0) == 1.0); } - cout << "Created data matrix" << endl; - ENN enn(10, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); + cout << endl << "Starting Wilson's Edited Nearest Neighbour (ENN) to clean decision region" << endl; + ENN enn(5, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); enn.setVerbose(true); vector results; uint32_t count = enn.execute(results); @@ -360,33 +401,44 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun uint32_t pcount = 0, ncount = 0; JunctionList x2; - for(size_t i = 0; i < x.size(); i++) { - if (x[i]->isGenuine() && !results[i]) { + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if (trainingData->get(i, 0) == 1.0 && !results[i]) { pcount++; } - else if (!x[i]->isGenuine() && !results[i]) { + else if (trainingData->get(i, 0) == 0.0 && !results[i]) { ncount++; } - else { - x2.push_back(x[i]); - } } - Data* trainingData2 = juncs2FeatureVectors(x2); + cout << "Should discard " << pcount << " + entries and " << ncount << " - entries (Total=" << count << ")" << endl << endl; - cout << "Should discard " << pcount << " + entries and " << 
ncount << " - entries (" << count << ")" << endl << endl; - path feature_file = outputPrefix + ".features"; - if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; - - ofstream fout(feature_file.c_str(), std::ofstream::out); + Data* trainingData2 = new DataDouble( + trainingData->getVariableNames(), + trainingData->getNumRows() - count, + trainingData->getNumCols()); - fout << Intron::locationOutputHeader() << "\t" << trainingData->getHeader() << endl; - for(size_t i = 0; i < x.size(); i++) { - fout << *(x[i]->getIntron()) << "\t" << trainingData->getRow(i) << endl; + size_t new_index = 0; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if ((trainingData->get(i, 0) == 1.0 && results[i]) || (trainingData->get(i, 0) == 0.0 && !results[i])) { + bool error = false; + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData2->set(j, new_index, trainingData->get(i, j), error); + } + new_index++; + } } - fout.close(); + + /*path feature_file = outputPrefix + ".features"; + if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; + + ofstream fout(feature_file.c_str(), std::ofstream::out); + fout << Intron::locationOutputHeader() << "\t" << trainingData2->getHeader() << endl; + for(size_t i = 0; i < x2.size(); i++) { + fout << *(x2[i]->getIntron()) << "\t" << trainingData2->getRow(i) << endl; + } + fout.close();*/ if (verbose) cout << "Initialising random forest" << endl; ForestPtr f = nullptr; diff --git a/lib/src/smote.cc b/lib/src/smote.cc index d8250a8a..c611e527 100644 --- a/lib/src/smote.cc +++ b/lib/src/smote.cc @@ -77,3 +77,23 @@ void portcullis::ml::Smote::execute() { } } } + +void portcullis::ml::Smote::print(ostream& out) const { + + out << "Input:" << endl; + for(size_t i = 0; i < rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << data[(i * cols) + j] << " "; + } + out << endl; + } + + out << endl << "Synthetic:" << endl; + for(size_t i = 0; i < s_rows; i++) { + for(size_t j = 
0; j < cols; j++) { + out << synthetic[(i * cols) + j] << " "; + } + out << endl; + } + out << endl; +} \ No newline at end of file diff --git a/tests/Makefile.am b/tests/Makefile.am index d6d27253..a3040b86 100755 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -25,6 +25,7 @@ check_PROGRAMS = check_unit_tests check_unit_tests_SOURCES = bam_tests.cpp \ seq_utils_tests.cpp \ kmer_tests.cpp \ + smote_tests.cpp \ intron_tests.cpp \ junction_tests.cpp \ check_portcullis.cc diff --git a/tests/smote_tests.cpp b/tests/smote_tests.cpp new file mode 100644 index 00000000..0c8b9056 --- /dev/null +++ b/tests/smote_tests.cpp @@ -0,0 +1,60 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . 
+// ******************************************************************* + +#include + +#include +#include +#include +using std::cout; +using std::cerr; +using std::endl; +using std::stringstream; + + +#include +#include +#include +namespace bfs = boost::filesystem; +using bfs::path; + +#include +using portcullis::ml::Smote; + + +TEST(smote, simple) { + + double data[] = { + 0.2, 0.4, 1.5, 2.3, 0.0, + 0.3, 0.3, 2.6, 5.2, 0.1, + 0.3, 0.3, 2.4, 5.2, 0.1, + 0.1, 0.2, 0.5, 3.1, 0.3, + 0.3, 0.3, 2.6, 5.2, 0.9, + 0.3, 0.3, 2.6, 5.2, 0.1, + 0.3, 0.7, 2.6, 5.2, 0.1, + 0.2, 0.3, 2.6, 4.2, 0.1, + 0.3, 0.8, 2.6, 2.2, 0.1, + 1.3, 1.3, 2.6, 8.2, 0.8 + }; + + Smote smote(3, 2, 1, data, 10, 5); + smote.execute(); + //smote.print(cerr); + + EXPECT_EQ(smote.getNbSynthRows(), 20); +} + From 80210abbc14894ed0f7156bad2126de7bce70eb4 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 19 May 2016 17:02:04 +0100 Subject: [PATCH 18/21] Modifications to ENN and positive selection layers. Seems to give good performance on tophat, gsnap and STAR, however hisat gives less precise results. Also started an implementation of ICOTE but not sure if I'll continue with this. 
--- data/selftrain_initial_pos.layer1.json | 70 +++++----- data/selftrain_initial_pos.layer2.json | 86 ++++++------- data/selftrain_initial_pos.layer3.json | 150 +++++++++++----------- lib/include/portcullis/ml/enn.hpp | 15 ++- lib/include/portcullis/ml/icote.hpp | 106 +++++++++++++++ lib/src/enn.cc | 4 +- lib/src/icote.cc | 171 +++++++++++++++++++++++++ lib/src/model_features.cc | 72 +++++++---- src/junction_filter.cc | 13 +- 9 files changed, 500 insertions(+), 187 deletions(-) create mode 100644 lib/include/portcullis/ml/icote.hpp create mode 100644 lib/src/icote.cc diff --git a/data/selftrain_initial_pos.layer1.json b/data/selftrain_initial_pos.layer1.json index 154ae9f8..bd1fd95d 100644 --- a/data/selftrain_initial_pos.layer1.json +++ b/data/selftrain_initial_pos.layer1.json @@ -1,37 +1,37 @@ { - "parameters": { - "M4-nb_rel_aln": { - "operator": "gte", - "value": 1 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 8 - }, - "M11-entropy": { - "operator": "gt", - "value": 1.0 - }, - "M13-hamming5p": { - "operator": "gte", - "value": 5 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 5 - }, - "M19-mean_mismatches": { - "operator": "lte", - "value": 1.0 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 1 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.25 - } - }, - "expression": "M4-nb_rel_aln & M13-hamming5p & M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 8 + }, + "M11-entropy": { + "operator": "gt", + "value": 1.0 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 4 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 4 + }, + "M19-mean_mismatches": { + "operator": "lte", + "value": 1.0 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 1 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.25 + } + }, + "expression": "M4-nb_rel_aln & M13-hamming5p & 
M14-hamming3p & M12-maxmmes & M20-nb_usrs & M19-mean_mismatches & M22-rel2raw" } diff --git a/data/selftrain_initial_pos.layer2.json b/data/selftrain_initial_pos.layer2.json index b03d88ac..a4c2b0b4 100644 --- a/data/selftrain_initial_pos.layer2.json +++ b/data/selftrain_initial_pos.layer2.json @@ -1,45 +1,45 @@ { - "parameters": { - "M4-nb_rel_aln": { - "operator": "gte", - "value": 5 - }, - "M4-nb_rel_aln.2": { - "operator": "gte", - "value": 3 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 25 - }, - "M12-maxmmes.2": { - "operator": "gt", - "value": 15 - }, - "M13-hamming5p": { - "operator": "gte", - "value": 8 - }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 10 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 8 - }, - "M14-hamming3p.2": { - "operator": "gte", - "value": 10 - }, - "M19-mean_mismatches": { - "operator": "lt", - "value": 0 - }, - "M19-mean_mismatches.2": { - "operator": "lt", - "value": 0.25 - } - }, - "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches )" + "parameters": { + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 3 + }, + "M12-maxmmes": { + "operator": "gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gt", + "value": 12 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 7 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 9 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 9 + }, + "M19-mean_mismatches": { + "operator": "lte", + "value": 0 + }, + "M19-mean_mismatches.2": { + "operator": "lt", + "value": 0.33 + } + }, + "expression": "( M4-nb_rel_aln & M12-maxmmes ) | ( M4-nb_rel_aln.2 & M12-maxmmes.2 & M13-hamming5p & M14-hamming3p & M19-mean_mismatches.2 ) | ( M13-hamming5p.2 & M14-hamming3p.2 & 
M19-mean_mismatches )" } diff --git a/data/selftrain_initial_pos.layer3.json b/data/selftrain_initial_pos.layer3.json index 349d1317..18464d1e 100644 --- a/data/selftrain_initial_pos.layer3.json +++ b/data/selftrain_initial_pos.layer3.json @@ -1,77 +1,77 @@ { - "parameters": { - "M1-canonical_ss": { - "operator": "in", - "value": ["C"] - }, - "M1-canonical_ss.2": { - "operator": "in", - "value": ["S"] - }, - "M1-canonical_ss.3": { - "operator": "in", - "value": ["N"] - }, - "M4-nb_rel_aln": { - "operator": "gte", - "value": 5 - }, - "M4-nb_rel_aln.2": { - "operator": "gte", - "value": 1 - }, - "M12-maxmmes": { - "operator": "gte", - "value": 20 - }, - "M12-maxmmes.2": { - "operator": "gte", - "value": 10 - }, - "M11-entropy": { - "operator": "gt", - "value": 5.0 - }, - "M11-entropy.2": { - "operator": "gt", - "value": 2.0 - }, - "M13-hamming5p": { - "operator": "gte", - "value": 7 - }, - "M13-hamming5p.2": { - "operator": "gte", - "value": 8 - }, - "M14-hamming3p": { - "operator": "gte", - "value": 7 - }, - "M14-hamming3p.2": { - "operator": "gte", - "value": 8 - }, - "M19-mean_mismatches": { - "operator": "eq", - "value": 0 - }, - "M19-mean_mismatches": { - "operator": "lt", - "value": 0.1 - }, - "M20-nb_usrs": { - "operator": "gte", - "value": 5 - }, - "M22-rel2raw": { - "operator": "gte", - "value": 0.5 - }, - "M22-rel2raw.2": { - "operator": "gte", - "value": 0.75 - } - }, - "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" + "parameters": { + "M1-canonical_ss": { + "operator": "in", + "value": ["C"] + }, + "M1-canonical_ss.2": { + "operator": "in", + "value": ["S"] + }, + "M1-canonical_ss.3": { + "operator": "in", + "value": ["N"] + }, + "M4-nb_rel_aln": { + "operator": "gte", + "value": 5 + }, + "M4-nb_rel_aln.2": { + "operator": "gte", + "value": 1 + }, + "M12-maxmmes": { + "operator": 
"gte", + "value": 20 + }, + "M12-maxmmes.2": { + "operator": "gte", + "value": 10 + }, + "M11-entropy": { + "operator": "gt", + "value": 3.0 + }, + "M11-entropy.2": { + "operator": "gt", + "value": 1.5 + }, + "M13-hamming5p": { + "operator": "gte", + "value": 6 + }, + "M13-hamming5p.2": { + "operator": "gte", + "value": 7 + }, + "M14-hamming3p": { + "operator": "gte", + "value": 6 + }, + "M14-hamming3p.2": { + "operator": "gte", + "value": 7 + }, + "M19-mean_mismatches": { + "operator": "eq", + "value": 0 + }, + "M19-mean_mismatches": { + "operator": "lt", + "value": 0.1 + }, + "M20-nb_usrs": { + "operator": "gte", + "value": 5 + }, + "M22-rel2raw": { + "operator": "gte", + "value": 0.5 + }, + "M22-rel2raw.2": { + "operator": "gte", + "value": 0.75 + } + }, + "expression": "( M1-canonical_ss ) | ( M1-canonical_ss.2 & M22-rel2raw & M13-hamming5p & M14-hamming3p ) | ( M1-canonical_ss.3 & M22-rel2raw.2 & M13-hamming5p.2 & M14-hamming3p.2 & M19-mean_mismatches & M11-entropy.2 )" } diff --git a/lib/include/portcullis/ml/enn.hpp b/lib/include/portcullis/ml/enn.hpp index 97bc13e1..96b8ece1 100644 --- a/lib/include/portcullis/ml/enn.hpp +++ b/lib/include/portcullis/ml/enn.hpp @@ -50,6 +50,7 @@ const uint16_t ENN_THRESHOLD = 5; class ENN { protected: uint16_t k; + uint16_t threshold; uint16_t threads; bool verbose; @@ -63,19 +64,19 @@ class ENN { ENN(uint16_t defaultK, uint16_t _threads, double* _data, size_t _rows, size_t _cols, vector& _labels); - int getK() const { + uint16_t getK() const { return k; } - void setK(int k) { + void setK(uint16_t k) { this->k = k; } - int getThreads() const { + uint16_t getThreads() const { return threads; } - void setThreads(int threads) { + void setThreads(uint16_t threads) { this->threads = threads; } @@ -87,8 +88,12 @@ class ENN { this->verbose = verbose; } + void setThreshold(uint16_t threshold) { + this->threshold = threshold; + } + uint16_t getThreshold() const { - return k / 2; + return threshold; } diff --git 
a/lib/include/portcullis/ml/icote.hpp b/lib/include/portcullis/ml/icote.hpp new file mode 100644 index 00000000..d0c5f3fd --- /dev/null +++ b/lib/include/portcullis/ml/icote.hpp @@ -0,0 +1,106 @@ +// ******************************************************************** +// This file is part of Portcullis. +// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#pragma once +#include +#include +using std::ostream; +using std::string; + +#include + +namespace portcullis { +namespace ml { + +typedef boost::error_info IcoteErrorInfo; +struct IcoteException: virtual boost::exception, virtual std::exception { }; + + +/** + * Object to perform classification on balanced ensembled selected from random sampling. + * + * It is based on the idea presented in the paper "Exploratory Undersampling + * Class-Imbalance Learning" by Liu et al. 
+ */ +class Icote { +private: + uint16_t k; + uint16_t duplications; + uint16_t threads; + bool verbose; + + double* data; + size_t rows; + size_t cols; + + +protected: + + bool isConstant(size_t col); + + void fselect(); + +public: + + Smote(uint16_t defaultK, uint16_t _smoteness, uint16_t _threads, double* _data, size_t _rows, size_t _cols); + + ~Smote() { + delete[] synthetic; + } + + uint16_t getK() const { + return k; + } + + void setK(uint16_t k) { + this->k = k; + } + + uint16_t getThreads() const { + return threads; + } + + void setThreads(uint16_t threads) { + this->threads = threads; + } + + bool isVerbose() const { + return verbose; + } + + void setVerbose(bool verbose) { + this->verbose = verbose; + } + + double* getSynthetic() { + return synthetic; + } + + size_t getNbSynthRows() const { + return s_rows; + } + + double getSynth(size_t row, size_t col) const { + return synthetic[(row * cols) + col]; + } + + void execute(); + + void print(ostream& out) const; +}; +} +} \ No newline at end of file diff --git a/lib/src/enn.cc b/lib/src/enn.cc index 8abb7bef..e82580d4 100644 --- a/lib/src/enn.cc +++ b/lib/src/enn.cc @@ -40,6 +40,8 @@ portcullis::ml::ENN::ENN(uint16_t defaultK, uint16_t _threads, double* _data, si k = defaultK; threads = _threads; verbose = false; + + threshold = k / 2; } uint32_t portcullis::ml::ENN::execute(vector& results) const { @@ -73,7 +75,7 @@ uint32_t portcullis::ml::ENN::execute(vector& results) const { neg_count++; } } - results[i] = ((pos && pos_count > threshold) || (!pos && neg_count > threshold)); + results[i] = ((pos && pos_count >= threshold) || (!pos && neg_count >= threshold)); if (!results[i]) discard++; } diff --git a/lib/src/icote.cc b/lib/src/icote.cc new file mode 100644 index 00000000..7d59969e --- /dev/null +++ b/lib/src/icote.cc @@ -0,0 +1,171 @@ +// ******************************************************************** +// This file is part of Portcullis. 
+// +// Portcullis is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Portcullis is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Portcullis. If not, see . +// ******************************************************************* + +#include +using std::cout; +using std::endl; + +#include +using portcullis::ml::KNN; + +#include + +portcullis::ml::Icote::Icote(uint16_t defaultK, uint16_t _duplications, uint16_t _threads, double* _data, size_t _rows, size_t _cols) { + + data = _data; + rows = _rows; + cols = _cols; + if (_rows < defaultK && _rows < 100) + k = _rows; + else + k = defaultK; + duplications = _duplications < 1 ? 
1 : _duplications; + threads = _threads; + verbose = false; + + s_rows = duplications * rows; + +} + +void portcullis::ml::Smote::execute() { + + auto_cpu_timer timer(1, "ICOTE Time taken: %ws\n\n"); + + if (verbose) { + cout << "Starting Immume Centroids Oversampling Technique (ICOTE)" << endl; + } + + // Attribute selection + // To reduce computational cost remove constant attributes + if (verbose) cout << "Selecting non-constant features" << endl; + + vector is_const(cols); + vector const_val(cols); + uint16_t nb_const_col = 0; + for(size_t k = 0; k < cols; k++) { + is_const[k] = this->isConstant(k); + if (is_const[k]) { + nb_const_col++; + const_val[k] = data[k]; + } + } + uint16_t sel_col = cols - nb_const_col; + + if (verbose) cout << "Ignoring " << const_col << " columns as values are constant across all samples" << endl; + + double* ag = new double[rows * sel_col]; + for(size_t i = 0; i < rows; i++) { + size_t new_col = 0; + for(size_t j = 0; j < cols; j++) { + if (!is_const[j]) { + ag[(i * sel_col) + new_col++] = data[(i*cols) + j]; + } + } + } + + if (verbose) cout << "Created non-constant feature matrix" << endl; + + + // Normalisation + if (verbose) cout << "Normalising feature matrix" << endl; + vector min_col(sel_col); + vector max_col(sel_col); + for(size_t j = 0; j < sel_col; j++) { + double min_v = std::numeric_limits::max(); + double max_v = std::numeric_limits::min(); + for(size_t i = 0; i < rows; i++) { + double x = ag[(i * sel_col) + j]; + min_v = std::min(min_v, x); + max_v = std::max(max_v, x); + } + min_col[j] = min_v; + max_col[j] = max_v; + } + + for(size_t j = 0; j < sel_col; j++) { + double min_x = min_col[j]; + double max_x = max_col[j]; + for(size_t i = 0; i < rows; i++) { + double x = ag[(i * sel_col) + j]; + ag[(i * sel_col) + j] = (x - min_x) / (max_x - min_x); + } + } + if (verbose) cout << "Normalised feature matrix" << endl; + + if (verbose) cout << "Generating Random Antibodies" << endl; + vector ab(k * sel_col); + std::mt19937 
rng(12345); + std::uniform_real_distribution dgen(0.0, 1.0); + + for(size_t i = 0; i < k; i++) { + for(size_t j = 0; j < sel_col; j++) { + ab[(i*sel_col) + j] = dgen(rng); + } + } + if (verbose) cout << "Random Antibodies generated" << endl; + + uint16_t N = 0; + while (N < 10) { + vector M(duplications * rows * sel_col); + + for(size_t i = 0; i < rows; i++) { + vector dist(rows * duplications * sel_col); + for(size_t j = 0; j < duplications; j++) { + double s = 0.0; + for (size_t c = 0; c < sel_cols; c++) { + s += std::pow(ag[i * sel_cols + c] - ab[i * sel_cols + c], 2.0); + } + dist[(i * sel_cols) + j] = std::sqrt(s); + } + } + + if (verbose) cout << "Mutating antibodies" << endl; + + + double s = 0.0; + for (size_t j = 0; j < sel_cols; j++) { + s += std::pow(antibodies[i * sel_cols + j] - attr_sel[i * sel_cols + j], 2.0); + } + double ak = (1.0 / sel_cols) * std::sqrt(ak); + } + + if (verbose) cout << "Antibodies mutated" << endl; + + } + + +} + +void portcullis::ml::Smote::print(ostream& out) const { + + out << "Input:" << endl; + for(size_t i = 0; i < rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << data[(i * cols) + j] << " "; + } + out << endl; + } + + out << endl << "Synthetic:" << endl; + for(size_t i = 0; i < s_rows; i++) { + for(size_t j = 0; j < cols; j++) { + out << synthetic[(i * cols) + j] << " "; + } + out << endl; + } + out << endl; +} \ No newline at end of file diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 464216e4..0e8b4f6f 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -284,7 +284,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { // Work out number of times to duplicate negative set - int N = (pos.size() / neg.size()) - 1; + const int N = (pos.size() / neg.size()) - 1; // Duplicate pointers to negative set JunctionList neg2; @@ -297,11 +297,12 @@ 
portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun cout << "Oversampling negative set to balance with positive set using SMOTE" << endl; Data* negData = juncs2FeatureVectors(neg); - - size_t nelements = negData->getNumRows()*(negData->getNumCols()-1); + const int SC = negData->getNumCols() - 1; + + size_t nelements = negData->getNumRows() * SC ; double* nm = new double[nelements]; for( uint32_t baseidx = 0; baseidx < negData->getNumRows(); baseidx++ ) { - double* r = &nm[baseidx * (negData->getNumCols()-1)]; + double* r = &nm[baseidx * SC]; for( size_t c = 1; c < negData->getNumCols(); c++) { r[c - 1] = negData->get(baseidx, c); //cout << r[c-1]; @@ -309,14 +310,12 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun //cout << endl; } - uint16_t N = pos.size() / neg.size(); - Smote smote(5, N, threads, nm, negData->getNumRows(), negData->getNumCols()-1); smote.execute(); smote_rows = smote.getNbSynthRows(); - smote_data = new double[smote_rows * (negData->getNumCols()-1)]; + smote_data = new double[smote_rows * SC]; double* sd = smote.getSynthetic(); - for(size_t i = 0; i < smote_rows * (negData->getNumCols()-1); i++) { + for(size_t i = 0; i < smote_rows * SC; i++) { smote_data[i] = sd[i]; } cout << "Number of synthesized entries: " << smote.getNbSynthRows() << endl; @@ -351,24 +350,26 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun Data* trainingData = N > 0 ? 
new DataDouble( otd->getVariableNames(), - otd->getNumRows() + smote_rows, + x.size() + smote_rows, otd->getNumCols()) : otd; if (N > 0) { + const int SC = trainingData->getNumCols() - 1; bool error = false; for(size_t i = 0; i < otd->getNumRows(); i++) { for(size_t j = 0; j < trainingData->getNumCols(); j++) { trainingData->set(j, i, otd->get(i, j), error); - } + } } - - size_t k = otd->getNumRows(); + + size_t k = x.size(); for(size_t i = 0; i < smote_rows; i++) { - trainingData->set(0, i, 0, error); // Set genuine (i.e. not genuine) flag + trainingData->set(0, k, 0.0, error); // Set genuine (i.e. not genuine) flag for(size_t j = 1; j < trainingData->getNumCols(); j++) { - trainingData->set(j, k++, smote_data[(i * (trainingData->getNumCols() - 1)) + j-1], error); - } + trainingData->set(j, k, smote_data[(i * SC) + j-1], error); + } + k++; } delete[] smote_data; } @@ -380,19 +381,29 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun double* r = &m[baseidx * (trainingData->getNumCols()-1)]; for( size_t c = 1; c < trainingData->getNumCols(); c++) { r[c - 1] = trainingData->get(baseidx, c); - //cout << r[c-1]; } - //cout << endl; } - if (verbose) cout << endl << "Extracting labels for ENN" << endl; + if (verbose) cout << "Extracting labels for ENN" << endl; vector labels; + uint32_t p = 0, n = 0, o = 0; for(size_t i = 0; i < trainingData->getNumRows(); i++) { labels.push_back(trainingData->get(i, 0) == 1.0); + if (trainingData->get(i, 0) == 1.0) { + p++; + } + else if (trainingData->get(i, 0) == 0.0) { + n++; + } + else { + o++; + } } + cout << "P: " << p << "; N: " << n << "; O: " << o << endl; cout << endl << "Starting Wilson's Edited Nearest Neighbour (ENN) to clean decision region" << endl; - ENN enn(5, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); + ENN enn(3, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); + enn.setThreshold(3); enn.setVerbose(true); vector results; 
uint32_t count = enn.execute(results); @@ -415,20 +426,30 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun Data* trainingData2 = new DataDouble( trainingData->getVariableNames(), - trainingData->getNumRows() - count, + trainingData->getNumRows() - pcount, trainingData->getNumCols()); size_t new_index = 0; + pcount = 0, ncount = 0; for(size_t i = 0; i < trainingData->getNumRows(); i++) { - if ((trainingData->get(i, 0) == 1.0 && results[i]) || (trainingData->get(i, 0) == 0.0 && !results[i])) { + if (results[i]) { // || trainingData->get(i, 0) == 0.0) { bool error = false; for(size_t j = 0; j < trainingData->getNumCols(); j++) { trainingData2->set(j, new_index, trainingData->get(i, j), error); } new_index++; - } + if (trainingData->get(i, 0) == 1) { + pcount++; + } + else { + ncount++; + } + } } + delete trainingData; + + cout << "Final training set contains " << pcount << " positive entries and " << ncount << " negative entries" << endl; /*path feature_file = outputPrefix + ".features"; if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; @@ -439,7 +460,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun fout << *(x2[i]->getIntron()) << "\t" << trainingData2->getRow(i) << endl; } fout.close();*/ - + if (verbose) cout << "Initialising random forest" << endl; ForestPtr f = nullptr; if (probabilityMode) { @@ -464,7 +485,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun probabilityMode ? 
DEFAULT_MIN_NODE_SIZE_PROBABILITY : DEFAULT_MIN_NODE_SIZE_CLASSIFICATION, // Min node size "", // Status var name false, // Prediction mode - true, // Replace + false, // Replace catVars, // Unordered categorical variable names (vector) false, // Memory saving AUC, //DEFAULT_SPLITRULE, // Split rule @@ -474,6 +495,9 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun if (verbose) cout << "Training" << endl; f->setVerboseOut(&cerr); f->run(verbose); + cout << "OOBE: " << f->getOverallPredictionError() << endl; + + delete trainingData2; return f; } \ No newline at end of file diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 37b0fe91..57ab2855 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -239,6 +239,7 @@ void portcullis::JunctionFilter::filter() { + double ratio = 0.0; if (train) { @@ -253,6 +254,10 @@ void portcullis::JunctionFilter::filter() { cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." 
<< endl << endl; + ratio = 1.0 - ((double)pos.size() / (double)(pos.size() + neg.size())); + + cout << "Pos to neg ratio: " << ratio << endl << endl; + cout << "Training markov models ..."; cout.flush(); mf.trainCodingPotentialModel(pos); @@ -501,10 +506,10 @@ void portcullis::JunctionFilter::createPositiveSet(const JunctionList& all, Junc cout << p3.size() << "\t" << unlabelled.size(); } - cout << endl << "L95x1.5\t"; + cout << endl << "L95x1.2\t"; - const uint32_t L95 = mf.calcIntronThreshold(p2); - const uint32_t pos_length_limit = L95 * 1.5; + const uint32_t L95 = mf.calcIntronThreshold(p3); + const uint32_t pos_length_limit = L95 * 1.2; JunctionList passJuncs; for (auto& j : p3) { @@ -636,7 +641,7 @@ void portcullis::JunctionFilter::createNegativeSet(uint32_t L95, const JunctionL JunctionList passJuncs; const uint32_t L95x10 = L95 * 10; for (auto& j : f7) { - if (j->getIntronSize() > L95x10 && j->getMaxMMES() < 10) { + if (j->getIntronSize() > L95x10 && j->getMaxMMES() < 12) { p8.push_back(j); } else { failJuncs.push_back(j); From 14ec70c31934664f80289a05b9dd75aa2eb3b952 Mon Sep 17 00:00:00 2001 From: Dan Date: Fri, 20 May 2016 14:59:39 +0100 Subject: [PATCH 19/21] Setting default filter threshold to 0.5 --- src/junction_filter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index f6373bed..258cfd0b 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -81,7 +81,7 @@ const string ST_IPOS_RULES_FILE = "selftrain_initial_pos"; const string ST_INEG_RULES_FILE = "selftrain_initial_neg"; const uint16_t DEFAULT_FILTER_THREADS = 1; const uint16_t DEFAULT_SELFTRAIN_TREES = 100; -const double DEFAULT_FILTER_THRESHOLD = 0.625; +const double DEFAULT_FILTER_THRESHOLD = 0.5; class JunctionFilter { From ec84c70f33426e63ff653e2559c0dd1982df8824 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 27 May 2016 16:18:58 +0100 Subject: [PATCH 20/21] Added ability to control SMOTE and ENN 
from the command line. --- lib/include/portcullis/ml/model_features.hpp | 2 +- lib/src/model_features.cc | 147 ++++++++++--------- src/junction_filter.cc | 14 +- src/junction_filter.hpp | 19 +++ src/train.cc | 4 +- 5 files changed, 111 insertions(+), 75 deletions(-) diff --git a/lib/include/portcullis/ml/model_features.hpp b/lib/include/portcullis/ml/model_features.hpp index 035ce6f4..a6c762df 100644 --- a/lib/include/portcullis/ml/model_features.hpp +++ b/lib/include/portcullis/ml/model_features.hpp @@ -121,7 +121,7 @@ class ModelFeatures { ForestPtr trainInstance(const JunctionList& pos, const JunctionList& neg, string outputPrefix, - uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose); + uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose, bool smote, bool enn); void resetActiveFeatureIndex() { fi = 0; diff --git a/lib/src/model_features.cc b/lib/src/model_features.cc index 0e8b4f6f..72e929e8 100644 --- a/lib/src/model_features.cc +++ b/lib/src/model_features.cc @@ -281,7 +281,7 @@ Data* portcullis::ml::ModelFeatures::juncs2FeatureVectors(const JunctionList& xl portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const JunctionList& pos, const JunctionList& neg, - string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose) { + string outputPrefix, uint16_t trees, uint16_t threads, bool probabilityMode, bool verbose, bool smote, bool enn) { // Work out number of times to duplicate negative set const int N = (pos.size() / neg.size()) - 1; @@ -293,7 +293,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun uint32_t smote_rows = 0; double* smote_data = 0; - if (N > 0) { + if (N > 0 && smote) { cout << "Oversampling negative set to balance with positive set using SMOTE" << endl; Data* negData = juncs2FeatureVectors(neg); @@ -320,7 +320,7 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun } cout << "Number of synthesized 
entries: " << smote.getNbSynthRows() << endl; } - else { + else if (N <= 0 && smote) { cout << "Undersampling negative set to balance with positive set" << endl; std::mt19937 rng(12345); @@ -347,14 +347,14 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun Data* otd = juncs2FeatureVectors(x); // Create data to correct size - Data* trainingData = N > 0 ? + Data* trainingData = N > 0 && smote ? new DataDouble( otd->getVariableNames(), x.size() + smote_rows, otd->getNumCols()) : otd; - if (N > 0) { + if (N > 0 && smote) { const int SC = trainingData->getNumCols() - 1; bool error = false; for(size_t i = 0; i < otd->getNumRows(); i++) { @@ -374,83 +374,90 @@ portcullis::ml::ForestPtr portcullis::ml::ModelFeatures::trainInstance(const Jun delete[] smote_data; } - if (verbose) cout << endl << "Converting training data for ENN" << endl; - size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); - double* m = new double[elements]; - for( uint32_t baseidx = 0; baseidx < trainingData->getNumRows(); baseidx++ ) { - double* r = &m[baseidx * (trainingData->getNumCols()-1)]; - for( size_t c = 1; c < trainingData->getNumCols(); c++) { - r[c - 1] = trainingData->get(baseidx, c); - } - } - - if (verbose) cout << "Extracting labels for ENN" << endl; - vector labels; - uint32_t p = 0, n = 0, o = 0; - for(size_t i = 0; i < trainingData->getNumRows(); i++) { - labels.push_back(trainingData->get(i, 0) == 1.0); - if (trainingData->get(i, 0) == 1.0) { - p++; - } - else if (trainingData->get(i, 0) == 0.0) { - n++; - } - else { - o++; - } - } - cout << "P: " << p << "; N: " << n << "; O: " << o << endl; - - cout << endl << "Starting Wilson's Edited Nearest Neighbour (ENN) to clean decision region" << endl; - ENN enn(3, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); - enn.setThreshold(3); - enn.setVerbose(true); vector results; - uint32_t count = enn.execute(results); - - delete[] m; - - uint32_t pcount = 0, 
ncount = 0; - JunctionList x2; - for(size_t i = 0; i < trainingData->getNumRows(); i++) { - if (trainingData->get(i, 0) == 1.0 && !results[i]) { - pcount++; + + if (enn) { + if (verbose) cout << endl << "Converting training data for ENN" << endl; + size_t elements = trainingData->getNumRows()*(trainingData->getNumCols()-1); + double* m = new double[elements]; + for( uint32_t baseidx = 0; baseidx < trainingData->getNumRows(); baseidx++ ) { + double* r = &m[baseidx * (trainingData->getNumCols()-1)]; + for( size_t c = 1; c < trainingData->getNumCols(); c++) { + r[c - 1] = trainingData->get(baseidx, c); + } + } + + if (verbose) cout << "Extracting labels for ENN" << endl; + vector labels; + uint32_t p = 0, n = 0, o = 0; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + labels.push_back(trainingData->get(i, 0) == 1.0); + if (trainingData->get(i, 0) == 1.0) { + p++; + } + else if (trainingData->get(i, 0) == 0.0) { + n++; + } + else { + o++; + } } - else if (trainingData->get(i, 0) == 0.0 && !results[i]) { - ncount++; + cout << "P: " << p << "; N: " << n << "; O: " << o << endl; + + cout << endl << "Starting Wilson's Edited Nearest Neighbour (ENN) to clean decision region" << endl; + ENN enn(3, threads, m, trainingData->getNumRows(), trainingData->getNumCols()-1, labels); + enn.setThreshold(3); + enn.setVerbose(true); + uint32_t count = enn.execute(results); + + delete[] m; + + uint32_t pcount = 0, ncount = 0; + JunctionList x2; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if (trainingData->get(i, 0) == 1.0 && !results[i]) { + pcount++; + } + else if (trainingData->get(i, 0) == 0.0 && !results[i]) { + ncount++; + } } + + cout << "Should discard " << pcount << " + entries and " << ncount << " - entries (Total=" << count << ")" << endl << endl; } - cout << "Should discard " << pcount << " + entries and " << ncount << " - entries (Total=" << count << ")" << endl << endl; - + uint32_t pcount = 0, ncount = 0; - Data* trainingData2 = new DataDouble( + 
Data* trainingData2 = enn ? + new DataDouble( trainingData->getVariableNames(), trainingData->getNumRows() - pcount, - trainingData->getNumCols()); - - size_t new_index = 0; - pcount = 0, ncount = 0; - for(size_t i = 0; i < trainingData->getNumRows(); i++) { - if (results[i]) { // || trainingData->get(i, 0) == 0.0) { - bool error = false; - for(size_t j = 0; j < trainingData->getNumCols(); j++) { - trainingData2->set(j, new_index, trainingData->get(i, j), error); - } - new_index++; - if (trainingData->get(i, 0) == 1) { - pcount++; - } - else { - ncount++; + trainingData->getNumCols()) : + trainingData; + + if (enn) { + size_t new_index = 0; + for(size_t i = 0; i < trainingData->getNumRows(); i++) { + if (results[i]) { // || trainingData->get(i, 0) == 0.0) { + bool error = false; + for(size_t j = 0; j < trainingData->getNumCols(); j++) { + trainingData2->set(j, new_index, trainingData->get(i, j), error); + } + new_index++; + if (trainingData->get(i, 0) == 1) { + pcount++; + } + else { + ncount++; + } } } + + delete trainingData; + + cout << "Final training set contains " << pcount << " positive entries and " << ncount << " negative entries" << endl; } - delete trainingData; - - cout << "Final training set contains " << pcount << " positive entries and " << ncount << " negative entries" << endl; - /*path feature_file = outputPrefix + ".features"; if (verbose) cout << "Saving feature vector to disk: " << feature_file << endl; diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 57ab2855..458258ad 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -88,6 +88,8 @@ portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _ju source = DEFAULT_FILTER_SOURCE; verbose = false; threshold = DEFAULT_FILTER_THRESHOLD; + smote = true; + enn = true; } void portcullis::JunctionFilter::filter() { @@ -284,7 +286,7 @@ void portcullis::JunctionFilter::filter() { cout << "Training Random Forest" << endl << "----------------------" << 
endl << endl; bool done = false; - shared_ptr forest = mf.trainInstance(posSystem.getJunctions(), negSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true); + shared_ptr forest = mf.trainInstance(posSystem.getJunctions(), negSystem.getJunctions(), output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, true, true, smote, enn); /*SemiSupervisedForest ssf(mf, trainingSystem.getJunctions(), unlabelled2, output.string() + ".selftrain", DEFAULT_SELFTRAIN_TREES, threads, 0.1, true); shared_ptr forest = ssf.train();*/ /* @@ -918,6 +920,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { int32_t max_length; string canonical; string source; + bool no_smote; + bool no_enn; double threshold; bool verbose; bool help; @@ -963,7 +967,11 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), - "The threshold score at which we determine a junction to be genuine or not.") + "The threshold score at which we determine a junction to be genuine or not.") + ("no_smote", po::bool_switch(&no_smote)->default_value(false), + "Use this flag to disable synthetic oversampling") + ("no_enn", po::bool_switch(&no_enn)->default_value(false), + "Use this flag to disable Edited Nearest Neighbour") ; // Positional option for the input bam file @@ -1016,6 +1024,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { } filter.setReferenceFile(referenceFile); filter.setThreshold(threshold); + filter.setSmote(!no_smote); + filter.setENN(!no_enn); filter.filter(); diff --git a/src/junction_filter.hpp b/src/junction_filter.hpp index 258cfd0b..34698e04 100644 --- a/src/junction_filter.hpp +++ b/src/junction_filter.hpp @@ -104,6 +104,8 @@ class JunctionFilter { bool 
filterNovel; string source; double threshold; + bool smote; + bool enn; bool verbose; @@ -220,6 +222,23 @@ class JunctionFilter { this->threads = threads; } + bool isENN() const { + return enn; + } + + void setENN(bool enn) { + this->enn = enn; + } + + bool isSmote() const { + return smote; + } + + void setSmote(bool smote) { + this->smote = smote; + } + + void setCanonical(const string& canonical) { vector modes; boost::split( modes, canonical, boost::is_any_of(","), boost::token_compress_on ); diff --git a/src/train.cc b/src/train.cc index 025aad8c..7a02e3dc 100644 --- a/src/train.cc +++ b/src/train.cc @@ -159,7 +159,7 @@ void portcullis::Train::train() { cout << "Training on full dataset" << endl; - ForestPtr f = ModelFeatures().trainInstance(junctions, JunctionList(), outputPrefix.string(), trees, threads, false, true); + ForestPtr f = ModelFeatures().trainInstance(junctions, JunctionList(), outputPrefix.string(), trees, threads, false, true, false, false); f->saveToFile(); f->writeOutput(&cout); @@ -194,7 +194,7 @@ void portcullis::Train::train() { kf.getFold(i, back_inserter(train), back_inserter(test)); // Train on this particular set - ForestPtr f = ModelFeatures().trainInstance(train, JunctionList(), outputPrefix.string(), trees, threads, false, false); + ForestPtr f = ModelFeatures().trainInstance(train, JunctionList(), outputPrefix.string(), trees, threads, false, false, false, false); // Test model instance testInstance(f, test); From 44df64d120aa8c95d58d5998b3fb3e516534bcb2 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Thu, 16 Jun 2016 11:28:20 +0100 Subject: [PATCH 21/21] Improved the way the markov chains are used to score new examples by reduciung the agressiveness of the way scores are reduced if the example contains a missing kmer in the model. This seems to provide a boost to performance. Inverted the way scores are represented. Now a score of 0.0 is not a junction and 1.0 is a junction. 
Working on ICOTE which might provide an improvement over the SMOTE technique... still WIP. --- lib/src/icote.cc | 3 +++ lib/src/markov_model.cc | 13 ++++++++++++- src/junction_filter.cc | 24 +++++++++++------------- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/lib/src/icote.cc b/lib/src/icote.cc index 7d59969e..127f8f29 100644 --- a/lib/src/icote.cc +++ b/lib/src/icote.cc @@ -131,6 +131,9 @@ void portcullis::ml::Smote::execute() { } dist[(i * sel_cols) + j] = std::sqrt(s); } + + // Clone K antibodies in proportion to antigen-antibody affinity + } if (verbose) cout << "Mutating antibodies" << endl; diff --git a/lib/src/markov_model.cc b/lib/src/markov_model.cc index 98564207..5af054a7 100644 --- a/lib/src/markov_model.cc +++ b/lib/src/markov_model.cc @@ -56,12 +56,23 @@ void portcullis::ml::KmerMarkovModel::train(const vector& input, const u double portcullis::ml::KmerMarkovModel::getScore(const string& seq) { string s = SeqUtils::makeClean(seq); double score = 1.0; + uint32_t no_count = 0; for(uint16_t i = order; i < s.size(); i++){ - score *= model[s.substr(i-order, order)][s.substr(i, 1)]; + double m = model[s.substr(i-order, order)][s.substr(i, 1)]; + if (m != 0.0) { + score *= m; + } + else { + no_count++; + } } if(score == 0.0) { return -100.0; } + else if (no_count > 2) { + // Add a penalty for situations where we repeatedly don't find a kmer in the tranining set + score /= ((double)no_count * 0.5); + } return log(score); } diff --git a/src/junction_filter.cc b/src/junction_filter.cc index 458258ad..5860ad54 100644 --- a/src/junction_filter.cc +++ b/src/junction_filter.cc @@ -824,7 +824,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction for (size_t i = 0; i < all.size(); i++) { - scoreStream << f->getPredictions()[i][0] << "\t" << *(all[i]->getIntron()) + scoreStream << (1.0 - f->getPredictions()[i][0]) << "\t" << *(all[i]->getIntron()) << "\t" << strandToChar(all[i]->getConsensusStrand()) << "\t" << 
cssToChar(all[i]->getSpliceSiteType()) << "\t" << testingData->getRow(i) << endl; @@ -881,7 +881,7 @@ void portcullis::JunctionFilter::forestPredict(const JunctionList& all, Junction void portcullis::JunctionFilter::categorise(shared_ptr f, const JunctionList& all, JunctionList& pass, JunctionList& fail, double t) { for (size_t i = 0; i < all.size(); i++) { - if (f->getPredictions()[i][0] <= t) { + if ((1.0 - f->getPredictions()[i][0]) >= t) { pass.push_back(all[i]); } else { fail.push_back(all[i]); @@ -895,7 +895,7 @@ double portcullis::JunctionFilter::calcGoodThreshold(shared_ptr f, const uint32_t neg = 0; for (auto& p : f->getPredictions()) { - if (p[0] <= 0.5) { + if ((1.0 - p[0]) >= 0.5) { pos++; } else { neg++; @@ -921,7 +921,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { string canonical; string source; bool no_smote; - bool no_enn; + bool enn; double threshold; bool verbose; bool help; @@ -955,6 +955,10 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { "Keep junctions based on their splice site status. Valid options: OFF,C,S,N. Where C = Canonical junctions (GU-AG), S = Semi-canonical junctions (AT-AC, or GT-AG), N = Non-canonical. OFF means, keep all junctions (i.e. don't filter by canonical status). User can separate options by a comma to keep two categories.") ("threads,t", po::value(&threads)->default_value(DEFAULT_FILTER_THREADS), "The number of threads to use during testing (only applies if using forest model).") + ("enn", po::bool_switch(&enn)->default_value(false), + "Use this flag to enable Edited Nearest Neighbour to clean decision region") + ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), + "The threshold score at which we determine a junction to be genuine or not. Increase value towards 0.0 to increase precision, decrease towards 0.0 to increase sensitivity. 
We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.") ("verbose,v", po::bool_switch(&verbose)->default_value(false), "Print extra information") ("help", po::bool_switch(&help)->default_value(false), "Produce help message") @@ -965,13 +969,9 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { po::options_description hidden_options("Hidden options"); hidden_options.add_options() ("prep_data_dir,i", po::value(&prepDir), "Path to directory containing prepared data.") - ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") - ("threshold", po::value(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD), - "The threshold score at which we determine a junction to be genuine or not.") + ("junction_file", po::value(&junctionFile), "Path to the junction file to process.") ("no_smote", po::bool_switch(&no_smote)->default_value(false), - "Use this flag to disable synthetic oversampling") - ("no_enn", po::bool_switch(&no_enn)->default_value(false), - "Use this flag to disable Edited Nearest Neighbour") + "Use this flag to disable synthetic oversampling") ; // Positional option for the input bam file @@ -995,8 +995,6 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { return 1; } - - auto_cpu_timer timer(1, "\nPortcullis junction filter completed.\nTotal runtime: %ws\n\n"); cout << "Running portcullis in junction filter mode" << endl @@ -1025,7 +1023,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) { filter.setReferenceFile(referenceFile); filter.setThreshold(threshold); filter.setSmote(!no_smote); - filter.setENN(!no_enn); + filter.setENN(enn); filter.filter();