From 0a7f455b913a0a0d9d554a23d68ead3cb9ad741f Mon Sep 17 00:00:00 2001
From: grenaud
Date: Wed, 8 Feb 2023 15:15:26 +0100
Subject: [PATCH] added seed and version

---
 README.md              |   2 +-
 libgab                 |   2 +-
 src/Makefile           |  12 ++-
 src/MergeTrimReads.cpp |  22 ++--
 src/MergeTrimReads.h   |  11 +-
 src/get_version.sh     |   9 ++
 src/leeHom.cpp         | 239 +++++++++++++++++++++++++++++------
 7 files changed, 216 insertions(+), 81 deletions(-)
 create mode 100755 src/get_version.sh

diff --git a/README.md b/README.md
index ab1be68..bfd7c9e 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ Contact your sequencing center or core and ask them to send you the sequence of
 For single-end, no. For paired-end, mate have to be consecutive in the file (i.e. `First mate 1`, `second mate 1`, `First mate 2`, `second mate 2`, _etc.._ ). This can be done by either using raw data from the sequencer or sorting by name.
 
 ## Does my BAM file have to be unmapped?
-BAM files can also be used to store unmapped reads. Ideally, leehom should be used prior to mapping since reads will map to a more accurate location if the adapters have been removed and overlapping stretches have been merged.
+BAM files can also be used to store unmapped reads. leeHom should be used prior to mapping since reads will map to a more accurate location if the adapters have been removed and overlapping stretches have been merged.
 
 ## What do the `FF` tags in the BAM file mean?
 
diff --git a/libgab b/libgab
index e08a186..81da9c6 160000
--- a/libgab
+++ b/libgab
@@ -1 +1 @@
-Subproject commit e08a18635571fa35c888c07e18e31b0c15cead23
+Subproject commit 81da9c622242225c76494b7eb2b8c9af03aebc12
diff --git a/src/Makefile b/src/Makefile
index aaac175..d8cf8aa 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,12 +50,22 @@ all: ${BAMTOOLSLIBOBJ} ${LIBGABLIBOBJ} MergeTrimReads.o leeHom
 %.o: %.cpp
 	${CXX} -c ${CXXFLAGS} $^
 
-leeHom: leeHom.o MergeTrimReads.o ${LIBGABLIBOBJ}
+version.h:
+	./get_version.sh
+
+leeHom: version.h leeHom.o MergeTrimReads.o ${LIBGABLIBOBJ}
 	@echo "linking"
 	@echo ${LDLIBS}
 	@echo ${LDFLAGS}
 	${CXX} -o $@ $^ $(LDLIBS) $(LDFLAGS)
 
+static: version.h leeHom.o MergeTrimReads.o ${LIBGABLIBOBJ}
+	@echo "static linking"
+	@echo ${LDLIBS}
+	@echo ${LDFLAGS}
+	${CXX} -static -o leeHom $^ $(LDLIBS) $(LDFLAGS)
+
+
 ${LIBGABINC}/libgab.h:
 	rm -rf ../libgab/
 	mkdir ../libgab/
diff --git a/src/MergeTrimReads.cpp b/src/MergeTrimReads.cpp
index add94a3..2d1765c 100644
--- a/src/MergeTrimReads.cpp
+++ b/src/MergeTrimReads.cpp
@@ -411,12 +411,18 @@ void MergeTrimReads::initMerge(){
 inline double MergeTrimReads::randomGen(){
     // static bool initialized = false;
+
     if(initialized == false){
-        struct timeval time;
-        gettimeofday(&time,NULL);
-        srand(time.tv_usec);
+        if(seedSpecified){
+            srand(seed);
+        }else{
+            //use time of day as seed
+            struct timeval time;
+            gettimeofday(&time,NULL);
+            srand(time.tv_usec);
+        }
         initialized=true;
-        return (double(rand())/double(RAND_MAX));
+        return (double(rand())/double(RAND_MAX));
     }else{
         return (double(rand())/double(RAND_MAX));
     }
@@ -2058,8 +2064,8 @@ MergeTrimReads::MergeTrimReads (const string& forward_, const string& reverse_,
                              const string& key1_, const string& key2_, const bool trimKey_,
                              const string& ikey_,
-                             int trimcutoff_,bool allowMissing_,
-                             bool ancientDNA_,double location_,double scale_,bool useDist_,int qualOffset):
+                             const int trimcutoff_,const bool allowMissing_,
+                             const bool ancientDNA_,const double location_,const double scale_,const bool useDist_,const int qualOffset,const unsigned int seed_,const bool seedSpecified_):
                             //bool mergeoverlap_) :
     min_length (5),
     qualOffset (qualOffset),
@@ -2075,7 +2081,9 @@ MergeTrimReads::MergeTrimReads (const string& forward_, const string& reverse_,
     location(location_),
     scale(scale_),
-    useDist(useDist_)
+    useDist(useDist_),
+    seed(seed_),
+    seedSpecified(seedSpecified_)
 {
     initialized = false;
diff --git a/src/MergeTrimReads.h b/src/MergeTrimReads.h
index 8c25f90..0147820 100644
--- a/src/MergeTrimReads.h
+++ b/src/MergeTrimReads.h
@@ -86,6 +86,8 @@ class MergeTrimReads{
     size_t min_overlap_seqs;
 
+    unsigned int seed;
+    bool seedSpecified;
     /* // Key variables /// */
     bool handle_key;
@@ -302,7 +304,14 @@ class MergeTrimReads{
 public:
     MergeTrimReads (const string& forward_, const string& reverse_, const string& chimera_,
                     const string& key1_="", const string& key2_="",const bool trimKey_=false,const string& ikey_="",
-                    int trimcutoff_=1,bool allowMissing_=false,bool ancientDNA_=false,double location_=-1.0,double scale_=-1.0,bool useDist_=false,int qualOffset=33);
+                    const int trimcutoff_=1,
+                    const bool allowMissing_=false,
+                    const bool ancientDNA_=false,
+                    const double location_=-1.0,
+                    const double scale_=-1.0,
+                    const bool useDist_=false,
+                    const int qualOffset=33,
+                    const unsigned int seed=0,const bool seedSpecified=false);
     MergeTrimReads(const MergeTrimReads & other);
     ~MergeTrimReads();
diff --git a/src/get_version.sh b/src/get_version.sh
new file mode 100755
index 0000000..a03f1bd
--- /dev/null
+++ b/src/get_version.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+version=$(git describe --tags $(git rev-list --tags --max-count=1))
+
+if [ $? -eq 0 ]; then
+    echo "#define VERSION \"$version\"" > version.h
+else
+    echo "#define VERSION \"NA\"" > version.h
+fi
diff --git a/src/leeHom.cpp b/src/leeHom.cpp
index a561198..f8feb3c 100644
--- a/src/leeHom.cpp
+++ b/src/leeHom.cpp
@@ -12,6 +12,7 @@
 #include "FastQParser.h"
 #include "libgab.h"
+#include "version.h"
 
 //TODO char qualscore to like
@@ -29,13 +30,13 @@
 using namespace std;
 using namespace BamTools;
 
 typedef struct {
-    ogzstream single;
-    ogzstream pairr1;
-    ogzstream pairr2;
+    ogzstream * single;
+    ogzstream * pairr1;
+    ogzstream * pairr2;
 
-    ogzstream singlef;
-    ogzstream pairr1f;
-    ogzstream pairr2f;
+    ogzstream * singlef;
+    ogzstream * pairr1f;
+    ogzstream * pairr2f;
 } fqwriters;
 
@@ -1485,15 +1486,15 @@ bool checkForWritingLoopFQ(fqwriters * onereadgroup,int * lastWrittenChunk,Merge
                 }
             }
         }
-
-        onereadgroup->pairr2f<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
-        onereadgroup->pairr1f<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+        (*onereadgroup->pairr1f)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+        (*onereadgroup->pairr2f)<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
         if(dataToWrite->dataToProcess->at(i).sequence != ""){ //new sequence
-            onereadgroup->single<<"@"<<dataToWrite->dataToProcess->at(i).d1<<" " <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).sequence<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
+            (*onereadgroup->single)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<" " <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).sequence<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
             if( (dataToWrite->dataToProcess->at(i).sequence.length()+umif+umir+umtf+umtr) > max(dataToWrite->dataToProcess->at(i).s1.length(),dataToWrite->dataToProcess->at(i).s2.length()) ){
                 mtr->incrementCountmergedoverlap();
@@ -1504,8 +1505,8 @@ bool checkForWritingLoopFQ(fqwriters * onereadgroup,int * lastWrittenChunk,Merge
             }else{
                 //keep as is
                 mtr->incrementCountnothing();
-                onereadgroup->pairr2<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<< endl <<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
-                onereadgroup->pairr1<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<< endl <<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+                (*onereadgroup->pairr1)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<< endl <<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+                (*onereadgroup->pairr2)<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<< endl <<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
-        onereadgroup->singlef<<""<<dataToWrite->dataToProcess->at(i).d1<<" " <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+        (*onereadgroup->singlef)<<""<<dataToWrite->dataToProcess->at(i).d1<<" " <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
         if(dataToWrite->dataToProcess->at(i).sequence != ""){ //new sequence
             mtr->incrementCounttrimmed();
-            onereadgroup->single<<""<<dataToWrite->dataToProcess->at(i).d1<<" "<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).sequence <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
+            (*onereadgroup->single)<<""<<dataToWrite->dataToProcess->at(i).d1<<" "<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).sequence <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
         }else{
             mtr->incrementCountnothing();
-            onereadgroup->single<<""<<dataToWrite->dataToProcess->at(i).d1<<" "<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+            (*onereadgroup->single)<<""<<dataToWrite->dataToProcess->at(i).d1<<" "<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+            seed      = destringify<unsigned int>(argv[i+1]);
+            i++;
+            continue;
+        }
 
         if(strcmp(argv[i],"--umif") == 0 ){
             umif      = destringify<int>(argv[i+1]);
@@ -2062,12 +2100,19 @@ int main (int argc, char *argv[]) {
             return 1;
         }
     }else{
-        if(fastqoutfile==""){
+        if(!fastqoutfile.empty() && !intfastqoutfile.empty()){
+            cerr<<"If running in fastq input/output mode, cannot specify both -fqo and -fqt."<<endl;
+            onereadgroup.single->open(outdirs.c_str(), ios::out);
+            onereadgroup.singlef->open(outdirsf.c_str(), ios::out);
+
+            if(!onereadgroup.single->good()){  cerr<<"Cannot write to file "<<outdirs<<endl;  return 1; }
+            if(!onereadgroup.singlef->good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
+            onereadgroup.single->open(outdirs.c_str(), ios::out);
+            onereadgroup.singlef=onereadgroup.single;
+
+            if(!onereadgroup.single->good()){  cerr<<"Cannot write to file "<<outdirs<<endl;  return 1; }
+            onereadgroup.single->open(outdirs.c_str(), ios::out);
+            onereadgroup.pairr1->open(outdir1.c_str(), ios::out);
+            onereadgroup.pairr2->open(outdir2.c_str(), ios::out);
-            string outdirs  = fastqoutfile+".fq.gz";
-            string outdir1  = fastqoutfile+"_r1.fq.gz";
-            string outdir2  = fastqoutfile+"_r2.fq.gz";
-
-            string outdirsf = fastqoutfile+".fail.fq.gz";
-            string outdir1f = fastqoutfile+"_r1.fail.fq.gz";
-            string outdir2f = fastqoutfile+"_r2.fail.fq.gz";
+            onereadgroup.singlef->open(outdirsf.c_str(), ios::out);
+            onereadgroup.pairr1f->open(outdir1f.c_str(), ios::out);
+            onereadgroup.pairr2f->open(outdir2f.c_str(), ios::out);
-            onereadgroup.single.open(outdirs.c_str(), ios::out);
-            onereadgroup.pairr1.open(outdir1.c_str(), ios::out);
-            onereadgroup.pairr2.open(outdir2.c_str(), ios::out);
+            if(!onereadgroup.single->good()){  cerr<<"Cannot write to file "<<outdirs<<endl;  return 1; }
+            if(!onereadgroup.pairr1->good()){  cerr<<"Cannot write to file "<<outdir1<<endl;  return 1; }
+            if(!onereadgroup.pairr2->good()){  cerr<<"Cannot write to file "<<outdir2<<endl;  return 1; }
+            if(!onereadgroup.singlef->good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
+            if(!onereadgroup.pairr1f->good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; }
+            if(!onereadgroup.pairr2f->good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; }
-            onereadgroup.single.open(outdirs.c_str(), ios::out);
-            if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
+            if(!onereadgroup.single->good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
-            onereadgroup.pairr2<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
-            onereadgroup.pairr1<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+            (*onereadgroup.pairr1)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s1<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1<<endl;
+            (*onereadgroup.pairr2)<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2" <<dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).s2<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2<<endl;
             if(dataToWrite->dataToProcess->at(i).sequence != ""){ //new sequence
-                onereadgroup.single<<"@"<<dataToWrite->dataToProcess->at(i).d1 << dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).sequence<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
+                (*onereadgroup.single)<<"@"<<dataToWrite->dataToProcess->at(i).d1 << dataToWrite->dataToProcess->at(i).cmt<<endl<<dataToWrite->dataToProcess->at(i).sequence<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
                 if( (dataToWrite->dataToProcess->at(i).sequence.length()+umif+umir+umiTf+umiTr) > max(dataToWrite->dataToProcess->at(i).s1.length(),dataToWrite->dataToProcess->at(i).s2.length()) ){
                     mtr->incrementCountmergedoverlap();
@@ -2472,11 +2550,11 @@ int main (int argc, char *argv[]) {
                 }else{
                     //keep as is
                     mtr->incrementCountnothing();
                     if(trimKey){
-                        onereadgroup.pairr2<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s2.substr( key2.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2.substr( key2.length() )<<endl;
-                        onereadgroup.pairr1<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1.substr( key1.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1.substr( key1.length() )<<endl;
+                        (*onereadgroup.pairr1)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1.substr( key1.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1.substr( key1.length() )<<endl;
+                        (*onereadgroup.pairr2)<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s2.substr( key2.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2.substr( key2.length() )<<endl;
                     }else{
-                        onereadgroup.pairr2<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s2 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2 <<endl;
-                        onereadgroup.pairr1<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+                        (*onereadgroup.pairr1)<<"@"<<dataToWrite->dataToProcess->at(i).d1<<"/1"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+                        (*onereadgroup.pairr2)<<"@"<<dataToWrite->dataToProcess->at(i).d2<<"/2"<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s2 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q2 <<endl;
                     }
-            onereadgroup.singlef<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+            (*onereadgroup.singlef)<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
             if(dataToWrite->dataToProcess->at(i).sequence != ""){ //new sequence
                 mtr->incrementCounttrimmed();
-                onereadgroup.single<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).sequence <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
+                (*onereadgroup.single)<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).sequence <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).quality<<endl;
             }else{
                 mtr->incrementCountnothing();
                 if(trimKey){
-                    onereadgroup.single<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1.substr( key1.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1.substr( key1.length() )<<endl;
+                    (*onereadgroup.single)<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1.substr( key1.length() )<<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1.substr( key1.length() )<<endl;
                 }else{
-                    onereadgroup.single<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
+                    (*onereadgroup.single)<<""<<dataToWrite->dataToProcess->at(i).d1<<""<<dataToWrite->dataToProcess->at(i).cmt <<endl<<dataToWrite->dataToProcess->at(i).s1 <<endl<<"+"<<endl<<dataToWrite->dataToProcess->at(i).q1 <<endl;
                 }
+            onereadgroup.single->close();
+            onereadgroup.singlef->close();
+            delete(onereadgroup.single);
+            delete(onereadgroup.singlef);
+        }else{
+            onereadgroup.single->close();
+            delete(onereadgroup.single);
+        }
 
-        onereadgroup.singlef.close();
-        onereadgroup.pairr1f.close();
-        onereadgroup.pairr2f.close();
 
+    }else{
+        if(!fastqoutfile.empty()){
+            onereadgroup.single->close();
+            onereadgroup.pairr1->close();
+            onereadgroup.pairr2->close();
+
+            onereadgroup.singlef->close();
+            onereadgroup.pairr1f->close();
+            onereadgroup.pairr2f->close();
+
+            delete(onereadgroup.single);
+            delete(onereadgroup.pairr1);
+            delete(onereadgroup.pairr2);
+
+            delete(onereadgroup.singlef);
+            delete(onereadgroup.pairr1f);
+            delete(onereadgroup.pairr2f);
+
+        }else{
+            onereadgroup.single->close();
+            delete(onereadgroup.single);
+        }
     }
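
Note on the seeding change: the logic this patch adds to MergeTrimReads::randomGen() can be reduced to the standalone sketch below. This is illustrative only and is not leeHom code; the variable names (initialized, seedSpecified, seed) mirror the patch, while the surrounding program and its main() are assumptions made for demonstration. When a seed has been supplied, srand() is called with it and the pseudo-random stream becomes reproducible; otherwise the previous time-of-day seeding is kept.

#include <cstdlib>
#include <iostream>
#include <sys/time.h>

// Sketch of the seeding behaviour introduced by the patch:
// use the user-supplied seed when one was specified, otherwise
// fall back to the time of day (the previous behaviour).
static bool         initialized   = false;
static bool         seedSpecified = false; // set when a seed is given on the command line
static unsigned int seed          = 0;

double randomGen(){
    if(!initialized){
        if(seedSpecified){
            srand(seed);               // reproducible pseudo-random stream
        }else{
            struct timeval time;
            gettimeofday(&time,NULL);
            srand(time.tv_usec);       // non-reproducible, seeded from the clock
        }
        initialized = true;
    }
    return double(rand())/double(RAND_MAX);
}

int main(){
    seedSpecified = true;              // assumption: pretend the user passed a seed
    seed          = 42;
    for(int i=0;i<3;i++)
        std::cout<<randomGen()<<std::endl; // prints the same three values on every run
    return 0;
}

With a fixed seed, the decisions in leeHom that depend on randomGen() should repeat from run to run on identical input; without one, behaviour matches earlier releases that seeded from gettimeofday().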