From be675e9f57f8aa8874120370bb2fb4ea1745347b Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:20:49 +0800 Subject: [PATCH 01/13] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 40e0f52..2c96354 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ================================================================================= ## Overall Here we provide the package of OMSV with all source codes (C++, bash script, and Matlab codes) and binary files. The scrips of OMSV have been tested in Debian GNU/Linux 9.0 (stretch) and CentOS Linux release 7.3.1611 (Core) platform and the matlab code was developed in Matlab R2011b (7.13.0.564) 64-bit (glnxa64). The way to start our pipeline within a few steps and the details of the programs, including input parameters and supported file formats, are shown in "Readme\_OMSV.txt". -**All data** (raw optical mapping data and alignments) are accessible at [here](https://drive.google.com/drive/folders/0B4T3OTv54s2DbDdLRFdqdERXZkE). The alignment file, as well as the 2-round split alignment, of NA12878 are provided for the quick start. +**All data** (raw optical mapping data and alignments) are accessible at [Zenodo repository](https://doi.org/10.5281/zenodo.886387). The alignment file, as well as the 2-round split alignment, of NA12878 are provided for the quick start. ================================================================================= ## Quick Start @@ -14,14 +14,14 @@ To quickly try our OMSV Caller, please first download the package and alignment >cd OMSV >chmod 777 makefile >./makefile ->./callSV.sh (Run the original version, of which the results are consistent with the published results) +>./callSV.sh (Run the latest version, of which the results should be slightly different) -callSV\_old.sh (callSV.sh) provide the examples of calling the OMSV callers (Matlab is required to call CNVs). Then you can find the resulting lists in SV\_result with prefix 12878 (e.g. 12878Indel.osv, 12878Mixed\_indel.osv, and 12878Complex\_total.bed). +callSV.sh provide the examples of calling the OMSV callers (Matlab is required to call CNVs). Then you can find the resulting lists in SV\_result with prefix 12878 (e.g. 12878Indel.osv, 12878Mixed\_indel.osv, and 12878Complex\_total.bed). ================================================================================= ## Parameters and Settings of the tools: -+ OMSV/OMSV\_mixedIndel (type ./OMSV to check the parameters) to call indels and mixed indels and site variations: ++ OMSV (type ./OMSV to check the parameters) to call indels and mixed indels and site variations: - -inputLabel: Default value: 878. The index/label of genome. - -outputFolder: Default value: ./. The path of the folder to store the output fils. - -SVoutputFile: Default value: Detected_structual_variants. The prefix of the file name of SVs (.osv). From 081704187e1990846e6c116e1754e5af69360834 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:23:44 +0800 Subject: [PATCH 02/13] Update callSV.sh --- callSV.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/callSV.sh b/callSV.sh index a77d94c..d97d2b1 100644 --- a/callSV.sh +++ b/callSV.sh @@ -7,18 +7,17 @@ fi if [ $step -lt 1 ] then -echo "1. Start to call large indels" +echo "1. Start to call large indels and site of NA12878" mkdir temp + ./OMSV -inputLabel 12878 -outputFolder SV_result/ -SVoutputFile Indel -chrMapFile hg38_r.cmap -optAlignFile data/NA12878_700bp_hg38_combRefOMB2.oma -optTempFolder temp/ -rm -f SV_result/*_List.txt -./postFilter.sh SV_result/12878Indel_2000.osv 2000 -echo "2. Start to call mixed indels" -./OMSV_mixedIndel -inputLabel 12878 -outputFolder SV_result/ -SVoutputFile Mixed_indel -chrMapFile hg38_r.cmap -optAlignFile data/NA12878_700bp_hg38_combRefOMB2.oma -optTempFolder temp/ -rm -f SV_result/*_List.txt -./postFilter.sh SV_result/12878Mixed_indel_2000.osv 2000 -sed '/Homozygous/d' SV_result/12878Mixed_indel_2000.osv | sed '/Heterozygous/d' > SV_result/12878Mixed_indel_2000.osv_tp && mv SV_result/12878Mixed_indel_2000.osv_tp SV_result/12878Mixed_indel_2000.osv +sed '/.site\|#/!d' SV_result/12878Indel_2000.osv > SV_result/12878Sites_2000.osv +sed '/Homozygous\|Heterozygous/d' SV_result/12878Indel_2000.osv > SV_result/12878Mixed_indel_2000.osv +sed -i '/.site\|HDI\|HI1I2\|HD1D2/d' SV_result/12878Indel_2000.osv +rm -f SV_result/*_List.txt +./postFilter.sh SV_result/12878Indel_2000.osv 2000 rm -r -f temp fi From a022d94890b09b54d381bc98a4cb1062751da3b8 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:24:09 +0800 Subject: [PATCH 03/13] Update makefile --- makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/makefile b/makefile index a450b90..5be3368 100644 --- a/makefile +++ b/makefile @@ -1,5 +1,4 @@ -g++ -std=c++11 source/OMSV_v3.cpp -o OMSV -g++ -std=c++11 source/OMSV_mixedIndel_v3.cpp -o OMSV_mixedIndel +g++ -std=c++11 source/OMSV_v4.cpp -o OMSV g++ -std=c++11 source/filterSV.cpp -o filterSV g++ -std=c++11 source/adjust_split.cpp -o adjust_split g++ -std=c++11 source/breakpoint.cpp -o breakpoint From 26b48e877b51ac7dc9ea9b1c67ea34b05072d4e7 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:26:25 +0800 Subject: [PATCH 04/13] Delete OMSV_mixedIndel_v3.cpp --- source/OMSV_mixedIndel_v3.cpp | 2048 --------------------------------- 1 file changed, 2048 deletions(-) delete mode 100644 source/OMSV_mixedIndel_v3.cpp diff --git a/source/OMSV_mixedIndel_v3.cpp b/source/OMSV_mixedIndel_v3.cpp deleted file mode 100644 index 30ffe89..0000000 --- a/source/OMSV_mixedIndel_v3.cpp +++ /dev/null @@ -1,2048 +0,0 @@ -#include "NoError.h" -// Can choose Alden, Ref, CHM1, CHM1_2, Angel1, Angel2, Angel3, Angel4, -// Ref_Sim, Alden_sim_T0, Sim_Ref_OMDP, Sim_Alden_OMDP -// Pipeline_Real, Pipeline_Sim, CHM1ToASM, Autism, Schizo -// Real_Ref, Real_Assembly_Hg38, Real_Molecule_Assembly -// Sim_Haploid_Pipeline, Sim_Diploid_Pipeline -// Sim_Hap_Ref, Sim_Dip_Ref -// YH, Sim2_Haploid_OMBlast -// Han // Alden1_1, Alden1_1000, Alden2_1, Alden2_1000 -//#define Alden2_1000 - -FILE* inputChrConsen; -FILE* inputOptAlign; -FILE* outputResult1; -FILE* outputResult2; -FILE* outputVariantResultFile; -char inputAlignmentFileName[1000]; -char outputFileLocation[500]; -FILE *inputAlignmentFile; -FILE *outputFileList[500]; -LL numberOfOpticalMap; -//#if defined (Real_Assembly_Hg38) || defined(Real_Molecule_Assembly) //FILE* outputMoleculeToAssembly; //FILE* outputAssembly; -//#endif -char tempString[10000]; -LL tempLongLong; -double tempDouble; -double tempDistanceDouble[20000]; -double C[9000][9000]; -bool canOpenFile = true; - - -double minIndelSize; -double minIndelRatio; -double resolutionLimit; -LL inputTrio; -double digestionRate; -double falseCutRate; -double pValueCutOff; -double cauchyMean; -double cauchyScale; -LL numberOfSupportEachSide; -double confidenceLimit; -LL numberOfSupportIndelMolecule; -LL numberOfSupportSignalMolecule; -double likelihoodRatioCutOff; -//Start add -struct opticalMapType{ - char mapId[100]; - LL refStartIndex; - LL refEndIndex; - LL refStart; - LL refEnd; - LL optStart; - LL optEnd; - LL chrId; - bool orientation; - double score; - double confidence; -// char hitEnum[1000]; - char hitEnum[1000]; - double fpr=0;//// - double fnr=0;//// - double alignRate=0;//// - vector position; -/* void calc()//// - {//// - int tot_fp = 0, tot_fn = 0; - for (int i = 0; hitEnum[i]!='\0'; i++) - { - int curr = 0; - int fp, fn; - int j; - for (j = i; hitEnum[j] >= '0' && hitEnum[j] <= '9'; j++) - { - curr = curr * 10 + (hitEnum[j] - '0'); - } - if (hitEnum[j] == 'I') - { - fp++; - tot_fp += curr; - } - else if (hitEnum[j] == 'D') - { - fn++; - tot_fn += curr; - } - i = j; - } - int mol_len = 0; - int ali_len = 0; - int cnt = 0; - for (auto ele:position) - { - cnt++; - mol_len += ele; - if (cnt >= min(optStart,optEnd)&&cnt <= max(optStart,optEnd)) ali_len+=ele; - } - fpr = tot_fp*1.0/ali_len; - fnr = tot_fn*1.0/(abs(refEndIndex-refStartIndex)+1); - alignRate = ali_len*1.0/mol_len; - }////*/ - void print(){ - printf("%s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", mapId, score, confidence, fpr,fnr,alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); -// printf("%s %lf %lf %lld %lld %lld %lld %lld %s %lld ", mapId, score, confidence, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); - for (LL i=0; i<(LL)position.size(); i++) - printf("%d ", position[i]); - printf("\n"); - } - void print(FILE* targetFile){ - sprintf(tempString, "%lld", inputTrio); - fprintf(targetFile, "%s %s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", tempString, mapId, score, confidence, fpr, fnr, alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); -// fprintf(targetFile, "%s %s %lf %lf %lld %lld %lld %lld %lld %s %lld ", tempString, mapId, score, confidence, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); - for (LL i=0; i<(LL)position.size(); i++) - fprintf(targetFile, "%d ", position[i]); - fprintf(targetFile, "\n"); - } -}; -bool ss(opticalMapType q, opticalMapType w){ - LL e = strcmp(q.mapId, w.mapId); - return ((e<0?true:false) || (e==0 && q.chrId < w.chrId) || (e == 0 && q.chrId == w.chrId && q.optStart < w.optEnd)); -} - -vector opticalMap1; - -void readSourceFile(){ - if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); - int numberOfDL = 0; - char ttt; - char ttts[100000]; - while(fgetc(inputAlignmentFile) == '#') - { - numberOfDL++; - fgets(ttts,100000,inputAlignmentFile); - } - LL totSize = 0; - while(fscanf(inputAlignmentFile,"%s",ttts)==1){ - totSize++; - fgets(ttts,100000,inputAlignmentFile); - } - fclose(inputAlignmentFile); - if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); - opticalMap1.clear(); - opticalMap1.resize(totSize+10000); -// opticalMap1 = new opticalMapType[20000000]; - for (LL i=0; i 0 - && opticalMap1[i+1].refStart - opticalMap1[i].refEnd < 5000000){ - opticalMap1[tempCC] = opticalMap1[i]; - opticalMap1[tempCC].optStart = opticalMap1[i].optEnd; - opticalMap1[tempCC].optEnd = opticalMap1[i+1].optStart; - opticalMap1[tempCC].refStart = opticalMap1[i].refEnd; - opticalMap1[tempCC].refEnd = opticalMap1[i+1].refStart; - opticalMap1[tempCC].score = 10.0; - opticalMap1[tempCC].confidence = 0.1; - opticalMap1[tempCC].orientation = true; -// opticalMap1[tempCC].calc();//// - opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// - opticalMap1[tempCC].fnr = opticalMap1[i].fnr + opticalMap1[i+1].fnr;//// - //opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// - opticalMap1[tempCC].position.clear(); -// opticalMap1[tempCC].hitEnum = new char[4]; - strcpy(opticalMap1[tempCC].hitEnum, "FFF"); - if (opticalMap1[tempCC].optStart > opticalMap1[tempCC].optEnd) - tempCC--; - tempCC++; - if (tempCC==totSize){ - totSize+=50000; - opticalMap1.resize(totSize); - } - } - } - numberOfOpticalMap = tempCC; - opticalMap1.resize(numberOfOpticalMap); -} - -void outputToBillMapDestinationFile(){ - for (LL i=0; i numOfChr) continue; - if (outputFileList[opticalMap1[i].chrId] == NULL) - initOutput(opticalMap1[i].chrId,"w+"); - else - initOutput(opticalMap1[i].chrId,"a+"); - opticalMap1[i].print(outputFileList[opticalMap1[i].chrId]); - fclose(outputFileList[opticalMap1[i].chrId]); - } - opticalMap1.clear(); - vector().swap(opticalMap1); -} - - -//stop add - -void setDefault() { - digestionRate = 0.875; - falseCutRate = 0.000010; - pValueCutOff = 1e-9; - cauchyMean = 1.0096; - cauchyScale = 0.0291; - numberOfSupportEachSide = 1; - confidenceLimit = 9; - numberOfSupportIndelMolecule = 10; //coverage of the indel - numberOfSupportSignalMolecule = 10; - likelihoodRatioCutOff = 1e6; // Settings - minIndelSize = 2000; - minIndelRatio = 0.05; - resolutionLimit = 700; - inputTrio = 0507; -} - -LL newChrSite = 0; -LL numberOfHomoMissingFragment = 0; -LL numberOfHomoDeletedSNP = 0; -LL homoMissingSite = 0; -LL numberOfNoSupportSV = 0; -LL homoNonMissingSite = 0; -LL homoInsertSite = 0; -LL homoNonInsertSite = 0; LL heterMissingSite = 0; LL heterNonMissingSite = 0; LL heterInsertSite = 0; LL heterNonInsertSite = 0; LL numberOfNickingSite = 0; LL numberOfOM878 = 0; LL numberOfOM891 = 0; -LL numberOfDetailOpticalMap = 0; LL distancePairCount = 0; LL numberOfVariant = 0; LL numberOfSupportedSV = 0; - -LL tempPre = -1; LL tempCounter = 0; vector listOfChromosome; - -// shortestDist[10] refers to distance between 9-11 -//double shortestDist[500000]; -//LL shortestDistCount[500000]; - -// shortestGapDist[10] refers to distance between 10-11 -//double shortestGapDist[500000]; -//LL shortestGapDistCount[500000]; - - -// 0 = Homo Ins/Del // 1 = Homo Non-Ins/Del // 2 = Heter Ins/Del // 3 = Heter Non-Ins/Del // 4 = nothing - -LL curId; int chrId; - - -bool feq(double x, double y){ - //return fabs(x - y) <= eps; - return fabs(x - y) <= eps*(min(fabs(x),fabs(y)));//should be normalized by the values of x and y -} -bool feq1(double x, double y){ - return fabs(x - y) <= eps; - // return fabs(x - y) <= eps*fabs(min(x,y));//should be normalized by the values of x and y -} - - -LL max(LL x, LL y){ - return x > y ? x : y; -} - -LL min(LL x, LL y){ - return x > y ? y : x; -} - -double average(vector a){ - double ans = 0; - for (LL i=0; i<(LL)a.size(); i++) - ans += a[i]; - ans /= a.size(); - return ans; -} - - -struct variantType{ - LL people; - LL chr; - LL start; - LL end; - LL size; - LL sizeExt; - LL support; -// double score; -// double confidence; -// double fpr;//// - // double fnr;//// -// double alignRate;//// - LL oppoSupp;//// - double ratio; - double ratioExt; - double likelihood; - bool isSignal; - bool isDel; - bool isHomo; - bool isSupported; - void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSizeExt, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double inputRatioExt, double likely, LL supp, LL IoppoSupp) - {//version for SV -// fpr = Ifpr;//// - // fnr = Ifnr;//// - // alignRate = IalignRate;//// - oppoSupp = IoppoSupp;//// -// score = Iscore; -// confidence = Iconfidence; - people = inputPeople; - chr = inputChr; - start = inputStart; - end = inputEnd; - size = inputSize; - sizeExt = inputSizeExt; - ratio = inputRatio; - ratioExt = inputRatioExt; - likelihood = likely; - support = supp; - isSignal = (inputSignal == 1); - isDel = (inputDel == 1); - isHomo = (inputHomo == 1); - isSupported = (inputSupport == 1); - }; - void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double likely, LL supp, LL IoppoSupp) - {//version for signal changes -// fpr = Ifpr;//// - // fnr = Ifnr;//// - // alignRate = IalignRate;//// - oppoSupp = IoppoSupp;//// -// score = Iscore; -// confidence = Iconfidence; - people = inputPeople; - chr = inputChr; - start = inputStart; - end = inputEnd; - size = inputSize; - sizeExt = 0; - ratio = inputRatio; - ratioExt = 0; - likelihood = likely; - support = supp; - isSignal = (inputSignal == 1); - isDel = (inputDel == 1); - isHomo = (inputHomo == 1); - isSupported = (inputSupport == 1); - }; - bool operator<(const variantType &x) const{ - return (chr < x.chr || (chr == x.chr && start < x.start) || (chr == x.chr && start == x.start && end < x.end) || (chr == x.chr && start == x.start && end == x.end && isSupported)); - } - - /*void print(){ - printf("%lld %lld %lld %lld %lld %d %d %d\n", people, chr, start, end, size, isSignal, isDel, isHomo); - };*/ -}; -struct chrType{ - LL length; - LL numberOfSites; -// LL position[50000]; - // distance[0] = position[1] - position[0] -// LL distance[50000]; -// LL oldDistance[50000]; -// LL gapDistance[50000]; - // coverage[1] refer to nicking site 1 -// LL coverage[50000]; - // occurrence[1] refer to nicking site 1 -// LL occurrence[50000]; - // gapCount[1] refers to gap between site 1 to 2 -// LL gapCount[50000]; -// LL gapCoverage[50000]; - // If it is significant by Kevin's definition -// bool gapSigni[50000]; -// bool signi[50000]; - // distance Difference -// double distanceDifference[50000]; -// double gapDistanceDifference[50000]; - LL* position = new LL[10]; - LL* distance = new LL[10]; - int* coverage = new int[10]; - int* occurrence = new int[10]; - int* gapCount = new int[10]; - int* gapCoverage = new int[10]; - bool* gapSigni = new bool[10]; - bool* signi = new bool[10]; -}; -chrType chromosome; - -struct optAlignType{ - LL belongs; - char mapId[100]; - LL optStart; - LL optEnd; - LL refStart; - LL refEnd; - LL numberOfSites; - double score; - double confidence; - char hitEnum[1200]; - - // position = distance[i+1] - distance[i] - int position[2500]; - int oldPosition[2500]; - double fpr;//// - double fnr;//// - double alignRate;//// - -}; - -struct distanceType{ - char mapId[20]; - LL start; - LL end; -// double score; -// double confidence; -// double fpr;//// - // double fnr;//// - // double alignRate;//// - double distance; - LL cnt; - bool operator<(const distanceType &x) const{ - return (start < x.start || (start == x.start && end < x.end)); - } - void print(){ - printf("chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end]-chromosome.position[start], distance, cnt); - } - void printToFile(FILE* outputFile){ - fprintf(outputFile, "chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end] - chromosome.position[start], distance, cnt); - } -}; -vector opticalMap; -vector distancePair; -vector variant; -void init(){ - // Has reduced by half -// opticalMap = new optAlignType[3000000]; -// distancePair = new distanceType[100000000]; - variant.clear(); - variant.resize(100000); -} -char outputFolder[1000]; - -FILE* createInputTrioFile(const char *suffix, LL inputTrio, const char *mode){ - char nameOfFile[1000]; - char buffer[200]; - memset(buffer, 0, sizeof(buffer)); - strcpy(nameOfFile, outputFolder); - sprintf(buffer, "%lld_%g_", inputTrio,minIndelSize); - strcat(nameOfFile, buffer); - strcat(nameOfFile, suffix); - //printf("The file name is: %s\nAnd mode is %s\n",nameOfFile,mode); - FILE* retFile = fopen(nameOfFile, mode); - if (retFile == NULL) perror("error in opening file!"); - return retFile; -} - - - -void initResultFullList(char* SVoutputFile){ - char nameOfFile[1000]; - char buffer[200]; - char extPart[200]; - outputResult1 = createInputTrioFile("Distance_Pairs_List.txt", inputTrio, "w+"); - outputResult2 = createInputTrioFile("Signal_Indels_List.txt", inputTrio, "w+"); /* -#if defined(Real_Molecule_Assembly) - outputMoleculeToAssembly = createInputTrioFile("/local/shared/bill/BillMapFromREAL_Molecule_ASM_HG38/distanceList/", "moleculeToAssembly_List.txt", inputTrio, "w+"); -#endif if defined(Real_Assembly_Hg38) -*/ - sprintf(extPart,"_%g",minIndelSize); - memset(nameOfFile, 0, sizeof(nameOfFile)); - strcpy(nameOfFile, outputFolder); - sprintf(buffer, "%lld", inputTrio); - strcat(nameOfFile, buffer); - strcat(nameOfFile, SVoutputFile); - strcat(nameOfFile, extPart); - strcat(nameOfFile, ".osv"); -// printf("%s\n",nameOfFile); - if ((outputVariantResultFile = fopen(nameOfFile, "w+")) == NULL){ - perror("error opening file(): nameOfFile"); - } -} - -bool overlapped(LL x1, LL y1, LL x2, LL y2){ - if (x1 >= x2 && x1 <= y2) return true; - if (y1 >= x2 && y1 <= y2) return true; - if (x2 >= x1 && x2 <= y1) return true; - if (y2 >= x1 && y2 <= y1) return true; - return false; -} -void getChromosomeList(char* chrMapFile){ - listOfChromosome.clear(); - char nameOfFile[1000]; - strcpy(nameOfFile, chrMapFile); - if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL){ - perror("error opening file(): ChromosomeInfo"); - } - fscanf(inputChrConsen,"%s",tempString); - while(tempString[0]=='#'){ - fgets(tempString, 10000, inputChrConsen); - fscanf(inputChrConsen,"%s",tempString); - } - LL tempChrId, tempNumberOfSites; - double tempDouble1, tempDouble2; - tempChrId = atoll(tempString); - if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); - listOfChromosome.push_back(tempChrId); - fgets(tempString, 10000, inputChrConsen); - while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ - listOfChromosome.push_back(tempChrId); - fgets(tempString, 10000, inputChrConsen); - } - fclose(inputChrConsen); - sort(listOfChromosome.begin(), listOfChromosome.end()); - vector::iterator tempIt; - tempIt = unique(listOfChromosome.begin(), listOfChromosome.end()); - listOfChromosome.resize(distance(listOfChromosome.begin(), tempIt)); - printf("Number of Chromosomes: %lld\n", (LL)listOfChromosome.size()); -} - -void readChromosomeInfo(int chr,char* chrMapFile){ - char nameOfFile[1000]; - strcpy(nameOfFile, chrMapFile); - if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL){ - perror("error opening file(): ChromosomeInfo"); - } - fscanf(inputChrConsen,"%s",tempString); - while(tempString[0]=='#'){ - fgets(tempString, 10000, inputChrConsen); - fscanf(inputChrConsen,"%s",tempString); - } - LL cc = 0; - LL tempChrId, tempNumberOfSites; - double tempDouble1, tempDouble2; - tempChrId = atoll(tempString); - bool done= false; - delete[] chromosome.position; - delete[] chromosome.distance; - delete[] chromosome.coverage; - delete[] chromosome.occurrence; - delete[] chromosome.gapCount; - delete[] chromosome.gapCoverage; - delete[] chromosome.gapSigni; - delete[] chromosome.signi; - if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); - if (tempChrId == chr){ - chromosome.numberOfSites = tempNumberOfSites; - chromosome.position = new LL[tempNumberOfSites+10]; - chromosome.distance = new LL[tempNumberOfSites+10]; - chromosome.coverage = new int[tempNumberOfSites+10]; - chromosome.occurrence = new int[tempNumberOfSites+10]; - chromosome.gapCount = new int[tempNumberOfSites+10]; - chromosome.gapCoverage = new int[tempNumberOfSites+10]; - chromosome.gapSigni = new bool[tempNumberOfSites+10]; - chromosome.signi = new bool[tempNumberOfSites+10]; - done = true; - chromosome.position[cc] = (LL) tempDouble2; - chromosome.length = (LL) tempDouble1; - chromosome.gapCount[cc] = 0; - chromosome.coverage[cc] = 0; - chromosome.occurrence[cc] = 0; - cc++; - } - fgets(tempString, 10000, inputChrConsen); - while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ - if (tempChrId == chr){ - chromosome.numberOfSites = tempNumberOfSites; - if (!done){ - chromosome.position = new LL[tempNumberOfSites+10]; - chromosome.distance = new LL[tempNumberOfSites+10]; - chromosome.coverage = new int[tempNumberOfSites+10]; - chromosome.occurrence = new int[tempNumberOfSites+10]; - chromosome.gapCount = new int[tempNumberOfSites+10]; - chromosome.gapCoverage = new int[tempNumberOfSites+10]; - chromosome.gapSigni = new bool[tempNumberOfSites+10]; - chromosome.signi = new bool[tempNumberOfSites+10]; - done = true; - } - chromosome.position[cc] = (LL) tempDouble2; - chromosome.length = (LL) tempDouble1; - chromosome.gapCount[cc] = 0; - chromosome.coverage[cc] = 0; - chromosome.occurrence[cc] = 0; - cc++; - } - fgets(tempString, 10000, inputChrConsen); - } - chromosome.numberOfSites++; - for (LL i=0; i tempMap; -// vector tempVec[10000]; - LL oppoSupp = 0; -/* bool vectorOverFlow = false; - for (LL i=start-1; i>=0; i--){ - if (distancePair[i].start >= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ - string mapFrom(distancePair[i].mapId); - LL mapTo = (LL)tempMap.size() + 1; - if (tempMap.count(mapFrom) == 0){ - tempMap.insert(make_pair(mapFrom, mapTo)); - } - LL newIndex = tempMap[mapFrom]; - if (newIndex >= 10000){ - vectorOverFlow = true; - break; - } - tempVec[newIndex].push_back(distancePair[i]); - } else - break; - } - if (vectorOverFlow) return; - for (LL i=end+1; i= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ - string mapFrom(distancePair[i].mapId); - LL mapTo = (LL)tempMap.size() + 1; - if (tempMap.count(mapFrom) == 0){ - tempMap.insert(make_pair(mapFrom, mapTo)); - } - LL newIndex = tempMap[mapFrom]; - if (newIndex >= 10000){ - vectorOverFlow = true; - break; - } - tempVec[newIndex].push_back(distancePair[i]); - } else - break; - } - if (vectorOverFlow) return;*/ - double averageScore = 0;//// - double averageConfidence = 0;//// - double averageFPR = 0;//// - double averageFNR = 0;//// - double averageAlignRate = 0;//// -/* for (LL i=1; i<(LL)tempMap.size()+1; i++){ - double tempMoleCoverSize = 0; - double tempMoleRealSize = 0; - double tempScore = 0;//// - double tempConfidence =0;//// - double tempFPR = 0;//// - double tempFNR = 0;//// - double tempAlignRate = 0;//// - for (LL j=0; j<(LL)tempVec[i].size(); j++){ - tempMoleCoverSize += (chromosome.position[tempVec[i][j].end] - chromosome.position[tempVec[i][j].start]); - tempMoleRealSize += tempVec[i][j].distance; - } - if (feq1(tempMoleCoverSize, referenceDistance)) - { - tempDistanceDouble[cnt++] = tempMoleRealSize/referenceDistance; - if (abs(tempMoleRealSize - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - } - //double averageScore = 0; - //double averageConfidence = 0; - for (LL i=start; i<=end; i++) - { - tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; - if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - sort(tempDistanceDouble, tempDistanceDouble + cnt);*/ - - vector maps; - vector dists; - maps.clear(); - dists.clear(); -// fprintf(outputResult1, "%d %lld %lld %lld %lld\t", chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], distancePair[start].start+1, distancePair[start].end+1); - for (LL i=start; i<=end; i++) - { - bool fal = true; - for (LL j = 0; j < (LL)maps.size(); j++) - { - if (maps[j] == atol(distancePair[i].mapId) && abs(dists[j] - distancePair[i].distance) < 1){fal = false;break;} - } - if (fal) - { - maps.push_back(atol(distancePair[i].mapId)); - dists.push_back(distancePair[i].distance); - tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; - // fprintf(outputResult1, "%lld:%lld ",atol(distancePair[i].mapId),(LL)distancePair[i].distance); - //printf("Let me see: %g\t%g\t%lld\n", distancePair[i].distance,referenceDistance,oppoSupp); - if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - } - // fprintf(outputResult1, "%lld\n",(LL)maps.size()); - // for (LL i = 0; i < (LL)maps.size(); i++) - // fprintf(outputResult1, "%lld:%lld ",maps[i],(LL)dists[i]); - //fprintf(outputResult1, "\n"); -// fprintf(outputResult1, "%lld\n",cnt); - sort(tempDistanceDouble, tempDistanceDouble + cnt); - if (cnt < numberOfSupportIndelMolecule) {//why 2500? - // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0); - // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0); - // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0); - numberOfNoSupportSV++; - return; - } - //use another vector to remove the outliers in two ends to purify the results - //int outRange = static_cast(cnt/10)>1?static_cast(cnt/10):1; - int outlierRange = 0; - double tempDistanceDoubleNoOutlier[cnt]; - LL cntNoOutlier = 0; - for (LL i = outlierRange; i < cnt-outlierRange;) - { - tempDistanceDoubleNoOutlier[cntNoOutlier++] = tempDistanceDouble[i++]; - } - double sampleMean = 0; - for (LL i=0; i nullHypothesisLikelihood) - { - // Update Homo Indel Variant: also need to consider HI1I2, HD1D2, HDI - if (feq(tempMaxLikelihood, homoIndelHypothesisLikelihood)) - { - if (homoIndelHypothesisLikelihood/nullHL > hetIHL/nullHL*hetDHL/nullHL) - { // if homoHL_ratio > hetIHL_ratio*hetDHL_ratio, we consider homo Indel - //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; - if (tempSizeChange < 0) - { - posFlag = 1; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, tempMaxLikelihood, cnt, oppoSupp); - } - else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) - { - posFlag = 2; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, tempMaxLikelihood, cnt, oppoSupp); - } - } - else if ( ( hetIHL/likelihoodRatioCutOff>nullHL )&&( hetDHL/likelihoodRatioCutOff>nullHL ) )//(hetIHL/nullHL>likelihoodRatioCutOff&&hetDHL/nullHL>likelihoodRatioCutOff) - { // if homoHL_ratio > hetIHL_ratio*hetDHL_ratio, we consider the potential HI1I2, HD1D2, HDI - double disRight = heterInsHypothesisMaxLikelihoodSizeChanged; - double disLeft = heterDelHypothesisMaxLikelihoodSizeChanged; - double disBias = disRight - disLeft;//&&(max(hetIHL/hetDHL,hetDHL/hetIHL)<1e10) - if ( ( cnt>=numberOfSupportIndelMolecule*1.5)&&\ - ( fabs(disBias)>=max(minIndelSize,minIndelRatio*max(fabs(disRight),fabs(disLeft))) ) ) - //( hetIHL/likelihoodRatioCutOff/likelihoodRatioCutOff>nullHL )&&( hetDHL/likelihoodRatioCutOff/likelihoodRatioCutOff>nullHL ) ) - {// if all three differences are significant, then call HI1I2, HD1D2, HDI resp., and store them into separate two variants with the same ref region first - if (disRight > 0) - { - posFlag = 3; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag, hetIHL, cnt, oppoSupp); - } - else - { - posFlag = 4; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 1, 0, 1, hetIHL/nullHL, posFlag, hetIHL, cnt, oppoSupp); - } - if (disLeft > 0) - { - posFlag = 5; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 0, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); - } - else - { - posFlag = 6; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); - } - } - else - {// else also output homo Indel - //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; - if (tempSizeChange < 0) - { - posFlag = 7; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) - { - posFlag = 8; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - } - } - else - { - if (tempSizeChange < 0) - { - posFlag = 9; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) - { - posFlag = 10; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - } - } - // Update Heter InDel Variant, need to modified tomorrow - else if (feq(tempMaxLikelihood, hetIHL)||feq(tempMaxLikelihood, hetDHL)) - { - double disRight = heterInsHypothesisMaxLikelihoodSizeChanged; - double disLeft = heterDelHypothesisMaxLikelihoodSizeChanged; - double disBias = disRight - disLeft; - - if ( (hetDHL/likelihoodRatioCutOff > nullHL) && (hetIHL/likelihoodRatioCutOff > nullHL) ) - {//if two sides are both confident, then we need to check the possible HI1I2, HD1D2, and HDI - //printf("Actually no such cases?\n"); - if ( (cnt>=numberOfSupportIndelMolecule*1.5 )&&\ - ( fabs(disBias)>=max(minIndelSize,minIndelRatio*max(fabs(disRight),fabs(disLeft))) ) ) - //( min(hetDHL,hetIHL)/likelihoodRatioCutOff/likelihoodRatioCutOff > nullHL ) ) - {// if all three differences are significant, then call HI1I2, HD1D2, HDI resp., and store them into separate two variants with the sam$ - if (disRight > 0) - { - posFlag = -1; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL, cnt, oppoSupp); - } - else - { - posFlag = -2; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 1, 0, 1, hetIHL/nullHL, posFlag, hetIHL,cnt, oppoSupp); - } - if (disLeft > 0) - { - posFlag = -3; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 0, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); - } - else - { - posFlag = -4; - // printf("Find one case with position flag %d\n",posFlag); - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt, oppoSupp); - } - } - else if ((disRight*disLeft>0)&&(homoIndelHypothesisLikelihood/nullHL > max(max(hetIHL,hetDHL)/min(hetIHL,hetDHL),likelihoodRatioCutOff)))//is it appropriate? No, I need to - {// else if homo is more significant when adjust the heter case (reduce significance if both side support the same case), then output homo Indel - //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; - if (tempSizeChange < 0) - { - posFlag = -5; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) - { - posFlag = -6; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); - } - } - else - if (hetIHL>hetDHL) - { - posFlag = -7; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL,cnt, oppoSupp); - } - else //if ((hetIHL= max(minIndelSize, minIndelRatio * referenceDistance))) - { - posFlag = -8; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt, oppoSupp); - } - } - else - { - if (hetIHL>hetDHL) - { - posFlag = -9; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL,cnt, oppoSupp); - } - else //if (fabs(disLeft) >= max(minIndelSize, minIndelRatio * referenceDistance)) - { - posFlag = -10; - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt,oppoSupp); - } - } - } - } -// if (chromosome.position[distancePair[start].start]==9810098) // { // printf("Let me see, the segment if (%lld, %lld), and the position flag is %d.\n",chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], posFlag); // } -} - -void completePairs() -{ -// sort(distancePair, distancePair + distancePairCount); - sort(distancePair.begin(), distancePair.end()); - vector extraDP; - extraDP.clear(); - LL ost = -1, oed = -1; - for (LL i = 0; i < distancePairCount; i++) - { - if (distancePair[i].start!= ost || distancePair[i].end!=oed)//this locate the non-redundant reference regions (NRRR) - { - distanceType tempPair; - ost = distancePair[i].start; - oed = distancePair[i].end; - if (oed-ost == 1)continue; - for (LL j = 0; j <= i-1; j++)//this search the possible sub-regions - { - //LL ted = distancePair[j].end; - if (distancePair[j].start < ost) continue; - else if (distancePair[j].start > ost) break; - else//for each sub-region holding the same start point with NRRR, we explore the consecutive regions on the same molecule - { - tempPair = distancePair[j]; - bool rep = true; - for (LL k = j+1; distancePair[k].start= oed) break; - } - if (tempPair.end == oed) //no need - { - rep = false; - for (LL l = i; l < distancePairCount; l++) - { - if (distancePair[i] < distancePair[l])break; - else if (strcmp(distancePair[l].mapId, tempPair.mapId)==0) {rep = true; break;} - } - } - if (!rep) extraDP.push_back(tempPair); - } - } - } - else - continue; - } - printf("There are %lld extra distance pairs are added.\n",(LL)extraDP.size()); - // also need to include the new distance pairs into the original set - distancePair.resize(distancePairCount+extraDP.size()); - for (LL i = 0; i < (LL)extraDP.size(); i++){ - distancePair[distancePairCount++]=extraDP[i]; - } -} -void detectByDistancePair(){ - sort(distancePair.begin(), distancePair.end());// sort(distancePair, distancePair + distancePairCount); - distancePair.resize(distancePairCount+1);// printDistancePair(); - distancePair[distancePairCount].start = -1; - distancePair[distancePairCount].end = -1; - distancePairCount++; - LL tempHead = 0, tempTail = 0, tempPreNumVar, tempPreNumNoSuppVar; - for (LL i=0; i=distancePairCount?0:distancePair[i+1].distance; - } - distancePairCount = cc; - distancePair.resize(cc); -// memset(shortestDist, 0, sizeof(shortestDist)); -// memset(shortestDistCount, 0, sizeof(shortestDistCount)); -// memset(shortestGapDist, 0, sizeof(shortestGapDist)); -// memset(shortestGapDistCount, 0, sizeof(shortestGapDistCount)); -// for (LL i=0; i= '0' && opticalMap[i].hitEnum[j] <= '9'){ - tempCount = tempCount * 10 + (opticalMap[i].hitEnum[j]-'0'); - } - else { - if (opticalMap[i].hitEnum[j] == 'M'){ - for (LL k=0; kdistancePair.size()-10000)distancePair.resize(distancePair.size()+100000); - } - distancePair.resize(distancePairCount); - opticalMap.clear(); - vector().swap(opticalMap); -} -void NCRcalculation(){ - for (int i = 0; i < 9000; ++i){ - C[i][0] = C[i][i] = 1; - for (int j = 1; j < i; ++j) - C[i][j] = (C[i - 1][j] + C[i - 1][j - 1]); - for (int j = i + 1; j < 9000; ++j) - C[i][j] = 0; - } -} - -void deletionTest(int chrId){ -// if (chrId == 1){ -// fprintf(outputResult2, "ID\tChr\tSite_Index\tPosition\tCoverage\tOccurence\tP\tU\tP-value\tH0\tHa\tHb\tRef_Dis\tMolecule_Dis\tCount\tType\n"); -// } - for (LL i=0; i 5000) continue; - - // printf("%lld %lld %lld %lld\n", i, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i]); - numberOfNickingSite++; - double pValue = 0; - double likelihoodh0 = 0; - double likelihoodha = 0; - double likelihoodhb = 0; - double likelihoodRatio = 0; - for (LL j=chromosome.coverage[i] - chromosome.occurrence[i]; j<=chromosome.coverage[i]; j++){ - pValue += C[chromosome.coverage[i]][j]*pow(1-digestionRate, j)*pow(digestionRate, chromosome.coverage[i] - j); - } - likelihoodh0 = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(1-digestionRate, chromosome.coverage[i] - chromosome.occurrence[i])*pow(digestionRate, chromosome.occurrence[i]); - - double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); - double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); - - atLeastOneFalseCutRate = 0.1; - likelihoodha = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(atLeastOneFalseCutRate, chromosome.occurrence[i])*pow(1 - atLeastOneFalseCutRate, chromosome.coverage[i] - chromosome.occurrence[i]); - // double likelihoodRatio2 = likelihoodh0/likelihoodha; - likelihoodRatio = pow((1-digestionRate)/(1-atLeastOneFalseCutRate), chromosome.coverage[i] - chromosome.occurrence[i]) * pow(digestionRate/atLeastOneFalseCutRate, chromosome.occurrence[i]); - - LL x = chromosome.coverage[i] - chromosome.occurrence[i]; - LL k = chromosome.coverage[i]; - double p = digestionRate; - double u = atLeastOneFalseCutRate; - for (LL k1 = 0; k1 <= k; k1++){ - double tempLhb = 0; - LL k2 = k - k1; - for (LL y=max(0, x - k2); y <= min(k1, x); y++){ - tempLhb += C[k1][y]*C[k2][x-y]*pow(1-p, y)*pow(p, k1-y)*pow(u, y+k2-x)*pow(1-u, x-y); - } - - likelihoodhb += C[k][k1]*tempLhb; - } - likelihoodhb /= pow(2, k); - - -// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, (i+1=0?chromosome.position[i-1]:0), (LL)shortestDist[i], shortestDistCount[i], "Del"); - - -// chromosome.distanceDifference[i] = (i+1=0?chromosome.position[i-1]:0) - shortestDist[i]; - - - if (pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) - chromosome.signi[i] = true; - else chromosome.signi[i] = false; - - if (chromosome.signi[i]){ - if (likelihoodha > likelihoodhb){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 1, 1, pValue, likelihoodha, x, 0); - } else { - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 0, 1, pValue, likelihoodhb, x, 0); - } - if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); - } - // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Miss"); - } -} - -void insertionTest(int chrId){ - for (LL i=0; i 5000) continue; - - double pValue = 0; - double likelihoodh0 = 0; - double likelihoodha = 0; - double likelihoodhb = 0; - double likelihoodRatio = 0; - double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); - double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); - - - /* - TEST - atLeastOneFalseCutRate = 0.1; - TEST - */ - for (LL j=chromosome.gapCount[i]; j<=chromosome.gapCoverage[i]; j++){ - pValue += C[chromosome.gapCoverage[i]][j]*pow(atLeastOneFalseCutRate, j)*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - j); - } - likelihoodh0 = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(atLeastOneFalseCutRate, chromosome.gapCount[i])*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); - likelihoodha = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(digestionRate, chromosome.gapCount[i])*pow(1-digestionRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); - likelihoodRatio = pow((1-atLeastOneFalseCutRate)/(1-digestionRate), chromosome.gapCoverage[i] - chromosome.gapCount[i]) * pow(atLeastOneFalseCutRate/digestionRate, chromosome.gapCount[i]); - - // printf("%10lE %10lE\n", likelihoodh0 / likelihoodha, likelihoodRatio); - - LL x = chromosome.gapCount[i]; - LL k = chromosome.gapCoverage[i]; - double p = digestionRate; - double u = atLeastOneFalseCutRate; - for (LL k1 = 0; k1 <= k; k1++){ - double tempLhb = 0; - LL k2 = k - k1; - for (LL y=max(0, x - k2); y <= min(k1, x); y++){ - tempLhb += C[k1][y]*C[k2][x-y]*pow(u, y)*pow(1-u, k1-y)*pow(1-p, y+k2-x)*pow(p, x-y); - } - likelihoodhb += C[k][k1]*tempLhb; - } - likelihoodhb /= pow(2, k); - - -// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, ((i+1= numberOfSupportSignalMolecule && pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) - chromosome.gapSigni[i] = true; - else chromosome.gapSigni[i] = false; - - if (chromosome.gapSigni[i]){ - if (likelihoodha > likelihoodhb){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 1, 1, pValue, likelihoodha, x, 0); - } else { - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 0, 1, pValue, likelihoodha, x, 0); - } - if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); - } - // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId,chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Ins"); - } -} - -void initStatData(){ - homoMissingSite = 0; - homoNonMissingSite = 0; - homoInsertSite = 0; - homoNonInsertSite = 0; - heterMissingSite = 0; - heterNonMissingSite = 0; - heterInsertSite = 0; - heterNonInsertSite = 0; - numberOfNickingSite = 0; - numberOfHomoDeletedSNP = 0; - numberOfHomoMissingFragment = 0; - numberOfVariant = 0; - numberOfNoSupportSV = 0; -} -void initData(){ - memset(chromosome.position, 0, sizeof(chromosome.position)); - memset(chromosome.coverage, 0, sizeof(chromosome.coverage)); - memset(chromosome.occurrence, 0, sizeof(chromosome.occurrence)); - memset(chromosome.gapCoverage, 0, sizeof(chromosome.gapCoverage)); - memset(chromosome.gapCount, 0, sizeof(chromosome.gapCount)); - chromosome.length = 0; - chromosome.numberOfSites = 0; - //memset(opticalMap, 0, sizeof(opticalMap)); - //memset(distancePair, 0, sizeof(distancePair)); - distancePairCount = 0; -} -void correctOverlapVariant(){ -// delete opticalMap; -// delete distancePair; -// variantType *tempVariant = new variantType[500000]; -// variantType *tempRealVariant = new variantType[500000]; - vector tempVariant; - tempVariant.resize(variant.size()+10000); - vector tempRealVariant; - tempRealVariant.resize(variant.size()+10000); - LL tempCnt = 0, tempRealCnt = 0; - // Consider Segmental Only - for (LL i=0; i=tempRealVariant[sRep+1].ratio?sRep:sRep+1; - LL chooseY = tempRealVariant[sRep].ratio= tempRealVariant[tempIndex].end) || \ - (tempRealVariant[tempIndex].start <= tempRealVariant[tempIndex+1].start && tempRealVariant[tempIndex].end >= tempRealVariant[tempIndex+1].end) - )) - tempIndex++; - tempEnd = tempIndex; - LL tempMaxLength = -1, tempMaxLengthIndex = -1; - LL tempDelCnt = 0, tempInsCnt = 0, tempHomoCnt = 0, tempHeterCnt = 0; - -////////////////////////S1:find the maximum length//////////////////////////////// -/* - // Find non-signal variants first - for (LL i=tempStart; i<=tempEnd; i++){ - if (!tempRealVariant[i].isSignal && tempRealVariant[i].end - tempRealVariant[i].start > tempMaxLength){ - tempMaxLength = tempRealVariant[i].end - tempRealVariant[i].start; - tempMaxLengthIndex = i; - } - } - // If can't find, find others - if (tempMaxLengthIndex!=-1) tempVariant[tempCnt++] = tempRealVariant[tempMaxLengthIndex]; -*/ -////////////////////////S2:find the cases without completely covering any others/////////////////////////////////////////////// - for (LL i=tempStart; i<=tempEnd; i++) - { - if (tempRealVariant[i].isSignal)continue; - bool flag = true; - for (LL j = tempStart; j <=tempEnd; j++) - { - if (i==j)continue; - if (tempRealVariant[i].end >= tempRealVariant[j].end && tempRealVariant[i].start <= tempRealVariant[j].start) - { - flag = false; - break; - } - } - if (flag) tempVariant[tempCnt++] = tempRealVariant[i]; - } - //tempCnt++; - tempStart = tempEnd+1; - } - - - - printf("numberOfRealVariant after filter overlapping: %lld\n", tempCnt); - - for (LL i=0; i0&&variant[i].sizeExt>0) - { - strcpy(tempH,"HI1I2"); - strcpy(tempIndel,"Insertions"); - } - else if (variant[i].size<0&&variant[i].sizeExt<0) - { - strcpy(tempH,"HD1D2"); - strcpy(tempIndel,"Deletions"); - } - else if (variant[i].size*variant[i].sizeExt<0) - { - strcpy(tempH,"HDI"); - strcpy(tempIndel,"Both"); - } - else - { - strcpy(tempH,(variant[i].isHomo?"Homozygous":"Heterozygous")); - strcpy(tempIndel,(variant[i].isDel?"Deletion":"Insertion")); - } -// if (strcmp(tempH,"Homozygous")!=0&&strcmp(tempH,"Heterozygous")!=0){ - fprintf(outputVariantResultFile, "%lld\t%lld\t%lld\t%s%s\t%lld\t%lf\t%lld\tsizeChange=%lld\t%s\t%g\t%g\t%lld\t%lld\n", variant[i].chr, variant[i].start, variant[i].end, tempIndel, variant[i].isSignal?".site":"", variant[i].sizeExt, 0.0, inputTrio, variant[i].size, tempH, variant[i].likelihood, variant[i].ratio,variant[i].support, variant[i].oppoSupp); - numberOfSupportedSV++; -// } - } -} - -int main(int argv, char *argc[]){ - printf("\n"); - setDefault(); - char SVoutputFile[1000]; - memset(SVoutputFile,0,sizeof(SVoutputFile)); - strcpy(SVoutputFile,"Detected_structual_variants"); - memset(outputFolder,0,sizeof(outputFolder)); - char chrMapFile[1000]; - int numOfChr = 24; - memset(chrMapFile,0,sizeof(chrMapFile)); - strcpy(inputAlignmentFileName,"Alden_1000_c666_1/combined_1000_c1.oma"); - strcpy(outputFileLocation,"Alden_1000_c666_1/"); - strcpy(chrMapFile,"/home/lil/OM/RefMaps_c666_1/GRCh38_25chr.condense.1000.cmap"); - if (argv == 1) - { - printf("\nThe valid parameters are described as follows:\n"); - printf("\t-inputLabel: \n\t\t Default value: %lld. The index of genome in the trio data. No effect if only investigate one genome.\n",inputTrio); - printf("\t-outputFolder: \n\t\t The path of the folder to store the output fils. This folder must be exist!\n"); - printf("\t-SVoutputFile: \n\t\t Default value: %s. The file name of SVs (.osv).\n",SVoutputFile); - printf("\t-chrMapFile: \n\t\t Default value: %s. The file name of the reference map(.cmap).\n",chrMapFile); - printf("\t-optAlignFile: \n\t\t Default value: %s. The file name of the alignment map file(.oma).\n",inputAlignmentFileName); - printf("\t-optTempFolder: \n\t\t Default value: %s. The folder to store the processed alignment maps by chromosomes.\n",outputFileLocation); - printf("\t-likelihoodRatioCutOff: \n\t\t Default value: %g. The cutoff of the likelihood ratio for SV all hypothesis. The default value changes along with the experiment data.\n",likelihoodRatioCutOff); - printf("\t-numberOfSupportIndelMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment being called SVs. The default value changes along with the experiment data.\n",numberOfSupportIndelMolecule); - printf("\t-numberOfSupportSignalMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment to call signal variations. The default value changes along with the experiment data.\n",numberOfSupportSignalMolecule); - printf("\t-minIndelSize: \n\t\t Default value (b): %g. The minimum length of a segment to call SVs.\n",minIndelSize); - printf("\t-minIndelRatio: \n\t\t Default value: %g. The length proportion of a minimum SV could be detected on a segment. E.g. segment = 10000b, then the length of the minimum SV should be larger than 10000*0.05=500b.\n",minIndelRatio); - printf("\t-resolutionLimit: \n\t\t Default value: %g. The minimum length of a segment to call signal variations.\n",resolutionLimit); - printf("\t-digestionRate: \n\t\t Default value: %g. The digestion rate of labels (signals) measured in the experiment.\n",digestionRate); - printf("\t-falseCutRate: \n\t\t Default value: %g. The rate of false cut of a non-label position.\n",falseCutRate); - printf("\t-pValueCutOff: \n\t\t Default value: %g. The cutoff of p-value when call signal variations.\n",pValueCutOff); - printf("\t-cauchyMean: \n\t\t Default value: %g. The mean value of cauchy distribution of null hypothesis when calling SVs. Reset a new value only if you have good reason.\n",cauchyMean); - printf("\t-cauchyScale: \n\t\t Default value: %g. The parameter to calculate cauchy distribution. Reset a new value only if you have good reason.\n",cauchyScale); - printf("\t-confidenceLimit: \n\t\t Default value: %g. The lowest alignment confidence for molecules (optical maps) to call SVs or signal variations.\n",confidenceLimit); - printf("\t-numberOfChromosome: \n\t\t Default value: %d. The first n chromosomes to detect SVs.\n",numOfChr); - return -1; - } - bool paraWrongFlag = false; - for (int i = 1; i < argv; i=i+2) - { - string temp(argc[i]); - //printf("What is the parameter?? %s \n",temp.c_str()); - if (temp.compare("-inputLabel")==0) - inputTrio = atol(argc[i+1]); - else if (temp.compare("-numberOfChromosome")==0) - numOfChr = atoi(argc[i+1]); - else if (temp.compare("-optAlignFile")==0) - { - memset(inputAlignmentFileName,0,sizeof(inputAlignmentFileName)); - strcpy(inputAlignmentFileName, argc[i+1]); - } - else if (temp.compare("-optTempFolder")==0) - { - memset(outputFileLocation,0,sizeof(outputFileLocation)); - strcpy(outputFileLocation, argc[i+1]); - } - else if (temp.compare("-SVoutputFile")==0) - { - memset(SVoutputFile,0,sizeof(SVoutputFile)); - strcpy(SVoutputFile,argc[i+1]); - } - else if (temp.compare("-chrMapFile")==0) - { - memset(chrMapFile,0,sizeof(chrMapFile)); - strcpy(chrMapFile,argc[i+1]); - } - else if (temp.compare("-outputFolder")==0) - strcpy(outputFolder,argc[i+1]); - else if (temp.compare("-likelihoodRatioCutOff")==0) - { likelihoodRatioCutOff = atof(argc[i+1]); - if (likelihoodRatioCutOff < 0) - { - printf("Error! An negative likelihoodRatioCutOff is inputted!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-numberOfSupportIndelMolecule")==0) - { - numberOfSupportIndelMolecule = atoi(argc[i+1]); - if (numberOfSupportIndelMolecule < 0) - { - printf("Error! An negative numberOfSupportIndelMolecule is inputted!\n"); - paraWrongFlag = true; - } - if (numberOfSupportIndelMolecule >= 1000) - { - printf("Warning! An improper large (>=1000) numberOfSupportIndelMolecule is inputted!\n"); - } - } - else if (temp.compare("-minIndelSize")==0) - { - minIndelSize = atof(argc[i+1]); - if (minIndelSize < 1) - { - printf("Error! Please make sure the minIndelSize is a positive integer!\n"); - paraWrongFlag = true; - } - if (minIndelSize >= 100000) - { - printf("Warning! An improper large number (>=100000b) Of minIndelSize is inputted!\n"); - } - } - else if (temp.compare("-minIndelRatio")==0) - { - minIndelRatio = atof(argc[i+1]); - if (minIndelRatio <= 0||minIndelRatio >= 1) - { - printf("Error! An improper number Of minIndelRatio is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-resolutionLimit")==0) - { - resolutionLimit = atof(argc[i+1]); - if (resolutionLimit < 0) - { - printf("Error! An negative number Of resolutionLimit is inputted!\n"); - paraWrongFlag = true; - } - if (resolutionLimit >= 10000) - { - printf("Warning! An improper large number (>=10000) of resolutionLimit is inputted!\n"); - } - } - else if (temp.compare("-digestionRate")==0) - { - digestionRate = atof(argc[i+1]); - if (digestionRate <= 0||digestionRate>1) - { - printf("Error! An improper digestionRate is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (digestionRate < 0.5) - { - printf("Warning! An very low level of digestionRate (<0.5) is adapted!\n"); - } - } - else if (temp.compare("-falseCutRate")==0) - { - falseCutRate = atof(argc[i+1]); - if (falseCutRate < 0||falseCutRate>=1) - { - printf("Error! An improper falseCutRate is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (falseCutRate > 0.8) - printf("Warning! An very large (>0.8) falseCutRate is inputted!\n"); - } - else if (temp.compare("-pValueCutOff")==0) - { - pValueCutOff = atof(argc[i+1]); - if (pValueCutOff <= 0||pValueCutOff>=1) - { - printf("Error! An improper pValueCutOff is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (pValueCutOff > 0.1) - { - printf("Warning! An non-significant (>0.1) pValueCutOff is inputted!\n"); - } - } - else if (temp.compare("-cauchyMean")==0) - { - cauchyMean = atof(argc[i+1]); - if (cauchyMean < 0.8||cauchyMean > 1.2) - { - printf("Error! cauchyMean should be around 1!\n"); - paraWrongFlag = true; - } - else - printf("You have changed the value of cauchyMean to be %g, please make sure it is your intention!\n",cauchyMean); - } - else if (temp.compare("-cauchyScale")==0) - { - cauchyScale = atof(argc[i+1]); - if (cauchyScale < 0) - { - printf("Warning! An negative cauchyScale is inputted!\n"); - paraWrongFlag = true; - } - else - printf("You have changed the value of cauchyScale to be %g, please make sure it is your intention!\n",cauchyScale); - } - else if (temp.compare("-confidenceLimit")==0) - { - confidenceLimit = atof(argc[i+1]); - if (cauchyScale < 0) - { - printf("Error! An negative confidenceLimit is inputted!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-numberOfSupportSignalMolecule")==0) - { - numberOfSupportSignalMolecule = atol(argc[i+1]); - if (numberOfSupportSignalMolecule < 0) - { - printf("Error! An negative numberOfSupportSignalMolecule is inputted!\n"); - paraWrongFlag = true; - } - if (numberOfSupportSignalMolecule > 1000) - { - printf("Warning! An very large (>1000) numberOfSupportSignalMolecule is inputted!\n"); - } - } - else - { - printf("No such parameter or wrong : %s\n",argc[i]); - paraWrongFlag = true; - } - } - if (paraWrongFlag) - { - printf("\nThe format to arrange parameters is:\n\t./makeRefine_en -param_1 val_1 -param_2 val_2 ... -param_n val_n\n"); - printf("\nPlease check your input parameters!\n\n"); - return -1; - } - - init(); - NCRcalculation(); - // findResolutionDistribution(); return 0; - initResultFullList(SVoutputFile); - getChromosomeList(chrMapFile); - initStatData(); - //new content - bool canOF = true; - numOfChr = 24; - for (LL tempChr = 0; tempChr < (LL)listOfChromosome.size(); tempChr++){ - LL chrID = listOfChromosome[tempChr]; - if (chrID > numOfChr) continue; - char buff[200]; - char nameOF[1000]; - strcpy(nameOF, outputFileLocation); - sprintf(buff, "%lld_%d",inputTrio,chrID); - strcat(nameOF, buff); - strcat(nameOF, ".bmap"); - if ((inputOptAlign = fopen(nameOF, "r")) == NULL) - { - canOF = false; - break; - } - } - if (!canOF) - { - readSourceFile(); - puts("DONE readSourceFile"); - addSplitedMap(); - puts("DONE addSplitedMap"); - outputToBillMapDestinationFile(); - } - //readSourceFile(); - //puts("DONE readSourceFile"); - //addSplitedMap(); - //puts("DONE addSplitedMap"); - //outputToBillMapDestinationFile(numOfChr); - //content end - for (LL tempChrId = 0; tempChrId < (LL)listOfChromosome.size(); tempChrId++){ - chrId = listOfChromosome[tempChrId]; - //if (chrId > numOfChr) continue; - // for (chrId = 12; chrId <= 12; chrId++){ - canOpenFile = true; - initData(); - // printf("Doing %d\n", chrId); - readChromosomeInfo(chrId,chrMapFile); - // printf("Done Read Chromosome\n"); - readOpticalAlign(chrId,outputFileLocation); - if (!canOpenFile) - { - printf("Open File fails?\n"); - continue; - } - // printf("Done Read Optical\n"); - //printDistancePair("test_output/Distance_before_parsing"); - printf("Distance Pair Count-1: %lld\n", distancePairCount); - parseHitEnum(); -// printf("Done parsehitEnum\n"); - printf("Distance Pair Count-2: %lld\n", distancePairCount); - completePairs(); - // printf("pair count: %lld\n", distancePairCount); - //printDistancePair("test_output/Distance_after_parsing"); - detectByDistancePair(); - //printDistancePair("test_output/Distance_after_detection"); - printf("Distance Pair Count-3: %lld\n", distancePairCount); -// printf("Done detectByDistancePair\n"); - -// processDistancePair(); -// printf("Done processDistancePair\n"); - //printDistancePair("test_output/Distance_after_processing"); -// printf("Distance Pair Count-4: %lld\n", distancePairCount); -// printf("Number of the sites: %lld\n",chromosome.numberOfSites); - deletionTest(chrId); -// printf("Done Deletion\n"); - insertionTest(chrId); - // printf("Done Insertion\n"); - } - correctOverlapVariant(); - outputVariantResult(argv,argc); -// printf("Number of No support SV: %lld\n", numberOfNoSupportSV); - printf("Number of support SV: %lld\n", numberOfSupportedSV); - printf("\n"); - return 0; -} From fc1aa71397cbf937810ae73de7e87f54362ff1a8 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:26:38 +0800 Subject: [PATCH 05/13] Delete OMSV_v3.cpp --- source/OMSV_v3.cpp | 2066 -------------------------------------------- 1 file changed, 2066 deletions(-) delete mode 100644 source/OMSV_v3.cpp diff --git a/source/OMSV_v3.cpp b/source/OMSV_v3.cpp deleted file mode 100644 index dbdfa66..0000000 --- a/source/OMSV_v3.cpp +++ /dev/null @@ -1,2066 +0,0 @@ -#include "NoError.h" -// Can choose Alden, Ref, CHM1, CHM1_2, Angel1, Angel2, Angel3, Angel4, -// Ref_Sim, Alden_sim_T0, Sim_Ref_OMDP, Sim_Alden_OMDP -// Pipeline_Real, Pipeline_Sim, CHM1ToASM, Autism, Schizo -// Real_Ref, Real_Assembly_Hg38, Real_Molecule_Assembly -// Sim_Haploid_Pipeline, Sim_Diploid_Pipeline -// Sim_Hap_Ref, Sim_Dip_Ref -// YH, Sim2_Haploid_OMBlast -// Han -// Alden1_1, Alden1_1000, Alden2_1, Alden2_1000 -//#define Alden2_1000 - -FILE* inputChrConsen; -FILE* inputOptAlign; -FILE* outputResult1; -FILE* outputResult2; -FILE* outputVariantResultFile; -char inputAlignmentFileName[1000]; -char outputFileLocation[500]; -FILE *inputAlignmentFile; -FILE *outputFileList[500]; -LL numberOfOpticalMap; -//#if defined (Real_Assembly_Hg38) || defined(Real_Molecule_Assembly) -//FILE* outputMoleculeToAssembly; -//FILE* outputAssembly; -//#endif -char tempString[10000]; -LL tempLongLong; -double tempDouble; -double tempDistanceDouble[20000]; -double C[9000][9000]; -bool canOpenFile = true; - - -double minIndelSize; -double minIndelRatio; -double resolutionLimit; -LL inputTrio; -double digestionRate; -double falseCutRate; -double pValueCutOff; -double cauchyMean; -double cauchyScale; -LL numberOfSupportEachSide; -double confidenceLimit; -LL numberOfSupportIndelMolecule; -LL numberOfSupportSignalMolecule; -double likelihoodRatioCutOff; -//Start add -struct opticalMapType{ - char mapId[100]; - LL refStartIndex; - LL refEndIndex; - LL refStart; - LL refEnd; - LL optStart; - LL optEnd; - LL chrId; - bool orientation; - double score; - double confidence; - char hitEnum[1000]; - double fpr=0;//// - double fnr=0;//// - double alignRate=0;//// - vector position; - void print(){ - printf("%s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", mapId, score, confidence, fpr,fnr,alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); - for (LL i=0; i<(LL)position.size(); i++) - printf("%d ", position[i]); - printf("\n"); - } - void print(FILE* targetFile){ - sprintf(tempString, "%lld", inputTrio); - fprintf(targetFile, "%s %s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", tempString, mapId, score, confidence, fpr, fnr, alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); - for (LL i=0; i<(LL)position.size(); i++) - fprintf(targetFile, "%d ", position[i]); - fprintf(targetFile, "\n"); - } -}; -bool ss(opticalMapType q, opticalMapType w){ - LL e = strcmp(q.mapId, w.mapId); - return ((e<0?true:false) || (e==0 && q.chrId < w.chrId) || (e == 0 && q.chrId == w.chrId && q.optStart < w.optEnd)); -} - -vector opticalMap1; -void getValue(){ //Note: this value is in KB! - FILE* file = fopen("/proc/self/status", "r"); - int result = -1; - char line[128]; - char tp[100]; - while (fgets(line, 6, file) != NULL){ - if (line[0]=='V'&&line[1]=='m'&&line[2]=='R'&&line[3]=='S'&&line[4]=='S'){ - fgets(line,128,file); - printf("Memory usage: %s",line); -// result = parseLine(line); - break; - } - } - fclose(file); -} - -void readSourceFile(){ - if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); - int numberOfDL = 0; - char ttt; - char ttts[100000]; - while(fgetc(inputAlignmentFile) == '#') - { - numberOfDL++; - fgets(ttts,100000,inputAlignmentFile); - } - LL totSize = 0; - while(fscanf(inputAlignmentFile,"%s",ttts)==1){ - totSize++; - fgets(ttts,100000,inputAlignmentFile); - } - fclose(inputAlignmentFile); - if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); - opticalMap1.clear(); - printf("There are %lld space\n",totSize); - opticalMap1.resize(totSize+10000); - for (LL i=0; i 0 - && opticalMap1[i+1].refStart - opticalMap1[i].refEnd < 5000000){ - opticalMap1[tempCC] = opticalMap1[i]; - opticalMap1[tempCC].optStart = opticalMap1[i].optEnd; - opticalMap1[tempCC].optEnd = opticalMap1[i+1].optStart; - opticalMap1[tempCC].refStart = opticalMap1[i].refEnd; - opticalMap1[tempCC].refEnd = opticalMap1[i+1].refStart; - opticalMap1[tempCC].score = 10.0; - opticalMap1[tempCC].confidence = 0.1; - opticalMap1[tempCC].orientation = true; -// opticalMap1[tempCC].calc();//// - opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// - opticalMap1[tempCC].fnr = opticalMap1[i].fnr + opticalMap1[i+1].fnr;//// - //opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// - opticalMap1[tempCC].position.clear(); -// opticalMap1[tempCC].hitEnum = new char[4]; - strcpy(opticalMap1[tempCC].hitEnum, "FFF"); - if (opticalMap1[tempCC].optStart > opticalMap1[tempCC].optEnd) - tempCC--; - tempCC++; - if (tempCC==totSize){ - totSize+=50000; - opticalMap1.resize(totSize); - } - } - } - numberOfOpticalMap = tempCC; - opticalMap1.resize(numberOfOpticalMap); -} - -void outputToBillMapDestinationFile(){ - printf("Start to produce temp files\n"); - for (LL i=0; i numOfChr) continue; - if (outputFileList[opticalMap1[i].chrId] == NULL) - initOutput(opticalMap1[i].chrId,"w+"); - else - initOutput(opticalMap1[i].chrId,"a+"); - opticalMap1[i].print(outputFileList[opticalMap1[i].chrId]); - fclose(outputFileList[opticalMap1[i].chrId]); - } - getValue(); - opticalMap1.clear(); - vector().swap(opticalMap1); -} - - - - -//stop add - -void setDefault() -{ - digestionRate = 0.875; - falseCutRate = 0.000010; - pValueCutOff = 1e-9; - cauchyMean = 1.0096; - cauchyScale = 0.0291; - numberOfSupportEachSide = 1; - confidenceLimit = 9; - numberOfSupportIndelMolecule = 10; //coverage of the indel - numberOfSupportSignalMolecule = 10; - likelihoodRatioCutOff = 1e6; -// Settings - minIndelSize = 2000; - minIndelRatio = 0.05; - resolutionLimit = 700; - inputTrio = 0507; -} - -LL newChrSite = 0; -LL numberOfHomoMissingFragment = 0; -LL numberOfHomoDeletedSNP = 0; -LL homoMissingSite = 0; -LL numberOfNoSupportSV = 0; -LL homoNonMissingSite = 0; -LL homoInsertSite = 0; -LL homoNonInsertSite = 0; -LL heterMissingSite = 0; -LL heterNonMissingSite = 0; -LL heterInsertSite = 0; -LL heterNonInsertSite = 0; -LL numberOfNickingSite = 0; -LL numberOfOM878 = 0; -LL numberOfOM891 = 0; -LL numberOfDetailOpticalMap = 0; -LL distancePairCount = 0; -LL numberOfVariant = 0; -LL numberOfSupportedSV = 0; - -LL tempPre = -1; -LL tempCounter = 0; -vector listOfChromosome; - -// shortestDist[10] refers to distance between 9-11 -//double shortestDist[500000]; -//LL shortestDistCount[500000]; - -// shortestGapDist[10] refers to distance between 10-11 -//double shortestGapDist[500000]; -//LL shortestGapDistCount[500000]; - - -// 0 = Homo Ins/Del -// 1 = Homo Non-Ins/Del -// 2 = Heter Ins/Del -// 3 = Heter Non-Ins/Del -// 4 = nothing - -LL curId; -int chrId; - - -bool feq(double x, double y){ - //return fabs(x - y) <= eps; - return fabs(x - y) <= eps*(min(fabs(x),fabs(y)));//should be normalized by the values of x and y -} -bool feq1(double x, double y){ - return fabs(x - y) <= eps; - // return fabs(x - y) <= eps*fabs(min(x,y));//should be normalized by the values of x and y -} - - -LL max(LL x, LL y){ - return x > y ? x : y; -} - -LL min(LL x, LL y){ - return x > y ? y : x; -} - -double average(vector a){ - double ans = 0; - for (LL i=0; i<(LL)a.size(); i++) - ans += a[i]; - ans /= a.size(); - return ans; -} - - -struct variantType{ - LL people; - LL chr; - LL start; - LL end; - LL size; - LL sizeExt; - LL oppoSupp;//// - double ratio; - double ratioExt; - LL support; - double likelihood; - bool isSignal; - bool isDel; - bool isHomo; - bool isSupported; - void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSizeExt, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double inputRatioExt, double likely, LL supp, LL IoppoSupp) - {//version for SV - oppoSupp = IoppoSupp;//// - people = inputPeople; - chr = inputChr; - start = inputStart; - end = inputEnd; - size = inputSize; - sizeExt = inputSizeExt; - ratio = inputRatio; - ratioExt = inputRatioExt; - likelihood = likely; - support = supp; - isSignal = (inputSignal == 1); - isDel = (inputDel == 1); - isHomo = (inputHomo == 1); - isSupported = (inputSupport == 1); - }; - void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double likely, LL supp, LL IoppoSupp) - {//version for signal changes - oppoSupp = IoppoSupp;//// - people = inputPeople; - chr = inputChr; - start = inputStart; - end = inputEnd; - size = inputSize; - sizeExt = 0; - ratio = inputRatio; - ratioExt = 0; - likelihood = likely; - support = supp; - isSignal = (inputSignal == 1); - isDel = (inputDel == 1); - isHomo = (inputHomo == 1); - isSupported = (inputSupport == 1); - }; - bool operator<(const variantType &x) const{ - return (chr < x.chr || (chr == x.chr && start < x.start) || (chr == x.chr && start == x.start && end < x.end) || (chr == x.chr && start == x.start && end == x.end && isSupported)); - } - - /*void print(){ - printf("%lld %lld %lld %lld %lld %d %d %d\n", people, chr, start, end, size, isSignal, isDel, isHomo); - };*/ -}; - -struct chrType{ - LL length; - LL numberOfSites; - -/* vector position; - vector distance; - vector oldDistance; - vector gapDistance; - vector coverage; - vector occurrence; - vector gapCount; - vector gapCoverage; - vector gapSigni; - vector signi; - vector distanceDifference; - vector gapDistanceDifference; - void creator(){ - position.resize(numberOfSites+100); - distance.resize(numberOfSites+100); - oldDistance.resize(numberOfSites+100); - gapDistance.resize(numberOfSites+100); - coverage.resize(numberOfSites+100); - occurrence.resize(numberOfSites+100); - gapCount.resize(numberOfSites+100); - gapCoverage.resize(numberOfSites+100); - gapSigni.resize(numberOfSites+100); - signi.resize(numberOfSites+100); - distanceDifference.resize(numberOfSites+100); - gapDistanceDifference.resize(numberOfSites+100); - } - void deletor(){ - position.clear(); - distance.clear(); - oldDistance.clear(); - gapDistance.clear(); - coverage.clear(); - occurrence.clear(); - gapCount.clear(); - gapCoverage.clear(); - gapSigni.clear(); - signi.clear(); - distanceDifference.clear(); - gapDistanceDifference.clear(); - } -*/ -// LL position[200000]; - - // distance[0] = position[1] - position[0] -// LL distance[200000]; - -// LL oldDistance[500000]; -// LL gapDistance[500000]; - - // coverage[1] refer to nicking site 1 -// int coverage[200000]; - - // occurrence[1] refer to nicking site 1 -// int occurrence[200000]; - - // gapCount[1] refers to gap between site 1 to 2 -// int gapCount[200000]; -// int gapCoverage[200000]; - - // If it is significant by Kevin's definition - LL* position = new LL[10]; - LL* distance = new LL[10]; - int* coverage = new int[10]; - int* occurrence = new int[10]; - int* gapCount = new int[10]; - int* gapCoverage = new int[10]; - bool* gapSigni = new bool[10]; - bool* signi = new bool[10]; -// bool gapSigni[200000]; -// bool signi[200000]; - - // distance Difference -// double distanceDifference[500000]; -// double gapDistanceDifference[500000]; -}; -chrType chromosome; - -struct optAlignType{ - LL belongs; - char mapId[100]; - LL optStart; - LL optEnd; - LL refStart; - LL refEnd; - LL numberOfSites; - double score; - double confidence; - char hitEnum[1000]; - - // position = distance[i+1] - distance[i] - int position[2500]; - int oldPosition[2500]; - double fpr;//// - double fnr;//// - double alignRate;//// - /*vector position; - void calc() - { - int tot_fp = 0, tot_fn = 0; - for (int i = 0; hitEnum[i]!='\0'; i++) - { - int curr = 0; - int fp, fn; - for (int j = i; hitEnum[j] >= '0' && hitEnum[j] <= '9'; j++) - { - curr = curr * 10 + (hitEnum[j] - '0'); - } - if (hitEnum[j] == 'I') - { - fp++; - tot_fp += curr; - } - else if (hitEnum[j] == 'D') - { - fn++; - tot_fn += curr; - } - i = j; - } - int mol_len = 0; - int ali_len = 0; - int cnt = 0; - for (auto ele:position) - { - cnt++; - mol_len += ele; - if (cnt >= min(optStart,optEnd)&&cnt <= max(optStart,optEnd)) ali_len+=ele; - } - fpr = tot_fp*1.0/ali_len; - fnr = tot_fn*1.0/(abs(refEndIndex-refStartIndex)+1); - alignRate = ali_len*1.0/mol_len; - } -*/ -}; - -struct distanceType{ - char mapId[20]; - LL start; - LL end; - double distance; - LL cnt; - bool operator<(const distanceType &x) const{ - return (start < x.start || (start == x.start && end < x.end)); - } - void print(){ - printf("chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end] - chromosome.position[start], distance, cnt); - } - void printToFile(FILE* outputFile){ - fprintf(outputFile, "chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end] - chromosome.position[start], distance, cnt); - } -}; - - -vector opticalMap; -vector distancePair; -vector variant; - -void init(){ - // Has reduced by half -// opticalMap = new optAlignType[3000000]; -// distancePair = new distanceType[100000000]; - variant.clear(); - variant.resize(100000); -} -char outputFolder[1000]; - -FILE* createInputTrioFile(const char *suffix, LL inputTrio, const char *mode){ - char nameOfFile[1000]; - char buffer[200]; - memset(buffer, 0, sizeof(buffer)); - strcpy(nameOfFile, outputFolder); - sprintf(buffer, "%lld_%g_", inputTrio,minIndelSize); - strcat(nameOfFile, buffer); - strcat(nameOfFile, suffix); - //printf("The file name is: %s\nAnd mode is %s\n",nameOfFile,mode); - FILE* retFile = fopen(nameOfFile, mode); - if (retFile == NULL) perror("error in opening file!"); - return retFile; -} - - - -void initResultFullList(char* SVoutputFile){ - char nameOfFile[1000]; - char buffer[200]; - char extPart[200]; - outputResult1 = createInputTrioFile("Distance_Pairs_List.txt", inputTrio, "w+"); - outputResult2 = createInputTrioFile("Signal_Indels_List.txt", inputTrio, "w+"); -/* -#if defined(Real_Molecule_Assembly) - outputMoleculeToAssembly = createInputTrioFile("/local/shared/bill/BillMapFromREAL_Molecule_ASM_HG38/distanceList/", "moleculeToAssembly_List.txt", inputTrio, "w+"); -#endif -#if defined(Real_Assembly_Hg38) - outputAssembly = createInputTrioFile("/local/shared/bill/BillMapFromREAL_Molecule_ASM_HG38/distanceList/", ".txt", inputTrio, "w+"); -#endif -*/ - sprintf(extPart,"_%g",minIndelSize); - memset(nameOfFile, 0, sizeof(nameOfFile)); - strcpy(nameOfFile, outputFolder); - sprintf(buffer, "%lld", inputTrio); - strcat(nameOfFile, buffer); - strcat(nameOfFile, SVoutputFile); - strcat(nameOfFile, extPart); - strcat(nameOfFile, ".osv"); - printf("%s\n",nameOfFile); - if ((outputVariantResultFile = fopen(nameOfFile, "w+")) == NULL) - { - perror("error opening file(): nameOfFile"); - } -} - -bool overlapped(LL x1, LL y1, LL x2, LL y2){ - if (x1 >= x2 && x1 <= y2) return true; - if (y1 >= x2 && y1 <= y2) return true; - if (x2 >= x1 && x2 <= y1) return true; - if (y2 >= x1 && y2 <= y1) return true; - return false; -} - -void getChromosomeList(char* chrMapFile){ - listOfChromosome.clear(); - char nameOfFile[1000]; - strcpy(nameOfFile, chrMapFile); - if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL) - { - perror("error opening file(): ChromosomeInfo"); - } - fscanf(inputChrConsen,"%s",tempString); - while(tempString[0]=='#'){ - fgets(tempString, 10000, inputChrConsen); - fscanf(inputChrConsen,"%s",tempString); - } - LL tempChrId, tempNumberOfSites; - double tempDouble1, tempDouble2; - tempChrId = atoll(tempString); - if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); - listOfChromosome.push_back(tempChrId); - fgets(tempString, 10000, inputChrConsen); - while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ - listOfChromosome.push_back(tempChrId); - fgets(tempString, 10000, inputChrConsen); - } - fclose(inputChrConsen); - sort(listOfChromosome.begin(), listOfChromosome.end()); - vector::iterator tempIt; - tempIt = unique(listOfChromosome.begin(), listOfChromosome.end()); - listOfChromosome.resize(distance(listOfChromosome.begin(), tempIt)); - printf("Number of Chromosomes: %lld\n", (LL)listOfChromosome.size()); -} - -void readChromosomeInfo(int chr,char* chrMapFile){ - char nameOfFile[1000]; - strcpy(nameOfFile, chrMapFile); - if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL) - { - perror("error opening file(): ChromosomeInfo"); - } -// printf("Before read\n"); - - fscanf(inputChrConsen,"%s",tempString); - while(tempString[0]=='#'){ - fgets(tempString, 10000, inputChrConsen); - fscanf(inputChrConsen,"%s",tempString); - } - LL cc = 0; - LL tempChrId, tempNumberOfSites; -// printf("Before deletor\n"); - double tempDouble1, tempDouble2; - tempChrId = atoll(tempString); - printf("StART\n"); -// chrType tpChr; -// chromosome = tpChr; -// chromosome.deletor(); -// printf("After deletor %s\n",tempString); - bool done= false; -// memset(chromosome.position,0,sizeof(chromosome.position)); -// memset(chromosome.distance,0,sizeof(chromosome.distance)); -// memset(chromosome.coverage,0,sizeof(chromosome.coverage)); -// memset(chromosome.occurrence,0,sizeof(chromosome.occurrence)); -// memset(chromosome.gapCount,0,sizeof(chromosome.gapCount)); -// memset(chromosome.gapCoverage,0,sizeof(chromosome.gapCoverage)); -// memset(chromosome.gapSigni,0,sizeof(chromosome.gapSigni)); -// memset(chromosome.signi,0,sizeof(chromosome.signi)); - delete[] chromosome.position; - delete[] chromosome.distance; - delete[] chromosome.coverage; - delete[] chromosome.occurrence; - delete[] chromosome.gapCount; - delete[] chromosome.gapCoverage; - delete[] chromosome.gapSigni; - delete[] chromosome.signi; - if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); - if (tempChrId == chr){ - chromosome.numberOfSites = tempNumberOfSites; - chromosome.position = new LL[tempNumberOfSites+10]; - chromosome.distance = new LL[tempNumberOfSites+10]; - chromosome.coverage = new int[tempNumberOfSites+10]; - chromosome.occurrence = new int[tempNumberOfSites+10]; - chromosome.gapCount = new int[tempNumberOfSites+10]; - chromosome.gapCoverage = new int[tempNumberOfSites+10]; - chromosome.gapSigni = new bool[tempNumberOfSites+10]; - chromosome.signi = new bool[tempNumberOfSites+10]; - done = true; -// printf("The number of site is %lld\n",chromosome.numberOfSites); -// chromosome.creator(); - chromosome.position[cc] = (LL) tempDouble2; - chromosome.length = (LL) tempDouble1; - chromosome.gapCount[cc] = 0; - chromosome.coverage[cc] = 0; - chromosome.occurrence[cc] = 0; - cc++; - } - fgets(tempString, 10000, inputChrConsen); - while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ - if (tempChrId == chr){ - chromosome.numberOfSites = tempNumberOfSites; - if (!done){ - chromosome.position = new LL[tempNumberOfSites+10]; - chromosome.distance = new LL[tempNumberOfSites+10]; - chromosome.coverage = new int[tempNumberOfSites+10]; - chromosome.occurrence = new int[tempNumberOfSites+10]; - chromosome.gapCount = new int[tempNumberOfSites+10]; - chromosome.gapCoverage = new int[tempNumberOfSites+10]; - chromosome.gapSigni = new bool[tempNumberOfSites+10]; - chromosome.signi = new bool[tempNumberOfSites+10]; - done = true; -// printf("Before creator %lld\n",chromosome.numberOfSites); -// chromosome.creator(); -// printf("After creator %d\n",chr); -// done = true; - } - chromosome.position[cc] = (LL) tempDouble2; - chromosome.length = (LL) tempDouble1; - chromosome.gapCount[cc] = 0; - chromosome.coverage[cc] = 0; - chromosome.occurrence[cc] = 0; - cc++; - } - fgets(tempString, 10000, inputChrConsen); - } - chromosome.numberOfSites++; - for (LL i=0; i tempMap; -// vector tempVec[10000]; - LL oppoSupp = 0; -/* bool vectorOverFlow = false; - for (LL i=start-1; i>=0; i--){ - if (distancePair[i].start >= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ - string mapFrom(distancePair[i].mapId); - LL mapTo = (LL)tempMap.size() + 1; - if (tempMap.count(mapFrom) == 0){ - tempMap.insert(make_pair(mapFrom, mapTo)); - } - LL newIndex = tempMap[mapFrom]; - if (newIndex >= 10000){ - vectorOverFlow = true; - break; - } - tempVec[newIndex].push_back(distancePair[i]); - } else - break; - } - if (vectorOverFlow) return; - for (LL i=end+1; i= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ - string mapFrom(distancePair[i].mapId); - LL mapTo = (LL)tempMap.size() + 1; - if (tempMap.count(mapFrom) == 0){ - tempMap.insert(make_pair(mapFrom, mapTo)); - } - LL newIndex = tempMap[mapFrom]; - if (newIndex >= 10000){ - vectorOverFlow = true; - break; - } - tempVec[newIndex].push_back(distancePair[i]); - } else - break; - } - if (vectorOverFlow) return;*/ - double averageScore = 0;//// - double averageConfidence = 0;//// - double averageFPR = 0;//// - double averageFNR = 0;//// - double averageAlignRate = 0;//// -/* for (LL i=1; i<(LL)tempMap.size()+1; i++){ - double tempMoleCoverSize = 0; - double tempMoleRealSize = 0; - double tempScore = 0;//// - double tempConfidence =0;//// - double tempFPR = 0;//// - double tempFNR = 0;//// - double tempAlignRate = 0;//// - for (LL j=0; j<(LL)tempVec[i].size(); j++){ - tempMoleCoverSize += (chromosome.position[tempVec[i][j].end] - chromosome.position[tempVec[i][j].start]); - tempMoleRealSize += tempVec[i][j].distance; - } -// if (feq1(tempMoleCoverSize, referenceDistance)) - if (feq1(tempMoleCoverSize, referenceDistance)) - { - - tempDistanceDouble[cnt++] = tempMoleRealSize/referenceDistance; - //printf("Let me see extra: %g\t%g\t%lld\n",tempMoleRealSize,referenceDistance,oppoSupp); - if (abs(tempMoleRealSize - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - } - double averageScore = 0; - double averageConfidence = 0; - double averageFPR = 0; - double averageFNR = 0; - double averageAlignRate = 0; - for (LL i=start; i<=end; i++) - { - tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; - //printf("Let me see: %g\t%g\t%lld\n", distancePair[i].distance,referenceDistance,oppoSupp); - if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - sort(tempDistanceDouble, tempDistanceDouble + cnt); - averageScore /= cnt;//// - averageConfidence /= cnt;//// - averageFPR /= cnt;//// - averageFNR /= cnt;//// - averageAlignRate /= cnt;//// -*/ - vector maps; - vector dists; - maps.clear(); - dists.clear(); -// fprintf(outputResult1, "%d %lld %lld %lld %lld\t", chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], distancePair[start].start+1, distancePair[start].end+1); - for (LL i=start; i<=end; i++) - { - bool fal = true; - for (LL j = 0; j < (LL)maps.size(); j++) - { - if (maps[j] == atol(distancePair[i].mapId) && abs(dists[j] - distancePair[i].distance) < 1){fal = false;break;} - } - if (fal) - { - maps.push_back(atol(distancePair[i].mapId)); - dists.push_back(distancePair[i].distance); - tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; - // fprintf(outputResult1, "%lld:%lld ",atol(distancePair[i].mapId),(LL)distancePair[i].distance); - //printf("Let me see: %g\t%g\t%lld\n", distancePair[i].distance,referenceDistance,oppoSupp); - if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// - } - } - // fprintf(outputResult1, "%lld\n",(LL)maps.size()); - // for (LL i = 0; i < (LL)maps.size(); i++) - // fprintf(outputResult1, "%lld:%lld ",maps[i],(LL)dists[i]); - // fprintf(outputResult1, "\n"); -// fprintf(outputResult1, "%lld\n",cnt); - sort(tempDistanceDouble, tempDistanceDouble + cnt); - if (cnt < numberOfSupportIndelMolecule) {//why 2500? -// variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -// variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0); -// variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0); -// variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0); - numberOfNoSupportSV++; - return; - } - //use another vector to remove the outliers in two ends to purify the results - //int outRange = static_cast(cnt/10)>1?static_cast(cnt/10):1; - int outlierRange = 0; - double tempDistanceDoubleNoOutlier[cnt]; - LL cntNoOutlier = 0; - for (LL i = outlierRange; i < cnt-outlierRange;) - { - tempDistanceDoubleNoOutlier[cntNoOutlier++] = tempDistanceDouble[i++]; - } - double sampleMean = 0; - for (LL i=0; i nullHypothesisLikelihood){ - // Update Homo Indel Variant - if (feq1(tempMaxLikelihood, homoIndelHypothesisLikelihood)){ - double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; - if (tempSizeChange <= -max(minIndelSize, minIndelRatio * referenceDistance)){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], fabs(tempSizeChange), 0, 1, 1, 1, tempMaxLikelihood/ nullHypothesisLikelihood, tempMaxLikelihood, cnt, oppoSupp); - } - else if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], fabs(tempSizeChange), 0, 0, 1, 1, tempMaxLikelihood/ nullHypothesisLikelihood, tempMaxLikelihood, cnt, oppoSupp); - } - } - // Update HeterDel Variant - else if (feq1(tempMaxLikelihood, heterDelHypothesisLikelihood)){ - double tempSizeChange = heterDelHypothesisMaxLikelihoodSizeChanged; - if (fabs(tempSizeChange) >= max(minIndelSize, minIndelRatio * referenceDistance)){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], fabs(tempSizeChange), 0, 1, 0, 1, tempMaxLikelihood/ nullHypothesisLikelihood, tempMaxLikelihood, cnt, oppoSupp); - } - } - // Update HeterIns Variant - else if (feq1(tempMaxLikelihood, heterInsHypothesisLikelihood)){ - double tempSizeChange = heterInsHypothesisMaxLikelihoodSizeChanged; - if (fabs(tempSizeChange) >= max(minIndelSize, minIndelRatio * referenceDistance)){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], fabs(tempSizeChange), 0, 0, 0, 1, tempMaxLikelihood/ nullHypothesisLikelihood, tempMaxLikelihood, cnt, oppoSupp); - } - } - if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); - } -// if (chromosome.position[distancePair[start].start]==9810098) -// { -// printf("Let me see, the segment if (%lld, %lld), and the position flag is %d.\n",chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], posFlag); -// } -} - -void detectByDistancePair(){ - sort(distancePair.begin(), distancePair.end()); - // printDistancePair(); - distancePair.resize(distancePairCount+1); - distancePair[distancePairCount].start = -1; - distancePair[distancePairCount].end = -1; - distancePairCount++; - LL tempHead = 0, tempTail = 0, tempPreNumVar, tempPreNumNoSuppVar; - for (LL i=0; i().swap(distancePair); -} - -void completePairs() -{ - sort(distancePair.begin(), distancePair.end()); - vector extraDP; - extraDP.clear(); - LL ost = -1, oed = -1; - for (LL i = 0; i < distancePairCount; i++) - { - if (distancePair[i].start!= ost || distancePair[i].end!=oed)//this locate the non-redundant reference regions (NRRR) - { - distanceType tempPair; - ost = distancePair[i].start; - oed = distancePair[i].end; - if (oed-ost == 1)continue; - for (LL j = 0; j <= i-1; j++)//this search the possible sub-regions - { - //LL ted = distancePair[j].end; - if (distancePair[j].start < ost) continue; - else if (distancePair[j].start > ost) break; - else//for each sub-region holding the same start point with NRRR, we explore the consecutive regions on the same molecule - { - tempPair = distancePair[j]; - bool rep = true; - for (LL k = j+1; distancePair[k].start= oed) break; - } - if (tempPair.end == oed) - { - rep = false; - for (LL l = i; l < distancePairCount; l++) - { - if (distancePair[i] < distancePair[l])break; - else if (strcmp(distancePair[l].mapId, tempPair.mapId)==0) {rep = true; break;} - } - } - if (!rep) extraDP.push_back(tempPair); - } - } - } - else - continue; - } - printf("There are %lld extra distance pairs are added.\n",(LL)extraDP.size()); - // also need to include the new distance pairs into the original set - distancePair.resize(distancePairCount+extraDP.size()); - for (LL i = 0; i < (LL)extraDP.size(); i++) - { - distancePair[distancePairCount++]=extraDP[i]; - } -} - - - -void processDistancePair(){ - sort(distancePair.begin(), distancePair.end()); - // printDistancePair(); - distancePair.resize(distancePairCount+1); - distancePair[distancePairCount].start = -1; - distancePair[distancePairCount].end = -1; - distancePairCount++; - LL tempCnt = 1, cc = 0; - double tempDis = distancePair[0].distance; - distancePair[0].cnt = 1; - for (LL i=0; i=distancePairCount?0:distancePair[i+1].distance; - } - distancePairCount = cc; - distancePair.resize(cc); -// memset(shortestDist, 0, sizeof(shortestDist)); -// memset(shortestDistCount, 0, sizeof(shortestDistCount)); -// memset(shortestGapDist, 0, sizeof(shortestGapDist)); -// memset(shortestGapDistCount, 0, sizeof(shortestGapDistCount)); -// for (LL i=0; i= '0' && opticalMap[i].hitEnum[j] <= '9'){ - tempCount = tempCount * 10 + (opticalMap[i].hitEnum[j]-'0'); - } - else { - if (opticalMap[i].hitEnum[j] == 'M'){ - for (LL k=0; kdistancePair.size()-10000)distancePair.resize(distancePair.size()+100000); - } - distancePair.resize(distancePairCount); - opticalMap.clear(); - vector().swap(opticalMap); -} - -void NCRcalculation(){ - for (int i = 0; i < 9000; ++i){ - C[i][0] = C[i][i] = 1; - for (int j = 1; j < i; ++j) - C[i][j] = (C[i - 1][j] + C[i - 1][j - 1]); - for (int j = i + 1; j < 9000; ++j) - C[i][j] = 0; - } -} - -void deletionTest(int chrId){ -// if (chrId == 1){ -// fprintf(outputResult2, "ID\tChr\tSite_Index\tPosition\tCoverage\tOccurence\tP\tU\tP-value\tH0\tHa\tHb\tRef_Dis\tMolecule_Dis\tCount\tType\n"); -// } - for (LL i=0; i 5000) continue; - - // printf("%lld %lld %lld %lld\n", i, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i]); - numberOfNickingSite++; - double pValue = 0; - double likelihoodh0 = 0; - double likelihoodha = 0; - double likelihoodhb = 0; - double likelihoodRatio = 0; - for (LL j=chromosome.coverage[i] - chromosome.occurrence[i]; j<=chromosome.coverage[i]; j++){ - pValue += C[chromosome.coverage[i]][j]*pow(1-digestionRate, j)*pow(digestionRate, chromosome.coverage[i] - j); - } - likelihoodh0 = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(1-digestionRate, chromosome.coverage[i] - chromosome.occurrence[i])*pow(digestionRate, chromosome.occurrence[i]); - - double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); - double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); - - atLeastOneFalseCutRate = 0.1; - likelihoodha = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(atLeastOneFalseCutRate, chromosome.occurrence[i])*pow(1 - atLeastOneFalseCutRate, chromosome.coverage[i] - chromosome.occurrence[i]); - // double likelihoodRatio2 = likelihoodh0/likelihoodha; - likelihoodRatio = pow((1-digestionRate)/(1-atLeastOneFalseCutRate), chromosome.coverage[i] - chromosome.occurrence[i]) * pow(digestionRate/atLeastOneFalseCutRate, chromosome.occurrence[i]); - - LL x = chromosome.coverage[i] - chromosome.occurrence[i]; - LL k = chromosome.coverage[i]; - double p = digestionRate; - double u = atLeastOneFalseCutRate; - for (LL k1 = 0; k1 <= k; k1++){ - double tempLhb = 0; - LL k2 = k - k1; - for (LL y=max(0, x - k2); y <= min(k1, x); y++){ - tempLhb += C[k1][y]*C[k2][x-y]*pow(1-p, y)*pow(p, k1-y)*pow(u, y+k2-x)*pow(1-u, x-y); - } - - likelihoodhb += C[k][k1]*tempLhb; - } - likelihoodhb /= pow(2, k); - - -// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, (i+1=0?chromosome.position[i-1]:0), (LL)shortestDist[i], shortestDistCount[i], "Del"); - - -// chromosome.distanceDifference[i] = (i+1=0?chromosome.position[i-1]:0) - shortestDist[i]; - - - if (pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) - chromosome.signi[i] = true; - else chromosome.signi[i] = false; - - if (chromosome.signi[i]){ - if (likelihoodha > likelihoodhb){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 1, 1, pValue, likelihoodha, x, 0); - } else { - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 0, 1, pValue, likelihoodhb, x, 0); - } - if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); - } - // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Miss"); - } -} - -void insertionTest(int chrId){ - for (LL i=0; i 5000) continue; - - double pValue = 0; - double likelihoodh0 = 0; - double likelihoodha = 0; - double likelihoodhb = 0; - double likelihoodRatio = 0; - double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); - double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); - - - /* - TEST - atLeastOneFalseCutRate = 0.1; - TEST - */ - for (LL j=chromosome.gapCount[i]; j<=chromosome.gapCoverage[i]; j++){ - pValue += C[chromosome.gapCoverage[i]][j]*pow(atLeastOneFalseCutRate, j)*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - j); - } - likelihoodh0 = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(atLeastOneFalseCutRate, chromosome.gapCount[i])*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); - likelihoodha = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(digestionRate, chromosome.gapCount[i])*pow(1-digestionRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); - likelihoodRatio = pow((1-atLeastOneFalseCutRate)/(1-digestionRate), chromosome.gapCoverage[i] - chromosome.gapCount[i]) * pow(atLeastOneFalseCutRate/digestionRate, chromosome.gapCount[i]); - - // printf("%10lE %10lE\n", likelihoodh0 / likelihoodha, likelihoodRatio); - - LL x = chromosome.gapCount[i]; - LL k = chromosome.gapCoverage[i]; - double p = digestionRate; - double u = atLeastOneFalseCutRate; - for (LL k1 = 0; k1 <= k; k1++){ - double tempLhb = 0; - LL k2 = k - k1; - for (LL y=max(0, x - k2); y <= min(k1, x); y++){ - tempLhb += C[k1][y]*C[k2][x-y]*pow(u, y)*pow(1-u, k1-y)*pow(1-p, y+k2-x)*pow(p, x-y); - } - likelihoodhb += C[k][k1]*tempLhb; - } - likelihoodhb /= pow(2, k); - - -// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, ((i+1= numberOfSupportSignalMolecule && pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) - chromosome.gapSigni[i] = true; - else chromosome.gapSigni[i] = false; - - if (chromosome.gapSigni[i]){ - if (likelihoodha > likelihoodhb){ - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 1, 1, pValue, likelihoodha, x, 0); - } else { - variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 0, 1, pValue, likelihoodhb, x, 0); - } - if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); - } - - // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId,chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Ins"); - } -} - -void initStatData(){ - homoMissingSite = 0; - homoNonMissingSite = 0; - homoInsertSite = 0; - homoNonInsertSite = 0; - heterMissingSite = 0; - heterNonMissingSite = 0; - heterInsertSite = 0; - heterNonInsertSite = 0; - numberOfNickingSite = 0; - numberOfHomoDeletedSNP = 0; - numberOfHomoMissingFragment = 0; - numberOfVariant = 0; - numberOfNoSupportSV = 0; -} - -void initData(){ - memset(chromosome.position, 0, sizeof(chromosome.position)); - memset(chromosome.coverage, 0, sizeof(chromosome.coverage)); - memset(chromosome.occurrence, 0, sizeof(chromosome.occurrence)); - memset(chromosome.gapCoverage, 0, sizeof(chromosome.gapCoverage)); - memset(chromosome.gapCount, 0, sizeof(chromosome.gapCount)); - chromosome.length = 0; - chromosome.numberOfSites = 0; -// memset(opticalMap, 0, sizeof(opticalMap)); -// memset(distancePair, 0, sizeof(distancePair)); - distancePairCount = 0; -} - -void correctOverlapVariant(){ -// delete [] distancePair; -// delete [] opticalMap; - vector tempVariant; - tempVariant.resize(variant.size()+10000); - vector tempRealVariant; - tempRealVariant.resize(variant.size()+10000); - LL tempCnt = 0, tempRealCnt = 0; - - - // Consider Segmental Only - for (LL i=0; i= tempRealVariant[tempIndex].end) || \ - (tempRealVariant[tempIndex].start <= tempRealVariant[tempIndex+1].start && tempRealVariant[tempIndex].end >= tempRealVariant[tempIndex+1].end) - )) - tempIndex++; - tempEnd = tempIndex; - LL tempMaxLength = -1, tempMaxLengthIndex = -1; - LL tempDelCnt = 0, tempInsCnt = 0, tempHomoCnt = 0, tempHeterCnt = 0; -////////////////////////S1:find the maximum length//////////////////////////////// -/* - // Find non-signal variants first - for (LL i=tempStart; i<=tempEnd; i++){ - if (!tempRealVariant[i].isSignal && tempRealVariant[i].end - tempRealVariant[i].start > tempMaxLength){ - tempMaxLength = tempRealVariant[i].end - tempRealVariant[i].start; - tempMaxLengthIndex = i; - } - if (!tempRealVariant[i].isSignal){ - if (tempRealVariant[i].isDel) - tempDelCnt++; - if (!tempRealVariant[i].isDel) - tempInsCnt++; - if (tempRealVariant[i].isHomo) - tempHomoCnt++; - if (!tempRealVariant[i].isHomo) - tempHeterCnt++; - } - } - // If can't find, find others - if (tempMaxLengthIndex == -1){ - tempMaxLength = -1; - for (LL i=tempStart; i<=tempEnd; i++) - if (tempRealVariant[i].end - tempRealVariant[i].start > tempMaxLength){ - tempMaxLength = tempRealVariant[i].end - tempRealVariant[i].start; - tempMaxLengthIndex = i; - } - } - tempVariant[tempCnt++] = tempRealVariant[tempMaxLengthIndex]; -*/ -////////////////////////S2:find the cases without completely covering any others/////////////////////////////////////////////// - //if (tempRealVariant[tempStart].end==135838989 && tempRealVariant[tempStart].chr == 2)printf("There are %lld cases\n",tempEnd-tempStart+1); - for (LL i=tempStart; i<=tempEnd; i++) - { - if (tempRealVariant[i].isSignal) continue; - bool flag = true; - for (LL j = tempStart; j <=tempEnd; j++) - { - if (j==i) continue; - if (tempRealVariant[i].end >= tempRealVariant[j].end && tempRealVariant[i].start <= tempRealVariant[j].start) - { - flag = false; - break; - } - } - //printf("%lld:%lld-%lld, ",tempRealVariant[i].chr,tempRealVariant[i].start,tempRealVariant[i].end); - //if (flag) printf("Get it!\n"); - //else printf("Failed!\n"); - if (flag) tempVariant[tempCnt++] = tempRealVariant[i]; - } - - - - - - - - tempStart = tempEnd+1; - } - - printf("numberOfRealVariant after filter overlapping: %lld\n", tempCnt); - - for (LL i=0; i0&&variant[i].sizeExt>0) - { - strcpy(tempH,"HI1I2"); - strcpy(tempIndel,"Insertions"); - } - else if (variant[i].size<0&&variant[i].sizeExt<0) - { - strcpy(tempH,"HD1D2"); - strcpy(tempIndel,"Deletions"); - } - else if (variant[i].size*variant[i].sizeExt<0) - { - strcpy(tempH,"HDI"); - strcpy(tempIndel,"Both"); - } - else - { - strcpy(tempH,(variant[i].isHomo?"Homozygous":"Heterozygous")); - strcpy(tempIndel,(variant[i].isDel?"Deletion":"Insertion")); - } - fprintf(outputVariantResultFile, "%lld\t%lld\t%lld\t%s%s\t%lld\t%lf\t%lld\tsizeChange=%lld\t%s\t%g\t%g\t%lld\t%lld\n", variant[i].chr,\ - variant[i].start, variant[i].end, tempIndel, variant[i].isSignal?".site":"", variantCallId++, 0.0, inputTrio, variant[i].size, tempH,variant[i].likelihood, variant[i].ratio, variant[i].support, variant[i].oppoSupp); - numberOfSupportedSV++; - } -} - -int main(int argv, char *argc[]){ - printf("\n"); - setDefault(); - char SVoutputFile[1000]; - memset(SVoutputFile,0,sizeof(SVoutputFile)); - strcpy(SVoutputFile,"Detected_structual_variants"); - memset(outputFolder,0,sizeof(outputFolder)); - char chrMapFile[1000]; - int numOfChr = 24; - memset(chrMapFile,0,sizeof(chrMapFile)); - strcpy(inputAlignmentFileName,"C6661_700bp_hg38_combRefOMB2.oma"); - strcpy(outputFileLocation,"./"); - strcpy(outputFolder,"./"); - strcpy(chrMapFile,"hg38_r.cmap"); - if (argv == 1) - { - printf("\nThe valid parameters are described as follows:\n"); - printf("\t-inputLabel: \n\t\t Default value: %lld. The index/label of genome.\n",inputTrio); - printf("\t-outputFolder: \n\t\t Default value: %s. The path of the folder to store the output fils.\n",outputFolder); - printf("\t-SVoutputFile: \n\t\t Default value: %s. The prefix of the file name of SVs (.osv).\n",SVoutputFile); - printf("\t-chrMapFile: \n\t\t Default value: %s. The file name of the reference map(.cmap).\n",chrMapFile); - printf("\t-optAlignFile: \n\t\t Default value: %s. The file name of the alignment map file(.oma).\n",inputAlignmentFileName); - printf("\t-optTempFolder: \n\t\t Default value: %s. The folder to store the processed alignment maps by chromosomes.\n",outputFileLocation); - printf("\t-likelihoodRatioCutOff: \n\t\t Default value: %g. The cutoff of the likelihood ratio for SV all hypothesis (reciprocal of the one in the paper). The default value changes along with the experiment data.\n",likelihoodRatioCutOff); - printf("\t-numberOfSupportIndelMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment being called SVs. The default value changes along with the experiment data.\n",numberOfSupportIndelMolecule); - printf("\t-numberOfSupportSignalMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment to call signal variations. The default value changes along with the experiment data.\n",numberOfSupportSignalMolecule); - printf("\t-minIndelSize: \n\t\t Default value (b): %g. The minimum length of a segment to call SVs.\n",minIndelSize); - printf("\t-minIndelRatio: \n\t\t Default value: %g. The length proportion of a minimum SV could be detected on a segment. E.g. segment = 10000b, then the length of the minimum SV should be larger than 10000*0.05=500b.\n",minIndelRatio); - printf("\t-resolutionLimit: \n\t\t Default value: %g. The minimum length of a segment to call signal variations.\n",resolutionLimit); - printf("\t-digestionRate: \n\t\t Default value: %g. The digestion rate of labels (signals) measured in the experiment.\n",digestionRate); - printf("\t-falseCutRate: \n\t\t Default value: %g. The rate of false cut of a non-label position.\n",falseCutRate); - printf("\t-pValueCutOff: \n\t\t Default value: %g. The cutoff of p-value when call signal variations.\n",pValueCutOff); - printf("\t-cauchyMean: \n\t\t Default value: %g. The mean value of cauchy distribution of null hypothesis when calling SVs. Reset a new value only if you have good reason.\n",cauchyMean); - printf("\t-cauchyScale: \n\t\t Default value: %g. The parameter to calculate cauchy distribution. Reset a new value only if you have good reason.\n",cauchyScale); - printf("\t-confidenceLimit: \n\t\t Default value: %g. The lowest alignment confidence for molecules (optical maps) to call SVs or signal variations.\n",confidenceLimit); - printf("\t-numberOfChromosome: \n\t\t Default value: %d. The first n chromosomes to detect SVs.\n",numOfChr); - return -1; - } - bool paraWrongFlag = false; - for (int i = 1; i < argv; i=i+2) - { - string temp(argc[i]); - //printf("What is the parameter?? %s \n",temp.c_str()); - if (temp.compare("-inputLabel")==0) - inputTrio = atol(argc[i+1]); - else if (temp.compare("-numberOfChromosome")==0) - numOfChr = atoi(argc[i+1]); - else if (temp.compare("-optAlignFile")==0) - { - memset(inputAlignmentFileName,0,sizeof(inputAlignmentFileName)); - strcpy(inputAlignmentFileName, argc[i+1]); - } - else if (temp.compare("-optTempFolder")==0) - { - memset(outputFileLocation,0,sizeof(outputFileLocation)); - strcpy(outputFileLocation, argc[i+1]); - } - else if (temp.compare("-SVoutputFile")==0) - { - memset(SVoutputFile,0,sizeof(SVoutputFile)); - strcpy(SVoutputFile,argc[i+1]); - } - else if (temp.compare("-chrMapFile")==0) - { - memset(chrMapFile,0,sizeof(chrMapFile)); - strcpy(chrMapFile,argc[i+1]); - } - else if (temp.compare("-outputFolder")==0) - strcpy(outputFolder,argc[i+1]); - else if (temp.compare("-likelihoodRatioCutOff")==0) - { likelihoodRatioCutOff = atof(argc[i+1]); - if (likelihoodRatioCutOff < 0) - { - printf("Error! An negative likelihoodRatioCutOff is inputted!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-numberOfSupportIndelMolecule")==0) - { - numberOfSupportIndelMolecule = atoi(argc[i+1]); - if (numberOfSupportIndelMolecule < 0) - { - printf("Error! An negative numberOfSupportIndelMolecule is inputted!\n"); - paraWrongFlag = true; - } - if (numberOfSupportIndelMolecule >= 1000) - { - printf("Warning! An improper large (>=1000) numberOfSupportIndelMolecule is inputted!\n"); - } - } - else if (temp.compare("-minIndelSize")==0) - { - minIndelSize = atof(argc[i+1]); - if (minIndelSize < 1) - { - printf("Error! Please make sure the minIndelSize is a positive integer!\n"); - paraWrongFlag = true; - } - if (minIndelSize >= 100000) - { - printf("Warning! An improper large number (>=100000b) Of minIndelSize is inputted!\n"); - } - } - else if (temp.compare("-minIndelRatio")==0) - { - minIndelRatio = atof(argc[i+1]); - if (minIndelRatio <= 0||minIndelRatio >= 1) - { - printf("Error! An improper number Of minIndelRatio is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-resolutionLimit")==0) - { - resolutionLimit = atof(argc[i+1]); - if (resolutionLimit < 0) - { - printf("Error! An negative number Of resolutionLimit is inputted!\n"); - paraWrongFlag = true; - } - if (resolutionLimit >= 10000) - { - printf("Warning! An improper large number (>=10000) of resolutionLimit is inputted!\n"); - } - } - else if (temp.compare("-digestionRate")==0) - { - digestionRate = atof(argc[i+1]); - if (digestionRate <= 0||digestionRate>1) - { - printf("Error! An improper digestionRate is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (digestionRate < 0.5) - { - printf("Warning! An very low level of digestionRate (<0.5) is adapted!\n"); - } - } - else if (temp.compare("-falseCutRate")==0) - { - falseCutRate = atof(argc[i+1]); - if (falseCutRate < 0||falseCutRate>=1) - { - printf("Error! An improper falseCutRate is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (falseCutRate > 0.8) - printf("Warning! An very large (>0.8) falseCutRate is inputted!\n"); - } - else if (temp.compare("-pValueCutOff")==0) - { - pValueCutOff = atof(argc[i+1]); - if (pValueCutOff <= 0||pValueCutOff>=1) - { - printf("Error! An improper pValueCutOff is inputted, should be in between 0 and 1!\n"); - paraWrongFlag = true; - } - else if (pValueCutOff > 0.1) - { - printf("Warning! An non-significant (>0.1) pValueCutOff is inputted!\n"); - } - } - else if (temp.compare("-cauchyMean")==0) - { - cauchyMean = atof(argc[i+1]); - if (cauchyMean < 0.8||cauchyMean > 1.2) - { - printf("Error! cauchyMean should be around 1!\n"); - paraWrongFlag = true; - } - else - printf("You have changed the value of cauchyMean to be %g, please make sure it is your intention!\n",cauchyMean); - } - else if (temp.compare("-cauchyScale")==0) - { - cauchyScale = atof(argc[i+1]); - if (cauchyScale < 0) - { - printf("Warning! An negative cauchyScale is inputted!\n"); - paraWrongFlag = true; - } - else - printf("You have changed the value of cauchyScale to be %g, please make sure it is your intention!\n",cauchyScale); - } - else if (temp.compare("-confidenceLimit")==0) - { - confidenceLimit = atof(argc[i+1]); - if (cauchyScale < 0) - { - printf("Error! An negative confidenceLimit is inputted!\n"); - paraWrongFlag = true; - } - } - else if (temp.compare("-numberOfSupportSignalMolecule")==0) - { - numberOfSupportSignalMolecule = atol(argc[i+1]); - if (numberOfSupportSignalMolecule < 0) - { - printf("Error! An negative numberOfSupportSignalMolecule is inputted!\n"); - paraWrongFlag = true; - } - if (numberOfSupportSignalMolecule > 1000) - { - printf("Warning! An very large (>1000) numberOfSupportSignalMolecule is inputted!\n"); - } - } - else - { - printf("No such parameter or wrong : %s\n",argc[i]); - paraWrongFlag = true; - } - } - if (paraWrongFlag) - { - printf("\nThe format to arrange parameters is:\n\t./makeRefine_en -param_1 val_1 -param_2 val_2 ... -param_n val_n\n"); - printf("\nPlease check your input parameters!\n\n"); - return -1; - } - - init(); - NCRcalculation(); - // findResolutionDistribution(); return 0; - initResultFullList(SVoutputFile); - getChromosomeList(chrMapFile); - initStatData(); - //new content - bool canOF = true; - numOfChr = 24; - for (LL tempChr = 0; tempChr < (LL)listOfChromosome.size(); tempChr++){ - LL chrID = listOfChromosome[tempChr]; - if (chrID > numOfChr) continue; - char buff[200]; - char nameOF[1000]; - strcpy(nameOF, outputFileLocation); - sprintf(buff, "%lld_%d",inputTrio,chrID); - strcat(nameOF, buff); - strcat(nameOF, ".bmap"); - if ((inputOptAlign = fopen(nameOF, "r")) == NULL) - { - canOF = false; - break; - } - } - getValue(); - if (!canOF) - { - readSourceFile(); - getValue(); - puts("DONE readSourceFile"); - addSplitedMap(); - puts("DONE addSplitedMap"); - outputToBillMapDestinationFile(); - getValue(); - } - //content end - for (LL tempChrId = 0; tempChrId < (LL)listOfChromosome.size(); tempChrId++){ -// chrType chromosome; -// printf("Start process chr:\t"); - chrId = listOfChromosome[tempChrId]; - //if (chrId > numOfChr) continue; - // for (chrId = 12; chrId <= 12; chrId++){ -// printf("%lld\n",chrId); - canOpenFile = true; - initData(); -// printf("Doing %d\n", chrId); - readChromosomeInfo(chrId,chrMapFile); -// printf("Done Read Chromosome\n"); - readOpticalAlign(chrId,outputFileLocation); - if (!canOpenFile) - { - printf("Open File fails?\n"); - continue; - } - // printf("Done Read Optical\n"); - //printDistancePair("test_output/Distance_before_parsing"); - printf("Distance Pair Count-1: %lld\n", distancePairCount); - parseHitEnum(); -// printf("Done parsehitEnum\n"); - printf("Distance Pair Count-2: %lld\n", distancePairCount); - completePairs(); -// printf("pair count: %lld\n", distancePairCount); - //printDistancePair("test_output/Distance_after_parsing"); - detectByDistancePair(); - //printDistancePair("test_output/Distance_after_detection"); - printf("Distance Pair Count-3: %lld\n", distancePairCount); -// printf("Done detectByDistancePair\n"); - -// processDistancePair(); -// printf("Done processDistancePair\n"); - //printDistancePair("test_output/Distance_after_processing"); -// printf("Distance Pair Count-4: %lld\n", distancePairCount); -// printf("Number of sites: %lld\n", chromosome.numberOfSites); - deletionTest(chrId); -// printf("Done Deletion\n"); - insertionTest(chrId); - // printf("Done Insertion\n"); - } - correctOverlapVariant(); - printf("Memory usage: "); - getValue(); - - outputVariantResult(argv,argc); -// printf("Number of No support SV: %lld\n", numberOfNoSupportSV); - printf("Number of support SV: %lld\n", numberOfSupportedSV); - printf("\n"); - return 0; -} From df40d7176c5acf5ee76298313a473916724a46c2 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:27:23 +0800 Subject: [PATCH 06/13] Update readme.md --- source/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/readme.md b/source/readme.md index 986da43..7fa1da9 100644 --- a/source/readme.md +++ b/source/readme.md @@ -1,2 +1,2 @@ The c++ source files (c++11, compiled by g++) are put in this folder. -OMSV\_v3.cpp and OMSV\_mixedIndel\_v3.cpp are the original code (consistent with published work). +OMSV\_v4.cpp, which combined the two components, is the improved version and its results are slightly changed. From 5dcfb37ec91f1658cb2d7923719e73fd7bdf1815 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:27:45 +0800 Subject: [PATCH 07/13] Add files via upload --- source/OMSV_v4.cpp | 2048 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2048 insertions(+) create mode 100644 source/OMSV_v4.cpp diff --git a/source/OMSV_v4.cpp b/source/OMSV_v4.cpp new file mode 100644 index 0000000..30ffe89 --- /dev/null +++ b/source/OMSV_v4.cpp @@ -0,0 +1,2048 @@ +#include "NoError.h" +// Can choose Alden, Ref, CHM1, CHM1_2, Angel1, Angel2, Angel3, Angel4, +// Ref_Sim, Alden_sim_T0, Sim_Ref_OMDP, Sim_Alden_OMDP +// Pipeline_Real, Pipeline_Sim, CHM1ToASM, Autism, Schizo +// Real_Ref, Real_Assembly_Hg38, Real_Molecule_Assembly +// Sim_Haploid_Pipeline, Sim_Diploid_Pipeline +// Sim_Hap_Ref, Sim_Dip_Ref +// YH, Sim2_Haploid_OMBlast +// Han // Alden1_1, Alden1_1000, Alden2_1, Alden2_1000 +//#define Alden2_1000 + +FILE* inputChrConsen; +FILE* inputOptAlign; +FILE* outputResult1; +FILE* outputResult2; +FILE* outputVariantResultFile; +char inputAlignmentFileName[1000]; +char outputFileLocation[500]; +FILE *inputAlignmentFile; +FILE *outputFileList[500]; +LL numberOfOpticalMap; +//#if defined (Real_Assembly_Hg38) || defined(Real_Molecule_Assembly) //FILE* outputMoleculeToAssembly; //FILE* outputAssembly; +//#endif +char tempString[10000]; +LL tempLongLong; +double tempDouble; +double tempDistanceDouble[20000]; +double C[9000][9000]; +bool canOpenFile = true; + + +double minIndelSize; +double minIndelRatio; +double resolutionLimit; +LL inputTrio; +double digestionRate; +double falseCutRate; +double pValueCutOff; +double cauchyMean; +double cauchyScale; +LL numberOfSupportEachSide; +double confidenceLimit; +LL numberOfSupportIndelMolecule; +LL numberOfSupportSignalMolecule; +double likelihoodRatioCutOff; +//Start add +struct opticalMapType{ + char mapId[100]; + LL refStartIndex; + LL refEndIndex; + LL refStart; + LL refEnd; + LL optStart; + LL optEnd; + LL chrId; + bool orientation; + double score; + double confidence; +// char hitEnum[1000]; + char hitEnum[1000]; + double fpr=0;//// + double fnr=0;//// + double alignRate=0;//// + vector position; +/* void calc()//// + {//// + int tot_fp = 0, tot_fn = 0; + for (int i = 0; hitEnum[i]!='\0'; i++) + { + int curr = 0; + int fp, fn; + int j; + for (j = i; hitEnum[j] >= '0' && hitEnum[j] <= '9'; j++) + { + curr = curr * 10 + (hitEnum[j] - '0'); + } + if (hitEnum[j] == 'I') + { + fp++; + tot_fp += curr; + } + else if (hitEnum[j] == 'D') + { + fn++; + tot_fn += curr; + } + i = j; + } + int mol_len = 0; + int ali_len = 0; + int cnt = 0; + for (auto ele:position) + { + cnt++; + mol_len += ele; + if (cnt >= min(optStart,optEnd)&&cnt <= max(optStart,optEnd)) ali_len+=ele; + } + fpr = tot_fp*1.0/ali_len; + fnr = tot_fn*1.0/(abs(refEndIndex-refStartIndex)+1); + alignRate = ali_len*1.0/mol_len; + }////*/ + void print(){ + printf("%s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", mapId, score, confidence, fpr,fnr,alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); +// printf("%s %lf %lf %lld %lld %lld %lld %lld %s %lld ", mapId, score, confidence, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); + for (LL i=0; i<(LL)position.size(); i++) + printf("%d ", position[i]); + printf("\n"); + } + void print(FILE* targetFile){ + sprintf(tempString, "%lld", inputTrio); + fprintf(targetFile, "%s %s %lf %lf %lf %lf %lf %lld %lld %lld %lld %lld %s %lld ", tempString, mapId, score, confidence, fpr, fnr, alignRate, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); +// fprintf(targetFile, "%s %s %lf %lf %lld %lld %lld %lld %lld %s %lld ", tempString, mapId, score, confidence, chrId, optStart, optEnd, refStart, refEnd, hitEnum, (LL)position.size()); + for (LL i=0; i<(LL)position.size(); i++) + fprintf(targetFile, "%d ", position[i]); + fprintf(targetFile, "\n"); + } +}; +bool ss(opticalMapType q, opticalMapType w){ + LL e = strcmp(q.mapId, w.mapId); + return ((e<0?true:false) || (e==0 && q.chrId < w.chrId) || (e == 0 && q.chrId == w.chrId && q.optStart < w.optEnd)); +} + +vector opticalMap1; + +void readSourceFile(){ + if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); + int numberOfDL = 0; + char ttt; + char ttts[100000]; + while(fgetc(inputAlignmentFile) == '#') + { + numberOfDL++; + fgets(ttts,100000,inputAlignmentFile); + } + LL totSize = 0; + while(fscanf(inputAlignmentFile,"%s",ttts)==1){ + totSize++; + fgets(ttts,100000,inputAlignmentFile); + } + fclose(inputAlignmentFile); + if ((inputAlignmentFile = fopen(inputAlignmentFileName, "r")) == NULL) puts("ERROR IN READ SOURCE"); + opticalMap1.clear(); + opticalMap1.resize(totSize+10000); +// opticalMap1 = new opticalMapType[20000000]; + for (LL i=0; i 0 + && opticalMap1[i+1].refStart - opticalMap1[i].refEnd < 5000000){ + opticalMap1[tempCC] = opticalMap1[i]; + opticalMap1[tempCC].optStart = opticalMap1[i].optEnd; + opticalMap1[tempCC].optEnd = opticalMap1[i+1].optStart; + opticalMap1[tempCC].refStart = opticalMap1[i].refEnd; + opticalMap1[tempCC].refEnd = opticalMap1[i+1].refStart; + opticalMap1[tempCC].score = 10.0; + opticalMap1[tempCC].confidence = 0.1; + opticalMap1[tempCC].orientation = true; +// opticalMap1[tempCC].calc();//// + opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// + opticalMap1[tempCC].fnr = opticalMap1[i].fnr + opticalMap1[i+1].fnr;//// + //opticalMap1[tempCC].fpr = opticalMap1[i].fpr + opticalMap1[i+1].fpr;//// + opticalMap1[tempCC].position.clear(); +// opticalMap1[tempCC].hitEnum = new char[4]; + strcpy(opticalMap1[tempCC].hitEnum, "FFF"); + if (opticalMap1[tempCC].optStart > opticalMap1[tempCC].optEnd) + tempCC--; + tempCC++; + if (tempCC==totSize){ + totSize+=50000; + opticalMap1.resize(totSize); + } + } + } + numberOfOpticalMap = tempCC; + opticalMap1.resize(numberOfOpticalMap); +} + +void outputToBillMapDestinationFile(){ + for (LL i=0; i numOfChr) continue; + if (outputFileList[opticalMap1[i].chrId] == NULL) + initOutput(opticalMap1[i].chrId,"w+"); + else + initOutput(opticalMap1[i].chrId,"a+"); + opticalMap1[i].print(outputFileList[opticalMap1[i].chrId]); + fclose(outputFileList[opticalMap1[i].chrId]); + } + opticalMap1.clear(); + vector().swap(opticalMap1); +} + + +//stop add + +void setDefault() { + digestionRate = 0.875; + falseCutRate = 0.000010; + pValueCutOff = 1e-9; + cauchyMean = 1.0096; + cauchyScale = 0.0291; + numberOfSupportEachSide = 1; + confidenceLimit = 9; + numberOfSupportIndelMolecule = 10; //coverage of the indel + numberOfSupportSignalMolecule = 10; + likelihoodRatioCutOff = 1e6; // Settings + minIndelSize = 2000; + minIndelRatio = 0.05; + resolutionLimit = 700; + inputTrio = 0507; +} + +LL newChrSite = 0; +LL numberOfHomoMissingFragment = 0; +LL numberOfHomoDeletedSNP = 0; +LL homoMissingSite = 0; +LL numberOfNoSupportSV = 0; +LL homoNonMissingSite = 0; +LL homoInsertSite = 0; +LL homoNonInsertSite = 0; LL heterMissingSite = 0; LL heterNonMissingSite = 0; LL heterInsertSite = 0; LL heterNonInsertSite = 0; LL numberOfNickingSite = 0; LL numberOfOM878 = 0; LL numberOfOM891 = 0; +LL numberOfDetailOpticalMap = 0; LL distancePairCount = 0; LL numberOfVariant = 0; LL numberOfSupportedSV = 0; + +LL tempPre = -1; LL tempCounter = 0; vector listOfChromosome; + +// shortestDist[10] refers to distance between 9-11 +//double shortestDist[500000]; +//LL shortestDistCount[500000]; + +// shortestGapDist[10] refers to distance between 10-11 +//double shortestGapDist[500000]; +//LL shortestGapDistCount[500000]; + + +// 0 = Homo Ins/Del // 1 = Homo Non-Ins/Del // 2 = Heter Ins/Del // 3 = Heter Non-Ins/Del // 4 = nothing + +LL curId; int chrId; + + +bool feq(double x, double y){ + //return fabs(x - y) <= eps; + return fabs(x - y) <= eps*(min(fabs(x),fabs(y)));//should be normalized by the values of x and y +} +bool feq1(double x, double y){ + return fabs(x - y) <= eps; + // return fabs(x - y) <= eps*fabs(min(x,y));//should be normalized by the values of x and y +} + + +LL max(LL x, LL y){ + return x > y ? x : y; +} + +LL min(LL x, LL y){ + return x > y ? y : x; +} + +double average(vector a){ + double ans = 0; + for (LL i=0; i<(LL)a.size(); i++) + ans += a[i]; + ans /= a.size(); + return ans; +} + + +struct variantType{ + LL people; + LL chr; + LL start; + LL end; + LL size; + LL sizeExt; + LL support; +// double score; +// double confidence; +// double fpr;//// + // double fnr;//// +// double alignRate;//// + LL oppoSupp;//// + double ratio; + double ratioExt; + double likelihood; + bool isSignal; + bool isDel; + bool isHomo; + bool isSupported; + void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSizeExt, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double inputRatioExt, double likely, LL supp, LL IoppoSupp) + {//version for SV +// fpr = Ifpr;//// + // fnr = Ifnr;//// + // alignRate = IalignRate;//// + oppoSupp = IoppoSupp;//// +// score = Iscore; +// confidence = Iconfidence; + people = inputPeople; + chr = inputChr; + start = inputStart; + end = inputEnd; + size = inputSize; + sizeExt = inputSizeExt; + ratio = inputRatio; + ratioExt = inputRatioExt; + likelihood = likely; + support = supp; + isSignal = (inputSignal == 1); + isDel = (inputDel == 1); + isHomo = (inputHomo == 1); + isSupported = (inputSupport == 1); + }; + void update(LL inputPeople, LL inputChr, LL inputStart, LL inputEnd, LL inputSize, LL inputSignal, LL inputDel, LL inputHomo, LL inputSupport, double inputRatio, double likely, LL supp, LL IoppoSupp) + {//version for signal changes +// fpr = Ifpr;//// + // fnr = Ifnr;//// + // alignRate = IalignRate;//// + oppoSupp = IoppoSupp;//// +// score = Iscore; +// confidence = Iconfidence; + people = inputPeople; + chr = inputChr; + start = inputStart; + end = inputEnd; + size = inputSize; + sizeExt = 0; + ratio = inputRatio; + ratioExt = 0; + likelihood = likely; + support = supp; + isSignal = (inputSignal == 1); + isDel = (inputDel == 1); + isHomo = (inputHomo == 1); + isSupported = (inputSupport == 1); + }; + bool operator<(const variantType &x) const{ + return (chr < x.chr || (chr == x.chr && start < x.start) || (chr == x.chr && start == x.start && end < x.end) || (chr == x.chr && start == x.start && end == x.end && isSupported)); + } + + /*void print(){ + printf("%lld %lld %lld %lld %lld %d %d %d\n", people, chr, start, end, size, isSignal, isDel, isHomo); + };*/ +}; +struct chrType{ + LL length; + LL numberOfSites; +// LL position[50000]; + // distance[0] = position[1] - position[0] +// LL distance[50000]; +// LL oldDistance[50000]; +// LL gapDistance[50000]; + // coverage[1] refer to nicking site 1 +// LL coverage[50000]; + // occurrence[1] refer to nicking site 1 +// LL occurrence[50000]; + // gapCount[1] refers to gap between site 1 to 2 +// LL gapCount[50000]; +// LL gapCoverage[50000]; + // If it is significant by Kevin's definition +// bool gapSigni[50000]; +// bool signi[50000]; + // distance Difference +// double distanceDifference[50000]; +// double gapDistanceDifference[50000]; + LL* position = new LL[10]; + LL* distance = new LL[10]; + int* coverage = new int[10]; + int* occurrence = new int[10]; + int* gapCount = new int[10]; + int* gapCoverage = new int[10]; + bool* gapSigni = new bool[10]; + bool* signi = new bool[10]; +}; +chrType chromosome; + +struct optAlignType{ + LL belongs; + char mapId[100]; + LL optStart; + LL optEnd; + LL refStart; + LL refEnd; + LL numberOfSites; + double score; + double confidence; + char hitEnum[1200]; + + // position = distance[i+1] - distance[i] + int position[2500]; + int oldPosition[2500]; + double fpr;//// + double fnr;//// + double alignRate;//// + +}; + +struct distanceType{ + char mapId[20]; + LL start; + LL end; +// double score; +// double confidence; +// double fpr;//// + // double fnr;//// + // double alignRate;//// + double distance; + LL cnt; + bool operator<(const distanceType &x) const{ + return (start < x.start || (start == x.start && end < x.end)); + } + void print(){ + printf("chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end]-chromosome.position[start], distance, cnt); + } + void printToFile(FILE* outputFile){ + fprintf(outputFile, "chrId:%d, id:%s start:%lld end:%lld oldDis:%lld, molDis:%lf cnt:%lld\n", chrId, mapId, chromosome.position[start], chromosome.position[end], chromosome.position[end] - chromosome.position[start], distance, cnt); + } +}; +vector opticalMap; +vector distancePair; +vector variant; +void init(){ + // Has reduced by half +// opticalMap = new optAlignType[3000000]; +// distancePair = new distanceType[100000000]; + variant.clear(); + variant.resize(100000); +} +char outputFolder[1000]; + +FILE* createInputTrioFile(const char *suffix, LL inputTrio, const char *mode){ + char nameOfFile[1000]; + char buffer[200]; + memset(buffer, 0, sizeof(buffer)); + strcpy(nameOfFile, outputFolder); + sprintf(buffer, "%lld_%g_", inputTrio,minIndelSize); + strcat(nameOfFile, buffer); + strcat(nameOfFile, suffix); + //printf("The file name is: %s\nAnd mode is %s\n",nameOfFile,mode); + FILE* retFile = fopen(nameOfFile, mode); + if (retFile == NULL) perror("error in opening file!"); + return retFile; +} + + + +void initResultFullList(char* SVoutputFile){ + char nameOfFile[1000]; + char buffer[200]; + char extPart[200]; + outputResult1 = createInputTrioFile("Distance_Pairs_List.txt", inputTrio, "w+"); + outputResult2 = createInputTrioFile("Signal_Indels_List.txt", inputTrio, "w+"); /* +#if defined(Real_Molecule_Assembly) + outputMoleculeToAssembly = createInputTrioFile("/local/shared/bill/BillMapFromREAL_Molecule_ASM_HG38/distanceList/", "moleculeToAssembly_List.txt", inputTrio, "w+"); +#endif if defined(Real_Assembly_Hg38) +*/ + sprintf(extPart,"_%g",minIndelSize); + memset(nameOfFile, 0, sizeof(nameOfFile)); + strcpy(nameOfFile, outputFolder); + sprintf(buffer, "%lld", inputTrio); + strcat(nameOfFile, buffer); + strcat(nameOfFile, SVoutputFile); + strcat(nameOfFile, extPart); + strcat(nameOfFile, ".osv"); +// printf("%s\n",nameOfFile); + if ((outputVariantResultFile = fopen(nameOfFile, "w+")) == NULL){ + perror("error opening file(): nameOfFile"); + } +} + +bool overlapped(LL x1, LL y1, LL x2, LL y2){ + if (x1 >= x2 && x1 <= y2) return true; + if (y1 >= x2 && y1 <= y2) return true; + if (x2 >= x1 && x2 <= y1) return true; + if (y2 >= x1 && y2 <= y1) return true; + return false; +} +void getChromosomeList(char* chrMapFile){ + listOfChromosome.clear(); + char nameOfFile[1000]; + strcpy(nameOfFile, chrMapFile); + if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL){ + perror("error opening file(): ChromosomeInfo"); + } + fscanf(inputChrConsen,"%s",tempString); + while(tempString[0]=='#'){ + fgets(tempString, 10000, inputChrConsen); + fscanf(inputChrConsen,"%s",tempString); + } + LL tempChrId, tempNumberOfSites; + double tempDouble1, tempDouble2; + tempChrId = atoll(tempString); + if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); + listOfChromosome.push_back(tempChrId); + fgets(tempString, 10000, inputChrConsen); + while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ + listOfChromosome.push_back(tempChrId); + fgets(tempString, 10000, inputChrConsen); + } + fclose(inputChrConsen); + sort(listOfChromosome.begin(), listOfChromosome.end()); + vector::iterator tempIt; + tempIt = unique(listOfChromosome.begin(), listOfChromosome.end()); + listOfChromosome.resize(distance(listOfChromosome.begin(), tempIt)); + printf("Number of Chromosomes: %lld\n", (LL)listOfChromosome.size()); +} + +void readChromosomeInfo(int chr,char* chrMapFile){ + char nameOfFile[1000]; + strcpy(nameOfFile, chrMapFile); + if ((inputChrConsen = fopen(nameOfFile, "r")) == NULL){ + perror("error opening file(): ChromosomeInfo"); + } + fscanf(inputChrConsen,"%s",tempString); + while(tempString[0]=='#'){ + fgets(tempString, 10000, inputChrConsen); + fscanf(inputChrConsen,"%s",tempString); + } + LL cc = 0; + LL tempChrId, tempNumberOfSites; + double tempDouble1, tempDouble2; + tempChrId = atoll(tempString); + bool done= false; + delete[] chromosome.position; + delete[] chromosome.distance; + delete[] chromosome.coverage; + delete[] chromosome.occurrence; + delete[] chromosome.gapCount; + delete[] chromosome.gapCoverage; + delete[] chromosome.gapSigni; + delete[] chromosome.signi; + if (fscanf(inputChrConsen, "%lf %lld %lld %lld %lf %lf %lf %lf", &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) != 8) perror("Wrong in read chromosome maps!"); + if (tempChrId == chr){ + chromosome.numberOfSites = tempNumberOfSites; + chromosome.position = new LL[tempNumberOfSites+10]; + chromosome.distance = new LL[tempNumberOfSites+10]; + chromosome.coverage = new int[tempNumberOfSites+10]; + chromosome.occurrence = new int[tempNumberOfSites+10]; + chromosome.gapCount = new int[tempNumberOfSites+10]; + chromosome.gapCoverage = new int[tempNumberOfSites+10]; + chromosome.gapSigni = new bool[tempNumberOfSites+10]; + chromosome.signi = new bool[tempNumberOfSites+10]; + done = true; + chromosome.position[cc] = (LL) tempDouble2; + chromosome.length = (LL) tempDouble1; + chromosome.gapCount[cc] = 0; + chromosome.coverage[cc] = 0; + chromosome.occurrence[cc] = 0; + cc++; + } + fgets(tempString, 10000, inputChrConsen); + while (fscanf(inputChrConsen, "%lld %lf %lld %lld %lld %lf %lf %lf %lf", &tempChrId, &tempDouble1, &tempNumberOfSites, &tempLongLong, &tempLongLong, &tempDouble2, &tempDouble, &tempDouble, &tempDouble) == 9){ + if (tempChrId == chr){ + chromosome.numberOfSites = tempNumberOfSites; + if (!done){ + chromosome.position = new LL[tempNumberOfSites+10]; + chromosome.distance = new LL[tempNumberOfSites+10]; + chromosome.coverage = new int[tempNumberOfSites+10]; + chromosome.occurrence = new int[tempNumberOfSites+10]; + chromosome.gapCount = new int[tempNumberOfSites+10]; + chromosome.gapCoverage = new int[tempNumberOfSites+10]; + chromosome.gapSigni = new bool[tempNumberOfSites+10]; + chromosome.signi = new bool[tempNumberOfSites+10]; + done = true; + } + chromosome.position[cc] = (LL) tempDouble2; + chromosome.length = (LL) tempDouble1; + chromosome.gapCount[cc] = 0; + chromosome.coverage[cc] = 0; + chromosome.occurrence[cc] = 0; + cc++; + } + fgets(tempString, 10000, inputChrConsen); + } + chromosome.numberOfSites++; + for (LL i=0; i tempMap; +// vector tempVec[10000]; + LL oppoSupp = 0; +/* bool vectorOverFlow = false; + for (LL i=start-1; i>=0; i--){ + if (distancePair[i].start >= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ + string mapFrom(distancePair[i].mapId); + LL mapTo = (LL)tempMap.size() + 1; + if (tempMap.count(mapFrom) == 0){ + tempMap.insert(make_pair(mapFrom, mapTo)); + } + LL newIndex = tempMap[mapFrom]; + if (newIndex >= 10000){ + vectorOverFlow = true; + break; + } + tempVec[newIndex].push_back(distancePair[i]); + } else + break; + } + if (vectorOverFlow) return; + for (LL i=end+1; i= distancePair[start].start && distancePair[i].end <= distancePair[start].end){ + string mapFrom(distancePair[i].mapId); + LL mapTo = (LL)tempMap.size() + 1; + if (tempMap.count(mapFrom) == 0){ + tempMap.insert(make_pair(mapFrom, mapTo)); + } + LL newIndex = tempMap[mapFrom]; + if (newIndex >= 10000){ + vectorOverFlow = true; + break; + } + tempVec[newIndex].push_back(distancePair[i]); + } else + break; + } + if (vectorOverFlow) return;*/ + double averageScore = 0;//// + double averageConfidence = 0;//// + double averageFPR = 0;//// + double averageFNR = 0;//// + double averageAlignRate = 0;//// +/* for (LL i=1; i<(LL)tempMap.size()+1; i++){ + double tempMoleCoverSize = 0; + double tempMoleRealSize = 0; + double tempScore = 0;//// + double tempConfidence =0;//// + double tempFPR = 0;//// + double tempFNR = 0;//// + double tempAlignRate = 0;//// + for (LL j=0; j<(LL)tempVec[i].size(); j++){ + tempMoleCoverSize += (chromosome.position[tempVec[i][j].end] - chromosome.position[tempVec[i][j].start]); + tempMoleRealSize += tempVec[i][j].distance; + } + if (feq1(tempMoleCoverSize, referenceDistance)) + { + tempDistanceDouble[cnt++] = tempMoleRealSize/referenceDistance; + if (abs(tempMoleRealSize - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// + } + } + //double averageScore = 0; + //double averageConfidence = 0; + for (LL i=start; i<=end; i++) + { + tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; + if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// + } + sort(tempDistanceDouble, tempDistanceDouble + cnt);*/ + + vector maps; + vector dists; + maps.clear(); + dists.clear(); +// fprintf(outputResult1, "%d %lld %lld %lld %lld\t", chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], distancePair[start].start+1, distancePair[start].end+1); + for (LL i=start; i<=end; i++) + { + bool fal = true; + for (LL j = 0; j < (LL)maps.size(); j++) + { + if (maps[j] == atol(distancePair[i].mapId) && abs(dists[j] - distancePair[i].distance) < 1){fal = false;break;} + } + if (fal) + { + maps.push_back(atol(distancePair[i].mapId)); + dists.push_back(distancePair[i].distance); + tempDistanceDouble[cnt++] = distancePair[i].distance/referenceDistance; + // fprintf(outputResult1, "%lld:%lld ",atol(distancePair[i].mapId),(LL)distancePair[i].distance); + //printf("Let me see: %g\t%g\t%lld\n", distancePair[i].distance,referenceDistance,oppoSupp); + if (abs(distancePair[i].distance - referenceDistance)*2 <= minIndelSize) oppoSupp++;//// + } + } + // fprintf(outputResult1, "%lld\n",(LL)maps.size()); + // for (LL i = 0; i < (LL)maps.size(); i++) + // fprintf(outputResult1, "%lld:%lld ",maps[i],(LL)dists[i]); + //fprintf(outputResult1, "\n"); +// fprintf(outputResult1, "%lld\n",cnt); + sort(tempDistanceDouble, tempDistanceDouble + cnt); + if (cnt < numberOfSupportIndelMolecule) {//why 2500? + // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0); + // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0); + // variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0); + numberOfNoSupportSV++; + return; + } + //use another vector to remove the outliers in two ends to purify the results + //int outRange = static_cast(cnt/10)>1?static_cast(cnt/10):1; + int outlierRange = 0; + double tempDistanceDoubleNoOutlier[cnt]; + LL cntNoOutlier = 0; + for (LL i = outlierRange; i < cnt-outlierRange;) + { + tempDistanceDoubleNoOutlier[cntNoOutlier++] = tempDistanceDouble[i++]; + } + double sampleMean = 0; + for (LL i=0; i nullHypothesisLikelihood) + { + // Update Homo Indel Variant: also need to consider HI1I2, HD1D2, HDI + if (feq(tempMaxLikelihood, homoIndelHypothesisLikelihood)) + { + if (homoIndelHypothesisLikelihood/nullHL > hetIHL/nullHL*hetDHL/nullHL) + { // if homoHL_ratio > hetIHL_ratio*hetDHL_ratio, we consider homo Indel + //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; + if (tempSizeChange < 0) + { + posFlag = 1; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, tempMaxLikelihood, cnt, oppoSupp); + } + else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) + { + posFlag = 2; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, tempMaxLikelihood, cnt, oppoSupp); + } + } + else if ( ( hetIHL/likelihoodRatioCutOff>nullHL )&&( hetDHL/likelihoodRatioCutOff>nullHL ) )//(hetIHL/nullHL>likelihoodRatioCutOff&&hetDHL/nullHL>likelihoodRatioCutOff) + { // if homoHL_ratio > hetIHL_ratio*hetDHL_ratio, we consider the potential HI1I2, HD1D2, HDI + double disRight = heterInsHypothesisMaxLikelihoodSizeChanged; + double disLeft = heterDelHypothesisMaxLikelihoodSizeChanged; + double disBias = disRight - disLeft;//&&(max(hetIHL/hetDHL,hetDHL/hetIHL)<1e10) + if ( ( cnt>=numberOfSupportIndelMolecule*1.5)&&\ + ( fabs(disBias)>=max(minIndelSize,minIndelRatio*max(fabs(disRight),fabs(disLeft))) ) ) + //( hetIHL/likelihoodRatioCutOff/likelihoodRatioCutOff>nullHL )&&( hetDHL/likelihoodRatioCutOff/likelihoodRatioCutOff>nullHL ) ) + {// if all three differences are significant, then call HI1I2, HD1D2, HDI resp., and store them into separate two variants with the same ref region first + if (disRight > 0) + { + posFlag = 3; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag, hetIHL, cnt, oppoSupp); + } + else + { + posFlag = 4; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 1, 0, 1, hetIHL/nullHL, posFlag, hetIHL, cnt, oppoSupp); + } + if (disLeft > 0) + { + posFlag = 5; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 0, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); + } + else + { + posFlag = 6; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); + } + } + else + {// else also output homo Indel + //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; + if (tempSizeChange < 0) + { + posFlag = 7; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag, homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) + { + posFlag = 8; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + } + } + else + { + if (tempSizeChange < 0) + { + posFlag = 9; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) + { + posFlag = 10; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + } + } + // Update Heter InDel Variant, need to modified tomorrow + else if (feq(tempMaxLikelihood, hetIHL)||feq(tempMaxLikelihood, hetDHL)) + { + double disRight = heterInsHypothesisMaxLikelihoodSizeChanged; + double disLeft = heterDelHypothesisMaxLikelihoodSizeChanged; + double disBias = disRight - disLeft; + + if ( (hetDHL/likelihoodRatioCutOff > nullHL) && (hetIHL/likelihoodRatioCutOff > nullHL) ) + {//if two sides are both confident, then we need to check the possible HI1I2, HD1D2, and HDI + //printf("Actually no such cases?\n"); + if ( (cnt>=numberOfSupportIndelMolecule*1.5 )&&\ + ( fabs(disBias)>=max(minIndelSize,minIndelRatio*max(fabs(disRight),fabs(disLeft))) ) ) + //( min(hetDHL,hetIHL)/likelihoodRatioCutOff/likelihoodRatioCutOff > nullHL ) ) + {// if all three differences are significant, then call HI1I2, HD1D2, HDI resp., and store them into separate two variants with the sam$ + if (disRight > 0) + { + posFlag = -1; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL, cnt, oppoSupp); + } + else + { + posFlag = -2; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 1, 0, 1, hetIHL/nullHL, posFlag, hetIHL,cnt, oppoSupp); + } + if (disLeft > 0) + { + posFlag = -3; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 0, 0, 1, hetDHL/nullHL, posFlag, hetDHL, cnt, oppoSupp); + } + else + { + posFlag = -4; + // printf("Find one case with position flag %d\n",posFlag); + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt, oppoSupp); + } + } + else if ((disRight*disLeft>0)&&(homoIndelHypothesisLikelihood/nullHL > max(max(hetIHL,hetDHL)/min(hetIHL,hetDHL),likelihoodRatioCutOff)))//is it appropriate? No, I need to + {// else if homo is more significant when adjust the heter case (reduce significance if both side support the same case), then output homo Indel + //double tempSizeChange = (sampleMean - cauchyMean) * referenceDistance; + if (tempSizeChange < 0) + { + posFlag = -5; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 1, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + else //if (tempSizeChange >= max(minIndelSize, minIndelRatio * referenceDistance)) + { + posFlag = -6; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], tempSizeChange, 0, 0, 0, 1, 1, homoIndelHypothesisLikelihood/nullHL, posFlag,homoIndelHypothesisLikelihood, cnt, oppoSupp); + } + } + else + if (hetIHL>hetDHL) + { + posFlag = -7; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL,cnt, oppoSupp); + } + else //if ((hetIHL= max(minIndelSize, minIndelRatio * referenceDistance))) + { + posFlag = -8; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt, oppoSupp); + } + } + else + { + if (hetIHL>hetDHL) + { + posFlag = -9; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disRight, 0, 0, 0, 0, 1, hetIHL/nullHL, posFlag,hetIHL,cnt, oppoSupp); + } + else //if (fabs(disLeft) >= max(minIndelSize, minIndelRatio * referenceDistance)) + { + posFlag = -10; + variant[numberOfVariant++].update(curId, chrId, chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], disLeft, 0, 0, 1, 0, 1, hetDHL/nullHL, posFlag,hetDHL,cnt,oppoSupp); + } + } + } + } +// if (chromosome.position[distancePair[start].start]==9810098) // { // printf("Let me see, the segment if (%lld, %lld), and the position flag is %d.\n",chromosome.position[distancePair[start].start], chromosome.position[distancePair[start].end], posFlag); // } +} + +void completePairs() +{ +// sort(distancePair, distancePair + distancePairCount); + sort(distancePair.begin(), distancePair.end()); + vector extraDP; + extraDP.clear(); + LL ost = -1, oed = -1; + for (LL i = 0; i < distancePairCount; i++) + { + if (distancePair[i].start!= ost || distancePair[i].end!=oed)//this locate the non-redundant reference regions (NRRR) + { + distanceType tempPair; + ost = distancePair[i].start; + oed = distancePair[i].end; + if (oed-ost == 1)continue; + for (LL j = 0; j <= i-1; j++)//this search the possible sub-regions + { + //LL ted = distancePair[j].end; + if (distancePair[j].start < ost) continue; + else if (distancePair[j].start > ost) break; + else//for each sub-region holding the same start point with NRRR, we explore the consecutive regions on the same molecule + { + tempPair = distancePair[j]; + bool rep = true; + for (LL k = j+1; distancePair[k].start= oed) break; + } + if (tempPair.end == oed) //no need + { + rep = false; + for (LL l = i; l < distancePairCount; l++) + { + if (distancePair[i] < distancePair[l])break; + else if (strcmp(distancePair[l].mapId, tempPair.mapId)==0) {rep = true; break;} + } + } + if (!rep) extraDP.push_back(tempPair); + } + } + } + else + continue; + } + printf("There are %lld extra distance pairs are added.\n",(LL)extraDP.size()); + // also need to include the new distance pairs into the original set + distancePair.resize(distancePairCount+extraDP.size()); + for (LL i = 0; i < (LL)extraDP.size(); i++){ + distancePair[distancePairCount++]=extraDP[i]; + } +} +void detectByDistancePair(){ + sort(distancePair.begin(), distancePair.end());// sort(distancePair, distancePair + distancePairCount); + distancePair.resize(distancePairCount+1);// printDistancePair(); + distancePair[distancePairCount].start = -1; + distancePair[distancePairCount].end = -1; + distancePairCount++; + LL tempHead = 0, tempTail = 0, tempPreNumVar, tempPreNumNoSuppVar; + for (LL i=0; i=distancePairCount?0:distancePair[i+1].distance; + } + distancePairCount = cc; + distancePair.resize(cc); +// memset(shortestDist, 0, sizeof(shortestDist)); +// memset(shortestDistCount, 0, sizeof(shortestDistCount)); +// memset(shortestGapDist, 0, sizeof(shortestGapDist)); +// memset(shortestGapDistCount, 0, sizeof(shortestGapDistCount)); +// for (LL i=0; i= '0' && opticalMap[i].hitEnum[j] <= '9'){ + tempCount = tempCount * 10 + (opticalMap[i].hitEnum[j]-'0'); + } + else { + if (opticalMap[i].hitEnum[j] == 'M'){ + for (LL k=0; kdistancePair.size()-10000)distancePair.resize(distancePair.size()+100000); + } + distancePair.resize(distancePairCount); + opticalMap.clear(); + vector().swap(opticalMap); +} +void NCRcalculation(){ + for (int i = 0; i < 9000; ++i){ + C[i][0] = C[i][i] = 1; + for (int j = 1; j < i; ++j) + C[i][j] = (C[i - 1][j] + C[i - 1][j - 1]); + for (int j = i + 1; j < 9000; ++j) + C[i][j] = 0; + } +} + +void deletionTest(int chrId){ +// if (chrId == 1){ +// fprintf(outputResult2, "ID\tChr\tSite_Index\tPosition\tCoverage\tOccurence\tP\tU\tP-value\tH0\tHa\tHb\tRef_Dis\tMolecule_Dis\tCount\tType\n"); +// } + for (LL i=0; i 5000) continue; + + // printf("%lld %lld %lld %lld\n", i, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i]); + numberOfNickingSite++; + double pValue = 0; + double likelihoodh0 = 0; + double likelihoodha = 0; + double likelihoodhb = 0; + double likelihoodRatio = 0; + for (LL j=chromosome.coverage[i] - chromosome.occurrence[i]; j<=chromosome.coverage[i]; j++){ + pValue += C[chromosome.coverage[i]][j]*pow(1-digestionRate, j)*pow(digestionRate, chromosome.coverage[i] - j); + } + likelihoodh0 = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(1-digestionRate, chromosome.coverage[i] - chromosome.occurrence[i])*pow(digestionRate, chromosome.occurrence[i]); + + double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); + double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); + + atLeastOneFalseCutRate = 0.1; + likelihoodha = C[chromosome.coverage[i]][chromosome.occurrence[i]]*pow(atLeastOneFalseCutRate, chromosome.occurrence[i])*pow(1 - atLeastOneFalseCutRate, chromosome.coverage[i] - chromosome.occurrence[i]); + // double likelihoodRatio2 = likelihoodh0/likelihoodha; + likelihoodRatio = pow((1-digestionRate)/(1-atLeastOneFalseCutRate), chromosome.coverage[i] - chromosome.occurrence[i]) * pow(digestionRate/atLeastOneFalseCutRate, chromosome.occurrence[i]); + + LL x = chromosome.coverage[i] - chromosome.occurrence[i]; + LL k = chromosome.coverage[i]; + double p = digestionRate; + double u = atLeastOneFalseCutRate; + for (LL k1 = 0; k1 <= k; k1++){ + double tempLhb = 0; + LL k2 = k - k1; + for (LL y=max(0, x - k2); y <= min(k1, x); y++){ + tempLhb += C[k1][y]*C[k2][x-y]*pow(1-p, y)*pow(p, k1-y)*pow(u, y+k2-x)*pow(1-u, x-y); + } + + likelihoodhb += C[k][k1]*tempLhb; + } + likelihoodhb /= pow(2, k); + + +// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, (i+1=0?chromosome.position[i-1]:0), (LL)shortestDist[i], shortestDistCount[i], "Del"); + + +// chromosome.distanceDifference[i] = (i+1=0?chromosome.position[i-1]:0) - shortestDist[i]; + + + if (pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) + chromosome.signi[i] = true; + else chromosome.signi[i] = false; + + if (chromosome.signi[i]){ + if (likelihoodha > likelihoodhb){ + variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 1, 1, pValue, likelihoodha, x, 0); + } else { + variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i], 1, 1, 1, 0, 1, pValue, likelihoodhb, x, 0); + } + if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); + } + // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.coverage[i], chromosome.occurrence[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Miss"); + } +} + +void insertionTest(int chrId){ + for (LL i=0; i 5000) continue; + + double pValue = 0; + double likelihoodh0 = 0; + double likelihoodha = 0; + double likelihoodhb = 0; + double likelihoodRatio = 0; + double sizeDifference = (i == chromosome.numberOfSites - 1 ? 10000 : chromosome.position[i+1] - chromosome.position[i]); + double atLeastOneFalseCutRate = 1 - pow(2.718281828, -falseCutRate*sizeDifference); + + + /* + TEST + atLeastOneFalseCutRate = 0.1; + TEST + */ + for (LL j=chromosome.gapCount[i]; j<=chromosome.gapCoverage[i]; j++){ + pValue += C[chromosome.gapCoverage[i]][j]*pow(atLeastOneFalseCutRate, j)*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - j); + } + likelihoodh0 = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(atLeastOneFalseCutRate, chromosome.gapCount[i])*pow(1-atLeastOneFalseCutRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); + likelihoodha = C[chromosome.gapCoverage[i]][chromosome.gapCount[i]]*pow(digestionRate, chromosome.gapCount[i])*pow(1-digestionRate, chromosome.gapCoverage[i] - chromosome.gapCount[i]); + likelihoodRatio = pow((1-atLeastOneFalseCutRate)/(1-digestionRate), chromosome.gapCoverage[i] - chromosome.gapCount[i]) * pow(atLeastOneFalseCutRate/digestionRate, chromosome.gapCount[i]); + + // printf("%10lE %10lE\n", likelihoodh0 / likelihoodha, likelihoodRatio); + + LL x = chromosome.gapCount[i]; + LL k = chromosome.gapCoverage[i]; + double p = digestionRate; + double u = atLeastOneFalseCutRate; + for (LL k1 = 0; k1 <= k; k1++){ + double tempLhb = 0; + LL k2 = k - k1; + for (LL y=max(0, x - k2); y <= min(k1, x); y++){ + tempLhb += C[k1][y]*C[k2][x-y]*pow(u, y)*pow(1-u, k1-y)*pow(1-p, y+k2-x)*pow(p, x-y); + } + likelihoodhb += C[k][k1]*tempLhb; + } + likelihoodhb /= pow(2, k); + + +// fprintf(outputResult2, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%lld\t%lld\t%lld\t%s\n", curId, chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, ((i+1= numberOfSupportSignalMolecule && pValue <= pValueCutOff && ((likelihoodh0 * likelihoodRatioCutOff <= likelihoodha) || (likelihoodh0 * likelihoodRatioCutOff <= likelihoodhb))) + chromosome.gapSigni[i] = true; + else chromosome.gapSigni[i] = false; + + if (chromosome.gapSigni[i]){ + if (likelihoodha > likelihoodhb){ + variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 1, 1, pValue, likelihoodha, x, 0); + } else { + variant[numberOfVariant++].update(curId, chrId, chromosome.position[i], chromosome.position[i+1], 1, 1, 0, 0, 1, pValue, likelihoodha, x, 0); + } + if (numberOfVariant>variant.size()-1000)variant.resize(variant.size()+10000); + } + // fprintf(outputResult1, "%lld\t%d\t%lld\t%lld\t%lld\t%lld\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%10lE\t%s\n", curId,chrId, i+1, chromosome.position[i], chromosome.gapCoverage[i], chromosome.gapCount[i], p, u, pValue, likelihoodh0, likelihoodha, likelihoodhb, "Homo_Ins"); + } +} + +void initStatData(){ + homoMissingSite = 0; + homoNonMissingSite = 0; + homoInsertSite = 0; + homoNonInsertSite = 0; + heterMissingSite = 0; + heterNonMissingSite = 0; + heterInsertSite = 0; + heterNonInsertSite = 0; + numberOfNickingSite = 0; + numberOfHomoDeletedSNP = 0; + numberOfHomoMissingFragment = 0; + numberOfVariant = 0; + numberOfNoSupportSV = 0; +} +void initData(){ + memset(chromosome.position, 0, sizeof(chromosome.position)); + memset(chromosome.coverage, 0, sizeof(chromosome.coverage)); + memset(chromosome.occurrence, 0, sizeof(chromosome.occurrence)); + memset(chromosome.gapCoverage, 0, sizeof(chromosome.gapCoverage)); + memset(chromosome.gapCount, 0, sizeof(chromosome.gapCount)); + chromosome.length = 0; + chromosome.numberOfSites = 0; + //memset(opticalMap, 0, sizeof(opticalMap)); + //memset(distancePair, 0, sizeof(distancePair)); + distancePairCount = 0; +} +void correctOverlapVariant(){ +// delete opticalMap; +// delete distancePair; +// variantType *tempVariant = new variantType[500000]; +// variantType *tempRealVariant = new variantType[500000]; + vector tempVariant; + tempVariant.resize(variant.size()+10000); + vector tempRealVariant; + tempRealVariant.resize(variant.size()+10000); + LL tempCnt = 0, tempRealCnt = 0; + // Consider Segmental Only + for (LL i=0; i=tempRealVariant[sRep+1].ratio?sRep:sRep+1; + LL chooseY = tempRealVariant[sRep].ratio= tempRealVariant[tempIndex].end) || \ + (tempRealVariant[tempIndex].start <= tempRealVariant[tempIndex+1].start && tempRealVariant[tempIndex].end >= tempRealVariant[tempIndex+1].end) + )) + tempIndex++; + tempEnd = tempIndex; + LL tempMaxLength = -1, tempMaxLengthIndex = -1; + LL tempDelCnt = 0, tempInsCnt = 0, tempHomoCnt = 0, tempHeterCnt = 0; + +////////////////////////S1:find the maximum length//////////////////////////////// +/* + // Find non-signal variants first + for (LL i=tempStart; i<=tempEnd; i++){ + if (!tempRealVariant[i].isSignal && tempRealVariant[i].end - tempRealVariant[i].start > tempMaxLength){ + tempMaxLength = tempRealVariant[i].end - tempRealVariant[i].start; + tempMaxLengthIndex = i; + } + } + // If can't find, find others + if (tempMaxLengthIndex!=-1) tempVariant[tempCnt++] = tempRealVariant[tempMaxLengthIndex]; +*/ +////////////////////////S2:find the cases without completely covering any others/////////////////////////////////////////////// + for (LL i=tempStart; i<=tempEnd; i++) + { + if (tempRealVariant[i].isSignal)continue; + bool flag = true; + for (LL j = tempStart; j <=tempEnd; j++) + { + if (i==j)continue; + if (tempRealVariant[i].end >= tempRealVariant[j].end && tempRealVariant[i].start <= tempRealVariant[j].start) + { + flag = false; + break; + } + } + if (flag) tempVariant[tempCnt++] = tempRealVariant[i]; + } + //tempCnt++; + tempStart = tempEnd+1; + } + + + + printf("numberOfRealVariant after filter overlapping: %lld\n", tempCnt); + + for (LL i=0; i0&&variant[i].sizeExt>0) + { + strcpy(tempH,"HI1I2"); + strcpy(tempIndel,"Insertions"); + } + else if (variant[i].size<0&&variant[i].sizeExt<0) + { + strcpy(tempH,"HD1D2"); + strcpy(tempIndel,"Deletions"); + } + else if (variant[i].size*variant[i].sizeExt<0) + { + strcpy(tempH,"HDI"); + strcpy(tempIndel,"Both"); + } + else + { + strcpy(tempH,(variant[i].isHomo?"Homozygous":"Heterozygous")); + strcpy(tempIndel,(variant[i].isDel?"Deletion":"Insertion")); + } +// if (strcmp(tempH,"Homozygous")!=0&&strcmp(tempH,"Heterozygous")!=0){ + fprintf(outputVariantResultFile, "%lld\t%lld\t%lld\t%s%s\t%lld\t%lf\t%lld\tsizeChange=%lld\t%s\t%g\t%g\t%lld\t%lld\n", variant[i].chr, variant[i].start, variant[i].end, tempIndel, variant[i].isSignal?".site":"", variant[i].sizeExt, 0.0, inputTrio, variant[i].size, tempH, variant[i].likelihood, variant[i].ratio,variant[i].support, variant[i].oppoSupp); + numberOfSupportedSV++; +// } + } +} + +int main(int argv, char *argc[]){ + printf("\n"); + setDefault(); + char SVoutputFile[1000]; + memset(SVoutputFile,0,sizeof(SVoutputFile)); + strcpy(SVoutputFile,"Detected_structual_variants"); + memset(outputFolder,0,sizeof(outputFolder)); + char chrMapFile[1000]; + int numOfChr = 24; + memset(chrMapFile,0,sizeof(chrMapFile)); + strcpy(inputAlignmentFileName,"Alden_1000_c666_1/combined_1000_c1.oma"); + strcpy(outputFileLocation,"Alden_1000_c666_1/"); + strcpy(chrMapFile,"/home/lil/OM/RefMaps_c666_1/GRCh38_25chr.condense.1000.cmap"); + if (argv == 1) + { + printf("\nThe valid parameters are described as follows:\n"); + printf("\t-inputLabel: \n\t\t Default value: %lld. The index of genome in the trio data. No effect if only investigate one genome.\n",inputTrio); + printf("\t-outputFolder: \n\t\t The path of the folder to store the output fils. This folder must be exist!\n"); + printf("\t-SVoutputFile: \n\t\t Default value: %s. The file name of SVs (.osv).\n",SVoutputFile); + printf("\t-chrMapFile: \n\t\t Default value: %s. The file name of the reference map(.cmap).\n",chrMapFile); + printf("\t-optAlignFile: \n\t\t Default value: %s. The file name of the alignment map file(.oma).\n",inputAlignmentFileName); + printf("\t-optTempFolder: \n\t\t Default value: %s. The folder to store the processed alignment maps by chromosomes.\n",outputFileLocation); + printf("\t-likelihoodRatioCutOff: \n\t\t Default value: %g. The cutoff of the likelihood ratio for SV all hypothesis. The default value changes along with the experiment data.\n",likelihoodRatioCutOff); + printf("\t-numberOfSupportIndelMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment being called SVs. The default value changes along with the experiment data.\n",numberOfSupportIndelMolecule); + printf("\t-numberOfSupportSignalMolecule: \n\t\t Default value: %lld. The minimum coverage of a segment to call signal variations. The default value changes along with the experiment data.\n",numberOfSupportSignalMolecule); + printf("\t-minIndelSize: \n\t\t Default value (b): %g. The minimum length of a segment to call SVs.\n",minIndelSize); + printf("\t-minIndelRatio: \n\t\t Default value: %g. The length proportion of a minimum SV could be detected on a segment. E.g. segment = 10000b, then the length of the minimum SV should be larger than 10000*0.05=500b.\n",minIndelRatio); + printf("\t-resolutionLimit: \n\t\t Default value: %g. The minimum length of a segment to call signal variations.\n",resolutionLimit); + printf("\t-digestionRate: \n\t\t Default value: %g. The digestion rate of labels (signals) measured in the experiment.\n",digestionRate); + printf("\t-falseCutRate: \n\t\t Default value: %g. The rate of false cut of a non-label position.\n",falseCutRate); + printf("\t-pValueCutOff: \n\t\t Default value: %g. The cutoff of p-value when call signal variations.\n",pValueCutOff); + printf("\t-cauchyMean: \n\t\t Default value: %g. The mean value of cauchy distribution of null hypothesis when calling SVs. Reset a new value only if you have good reason.\n",cauchyMean); + printf("\t-cauchyScale: \n\t\t Default value: %g. The parameter to calculate cauchy distribution. Reset a new value only if you have good reason.\n",cauchyScale); + printf("\t-confidenceLimit: \n\t\t Default value: %g. The lowest alignment confidence for molecules (optical maps) to call SVs or signal variations.\n",confidenceLimit); + printf("\t-numberOfChromosome: \n\t\t Default value: %d. The first n chromosomes to detect SVs.\n",numOfChr); + return -1; + } + bool paraWrongFlag = false; + for (int i = 1; i < argv; i=i+2) + { + string temp(argc[i]); + //printf("What is the parameter?? %s \n",temp.c_str()); + if (temp.compare("-inputLabel")==0) + inputTrio = atol(argc[i+1]); + else if (temp.compare("-numberOfChromosome")==0) + numOfChr = atoi(argc[i+1]); + else if (temp.compare("-optAlignFile")==0) + { + memset(inputAlignmentFileName,0,sizeof(inputAlignmentFileName)); + strcpy(inputAlignmentFileName, argc[i+1]); + } + else if (temp.compare("-optTempFolder")==0) + { + memset(outputFileLocation,0,sizeof(outputFileLocation)); + strcpy(outputFileLocation, argc[i+1]); + } + else if (temp.compare("-SVoutputFile")==0) + { + memset(SVoutputFile,0,sizeof(SVoutputFile)); + strcpy(SVoutputFile,argc[i+1]); + } + else if (temp.compare("-chrMapFile")==0) + { + memset(chrMapFile,0,sizeof(chrMapFile)); + strcpy(chrMapFile,argc[i+1]); + } + else if (temp.compare("-outputFolder")==0) + strcpy(outputFolder,argc[i+1]); + else if (temp.compare("-likelihoodRatioCutOff")==0) + { likelihoodRatioCutOff = atof(argc[i+1]); + if (likelihoodRatioCutOff < 0) + { + printf("Error! An negative likelihoodRatioCutOff is inputted!\n"); + paraWrongFlag = true; + } + } + else if (temp.compare("-numberOfSupportIndelMolecule")==0) + { + numberOfSupportIndelMolecule = atoi(argc[i+1]); + if (numberOfSupportIndelMolecule < 0) + { + printf("Error! An negative numberOfSupportIndelMolecule is inputted!\n"); + paraWrongFlag = true; + } + if (numberOfSupportIndelMolecule >= 1000) + { + printf("Warning! An improper large (>=1000) numberOfSupportIndelMolecule is inputted!\n"); + } + } + else if (temp.compare("-minIndelSize")==0) + { + minIndelSize = atof(argc[i+1]); + if (minIndelSize < 1) + { + printf("Error! Please make sure the minIndelSize is a positive integer!\n"); + paraWrongFlag = true; + } + if (minIndelSize >= 100000) + { + printf("Warning! An improper large number (>=100000b) Of minIndelSize is inputted!\n"); + } + } + else if (temp.compare("-minIndelRatio")==0) + { + minIndelRatio = atof(argc[i+1]); + if (minIndelRatio <= 0||minIndelRatio >= 1) + { + printf("Error! An improper number Of minIndelRatio is inputted, should be in between 0 and 1!\n"); + paraWrongFlag = true; + } + } + else if (temp.compare("-resolutionLimit")==0) + { + resolutionLimit = atof(argc[i+1]); + if (resolutionLimit < 0) + { + printf("Error! An negative number Of resolutionLimit is inputted!\n"); + paraWrongFlag = true; + } + if (resolutionLimit >= 10000) + { + printf("Warning! An improper large number (>=10000) of resolutionLimit is inputted!\n"); + } + } + else if (temp.compare("-digestionRate")==0) + { + digestionRate = atof(argc[i+1]); + if (digestionRate <= 0||digestionRate>1) + { + printf("Error! An improper digestionRate is inputted, should be in between 0 and 1!\n"); + paraWrongFlag = true; + } + else if (digestionRate < 0.5) + { + printf("Warning! An very low level of digestionRate (<0.5) is adapted!\n"); + } + } + else if (temp.compare("-falseCutRate")==0) + { + falseCutRate = atof(argc[i+1]); + if (falseCutRate < 0||falseCutRate>=1) + { + printf("Error! An improper falseCutRate is inputted, should be in between 0 and 1!\n"); + paraWrongFlag = true; + } + else if (falseCutRate > 0.8) + printf("Warning! An very large (>0.8) falseCutRate is inputted!\n"); + } + else if (temp.compare("-pValueCutOff")==0) + { + pValueCutOff = atof(argc[i+1]); + if (pValueCutOff <= 0||pValueCutOff>=1) + { + printf("Error! An improper pValueCutOff is inputted, should be in between 0 and 1!\n"); + paraWrongFlag = true; + } + else if (pValueCutOff > 0.1) + { + printf("Warning! An non-significant (>0.1) pValueCutOff is inputted!\n"); + } + } + else if (temp.compare("-cauchyMean")==0) + { + cauchyMean = atof(argc[i+1]); + if (cauchyMean < 0.8||cauchyMean > 1.2) + { + printf("Error! cauchyMean should be around 1!\n"); + paraWrongFlag = true; + } + else + printf("You have changed the value of cauchyMean to be %g, please make sure it is your intention!\n",cauchyMean); + } + else if (temp.compare("-cauchyScale")==0) + { + cauchyScale = atof(argc[i+1]); + if (cauchyScale < 0) + { + printf("Warning! An negative cauchyScale is inputted!\n"); + paraWrongFlag = true; + } + else + printf("You have changed the value of cauchyScale to be %g, please make sure it is your intention!\n",cauchyScale); + } + else if (temp.compare("-confidenceLimit")==0) + { + confidenceLimit = atof(argc[i+1]); + if (cauchyScale < 0) + { + printf("Error! An negative confidenceLimit is inputted!\n"); + paraWrongFlag = true; + } + } + else if (temp.compare("-numberOfSupportSignalMolecule")==0) + { + numberOfSupportSignalMolecule = atol(argc[i+1]); + if (numberOfSupportSignalMolecule < 0) + { + printf("Error! An negative numberOfSupportSignalMolecule is inputted!\n"); + paraWrongFlag = true; + } + if (numberOfSupportSignalMolecule > 1000) + { + printf("Warning! An very large (>1000) numberOfSupportSignalMolecule is inputted!\n"); + } + } + else + { + printf("No such parameter or wrong : %s\n",argc[i]); + paraWrongFlag = true; + } + } + if (paraWrongFlag) + { + printf("\nThe format to arrange parameters is:\n\t./makeRefine_en -param_1 val_1 -param_2 val_2 ... -param_n val_n\n"); + printf("\nPlease check your input parameters!\n\n"); + return -1; + } + + init(); + NCRcalculation(); + // findResolutionDistribution(); return 0; + initResultFullList(SVoutputFile); + getChromosomeList(chrMapFile); + initStatData(); + //new content + bool canOF = true; + numOfChr = 24; + for (LL tempChr = 0; tempChr < (LL)listOfChromosome.size(); tempChr++){ + LL chrID = listOfChromosome[tempChr]; + if (chrID > numOfChr) continue; + char buff[200]; + char nameOF[1000]; + strcpy(nameOF, outputFileLocation); + sprintf(buff, "%lld_%d",inputTrio,chrID); + strcat(nameOF, buff); + strcat(nameOF, ".bmap"); + if ((inputOptAlign = fopen(nameOF, "r")) == NULL) + { + canOF = false; + break; + } + } + if (!canOF) + { + readSourceFile(); + puts("DONE readSourceFile"); + addSplitedMap(); + puts("DONE addSplitedMap"); + outputToBillMapDestinationFile(); + } + //readSourceFile(); + //puts("DONE readSourceFile"); + //addSplitedMap(); + //puts("DONE addSplitedMap"); + //outputToBillMapDestinationFile(numOfChr); + //content end + for (LL tempChrId = 0; tempChrId < (LL)listOfChromosome.size(); tempChrId++){ + chrId = listOfChromosome[tempChrId]; + //if (chrId > numOfChr) continue; + // for (chrId = 12; chrId <= 12; chrId++){ + canOpenFile = true; + initData(); + // printf("Doing %d\n", chrId); + readChromosomeInfo(chrId,chrMapFile); + // printf("Done Read Chromosome\n"); + readOpticalAlign(chrId,outputFileLocation); + if (!canOpenFile) + { + printf("Open File fails?\n"); + continue; + } + // printf("Done Read Optical\n"); + //printDistancePair("test_output/Distance_before_parsing"); + printf("Distance Pair Count-1: %lld\n", distancePairCount); + parseHitEnum(); +// printf("Done parsehitEnum\n"); + printf("Distance Pair Count-2: %lld\n", distancePairCount); + completePairs(); + // printf("pair count: %lld\n", distancePairCount); + //printDistancePair("test_output/Distance_after_parsing"); + detectByDistancePair(); + //printDistancePair("test_output/Distance_after_detection"); + printf("Distance Pair Count-3: %lld\n", distancePairCount); +// printf("Done detectByDistancePair\n"); + +// processDistancePair(); +// printf("Done processDistancePair\n"); + //printDistancePair("test_output/Distance_after_processing"); +// printf("Distance Pair Count-4: %lld\n", distancePairCount); +// printf("Number of the sites: %lld\n",chromosome.numberOfSites); + deletionTest(chrId); +// printf("Done Deletion\n"); + insertionTest(chrId); + // printf("Done Insertion\n"); + } + correctOverlapVariant(); + outputVariantResult(argv,argc); +// printf("Number of No support SV: %lld\n", numberOfNoSupportSV); + printf("Number of support SV: %lld\n", numberOfSupportedSV); + printf("\n"); + return 0; +} From 71c65d25abb9529e177ae1d4b382e2a08f47cd31 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 7 Sep 2017 13:51:22 +0800 Subject: [PATCH 08/13] Update callSV.sh --- callSV.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/callSV.sh b/callSV.sh index d97d2b1..478aac6 100644 --- a/callSV.sh +++ b/callSV.sh @@ -23,10 +23,10 @@ rm -r -f temp fi echo "2. Start to call medium-size inversions" -if java -jar OMTools-master/OMTools.jar >/dev/null 2>&1 +if java -jar */OMTools.jar >/dev/null 2>&1 then - java -jar OMTools/OMTools.jar SVDetection --refmapin hg38_r.cmap --optresin data/NA12878_700bp_hg38_combRefOMB2.oma --mininvsig 4 --svout 12878Med_inv.osv --flanksig 0 --deg 0 -svmode 2 -minsupport 10 && sed -i '/.site/d' 12878Med_inv.osv + java -jar */OMTools.jar SVDetection --refmapin hg38_r.cmap --optresin data/NA12878_700bp_hg38_combRefOMB2.oma --mininvsig 4 --svout 12878Med_inv.osv --flanksig 0 --deg 0 -svmode 2 -minsupport 10 && sed -i '/.site/d' 12878Med_inv.osv else echo "Medium-size inversion caller requires java and OMTools but not installed. Skip this step." fi From 63f98b154c3da2b0628e57254957445b672ae083 Mon Sep 17 00:00:00 2001 From: Le LI Date: Mon, 11 Sep 2017 17:07:16 +0800 Subject: [PATCH 09/13] Update OMSV_v4.cpp --- source/OMSV_v4.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/OMSV_v4.cpp b/source/OMSV_v4.cpp index 30ffe89..34dcf5d 100644 --- a/source/OMSV_v4.cpp +++ b/source/OMSV_v4.cpp @@ -57,7 +57,7 @@ struct opticalMapType{ double score; double confidence; // char hitEnum[1000]; - char hitEnum[1000]; + char hitEnum[2000]; double fpr=0;//// double fnr=0;//// double alignRate=0;//// @@ -494,7 +494,7 @@ struct optAlignType{ LL numberOfSites; double score; double confidence; - char hitEnum[1200]; + char hitEnum[2000]; // position = distance[i+1] - distance[i] int position[2500]; From a009f016cf2a529d0db9a532dbadfafb47da223f Mon Sep 17 00:00:00 2001 From: Le LI Date: Tue, 24 Oct 2017 13:41:58 +0800 Subject: [PATCH 10/13] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7ffccde --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Le LI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From f70b34b690e965073151c4de19223393e8747e93 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 4 Jan 2018 13:20:51 +0800 Subject: [PATCH 11/13] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2c96354..66e0c9d 100644 --- a/README.md +++ b/README.md @@ -74,4 +74,4 @@ callSV.sh provide the examples of calling the OMSV callers (Matlab is required t ================================================================================= ## Citation: -> Le Li, Tsz-Piu Kwok, Alden King-Yung Leung, et.al. **OMSV enables accurate and comprehensive identification of large structural variations from nanochannel-based single-molecule optical maps**. *Manuscript under revision, 2017*. +> Le Li, Tsz-Piu Kwok, Alden King-Yung Leung, et.al. **OMSV enables accurate and comprehensive identification of large structural variations from nanochannel-based single-molecule optical maps**. *Genome biology, 2017, 18(1): 230*. From 8fc87878cdc601b462b4c9ec4ae1b889587d1294 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 31 May 2018 16:56:23 +0800 Subject: [PATCH 12/13] Update README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 66e0c9d..bdd6f56 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,12 @@ +# Updates in May31, 2018: +================================================================================= +Added a low-coverage mode in the pipeline to support the cases that the input alignment is low-coverage (e.g. <10x for the alignment of consensus maps to the reference). + +To trigger this mode, please set ++ "likelihoodRatioCutOff" as 0, ++ "numberOfSupportIndelMolecule" as 1. + +================================================================================= # OMSV TUTORIAL ================================================================================= ## Overall From fe641bd3fb088f4430094aeb97b18d7e87df5ba8 Mon Sep 17 00:00:00 2001 From: Le LI Date: Thu, 31 May 2018 16:59:37 +0800 Subject: [PATCH 13/13] Update OMSV_v4.cpp --- source/OMSV_v4.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/OMSV_v4.cpp b/source/OMSV_v4.cpp index 34dcf5d..157e46d 100644 --- a/source/OMSV_v4.cpp +++ b/source/OMSV_v4.cpp @@ -920,7 +920,7 @@ void advanceLikelihoodDistanceCalculation(LL start, LL end){ sampleMean += tempDistanceDoubleNoOutlier[i]; sampleMean /= cntNoOutlier; // Change sampleMean to sampleMedian - sampleMean = (cntNoOutlier%2==1)?(tempDistanceDoubleNoOutlier[cntNoOutlier/2]+tempDistanceDoubleNoOutlier[(cntNoOutlier/2)+1])/2:tempDistanceDoubleNoOutlier[cntNoOutlier/2]; + sampleMean = (cntNoOutlier%2!=1)?(tempDistanceDoubleNoOutlier[cntNoOutlier/2]+tempDistanceDoubleNoOutlier[(cntNoOutlier/2)+1])/2:tempDistanceDoubleNoOutlier[cntNoOutlier/2]; // Null hypothesis for (LL i=0; i