diff --git a/Calculator.cpp b/Calculator.cpp new file mode 100644 index 0000000..3f34feb --- /dev/null +++ b/Calculator.cpp @@ -0,0 +1,70 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include "Calculator.h" +#include +#include +#include + +using namespace std; + + +double CalcVariance(vector& v) { + double s; + double m; + int i; + s=0; + for (i=0;i& v) { + double s; + s=0; + for (int i=0;i& v) { + double s; + s=0; + for (int i=0;ir); +} diff --git a/Calculator.h b/Calculator.h new file mode 100644 index 0000000..5a2262f --- /dev/null +++ b/Calculator.h @@ -0,0 +1,13 @@ +#ifndef __CALCULATOR__ +#define __CALCULATOR__ + +#include +#include + +double CalcVariance(std::vector&); +double CalcMean(std::vector&); +bool doubleEqual(double, double, double); +bool doubleLess(double, double, double, bool); +bool doubleGreater(double, double, double, bool); + +#endif diff --git a/ExclusionListProcessor.cpp b/ExclusionListProcessor.cpp new file mode 100644 index 0000000..b4f984c --- /dev/null +++ b/ExclusionListProcessor.cpp @@ -0,0 +1,28 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include "ExclusionListProcessor.h" +#include +#include +#include + +using namespace std; + +void ReadExclusionsFile(string f, map& m) { + string line, id; + ifstream inFile(f.c_str()); + if (inFile.is_open()) { + while (inFile.good()) { + getline (inFile, line); + if (!line.empty()) { + m[line] = 0; + } + } + inFile.close(); + } + else { + cout << "Unable to open " << f << " for reading" << endl; + exit(EXIT_FAILURE); + } +} diff --git a/ExclusionListProcessor.h b/ExclusionListProcessor.h new file mode 100644 index 0000000..b05ba60 --- /dev/null +++ b/ExclusionListProcessor.h @@ -0,0 +1,9 @@ +#ifndef __ExclusionListProcessor__ +#define __ExclusionListProcessor__ + +#include +#include + +void ReadExclusionsFile(std::string, std::map&); + +#endif diff --git a/FileWriter.cpp b/FileWriter.cpp new file mode 100644 index 0000000..463c0e4 --- /dev/null +++ b/FileWriter.cpp @@ -0,0 +1,73 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include "FileWriter.h" +#include +#include +#include + +using namespace std; + +void OutputSolution(vector& s, map& m, vector& n, string& o, int& g) { + ofstream outFile; + outFile.open(o.c_str(), ios_base::out); + if (outFile.is_open()) { + int index = 0; + //int group = 0; + int offset = 0; + outFile << "Group\tID\tPhenotype\n"; + for (int i = 0; i < n.size(); ++i) { + //group++; + while (index < n[i] + offset) { + outFile << g << "\t" << s[index] << "\t" << m[s[index]] << endl; + index++; + } + offset = offset + index; + g++; + } + outFile.close(); + } + else { + cout << "Can not open " << o << " for writing" << endl; + exit(EXIT_FAILURE); + } +} + +void OutputIndependentSolution(vector& s, map& m, vector& n, string& o, int& g) { + + ofstream outFile; + + if (g == 1) { + outFile.open(o.c_str(), ios_base::out); + if (outFile.is_open()) { + outFile << "Group\tID\tPhenotype\n"; + for (int i = 0; i < n.size(); ++i) { + for (int j = 0; j < n[i]; ++j) { + outFile << g << "\t" << s[j] << "\t" << m[s[j]] << endl; + } + } + outFile.close(); + } + else { + cout << "Can not open " << o << " for writing" << endl; + exit(EXIT_FAILURE); + } + } + else { + outFile.open(o.c_str(), ios_base::app); + if (outFile.is_open()) { + for (int i = 0; i < n.size(); ++i) { + for (int j = 0; j < n[i]; ++j) { + outFile << g << "\t" << s[j] << "\t" << m[s[j]] << endl; + } + } + outFile.close(); + } + else { + cout << "Can not open " << o << " for appending" << endl; + exit(EXIT_FAILURE); + } + } +} + diff --git a/FileWriter.h b/FileWriter.h new file mode 100644 index 0000000..76dcc8d --- /dev/null +++ b/FileWriter.h @@ -0,0 +1,11 @@ +#ifndef __FILEWRITER__ +#define __FILEWRITER__ + +#include +#include +#include + +void OutputSolution(std::vector&, std::map&, std::vector&, std::string&, int&); +void OutputIndependentSolution(std::vector&, std::map&, std::vector&, std::string&, int&); + +#endif diff --git a/GA.cpp b/GA.cpp new file mode 100644 index 0000000..49664a1 --- /dev/null +++ b/GA.cpp @@ -0,0 +1,319 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include "GA.h" +#include "StringOperations.h" +#include "Calculator.h" +#include +#include +#include +#include +#include +#include + +using namespace std; + +void InitializeSolutions(int n, vector& s, vector >& p, map& e) { + // solutions will be a hash containing arrays of subject IDs. + // We will score these later by referencing ID to phenotype value + // Note that N in each solution will be the total N across the groups in which to find specific distributions for. + // We will break each solution up into the relevant subsets for scoring and returning. + map idsSelected; + int randomIndex; + for (int i = 0; i < p.size(); ++i) { + for (int j = 0; j < n; ++j) { + do { + randomIndex = rand() % s.size(); // 0 to vector-size -1 + } while (!((idsSelected.find(s[randomIndex]) == idsSelected.end()) && (e.find(s[randomIndex]) == e.end()))); + p[i].push_back(s[randomIndex]); + idsSelected[s[randomIndex]] = 0; + } + idsSelected.clear(); + } +} + + +double ScoreSolution(vector& c, map& p, vector& n, vector& m, vector& s) { + vector phenotype; + int index = 0; + double score = 0; + for (int i = 0; i < n.size(); ++i) { + int count = 0; + while (count < n[i]) { + phenotype.push_back(p[c[index]]); + index++; + count++; + } + double mean = CalcMean(phenotype); + double sd = sqrt(CalcVariance(phenotype)); + if (fabs(m[i] - mean) >= 0) { + score += fabs(m[i] - mean) * 2; + } + if (fabs(s[i] - sd) >= 0) { + score += fabs(s[i] - sd); + } + phenotype.clear(); + } + return score; +} + + +void DetermineBestSolution(std::vector& v, double& s, int& x) { + double bestScore = 999999999; + int bestScoreIndex; + for (int i = 0; i < v.size(); ++i) { + if (v[i] < bestScore) { + bestScore = v[i]; + bestScoreIndex = i; + } + } + s = bestScore; + x = bestScoreIndex; +} + + +bool TestBestSolution(vector& c, map& p, vector& n, vector& m, vector& s, vector& mp, vector& sp) { + vector values; + int index = 0; + bool solutionFound; + for (int i = 0; i < n.size(); ++i) { + int count = 0; + while (count < n[i]) { + values.push_back(p[c[index]]); + index++; + count++; + } + + double mean = roundf(CalcMean(values) * pow(10,mp[i])) / pow(10,mp[i]); + double sd = roundf(sqrt(CalcVariance(values)) * pow(10,sp[i])) / pow(10,sp[i]); + if (doubleEqual(mean, m[i], 0.00001) && doubleEqual(sd, s[i], 0.00001)) { + solutionFound = true; + } + else { + solutionFound = false; + break; + } + values.clear(); + } + return solutionFound; +} + + +void GenerateRouletteWheel(vector& r, vector& s) { + vectors2; + vector::iterator it = max_element(s.begin(), s.end()); + double worstScore = *it; + double reqSum = 0; + double prevProb = 0; + for (int i = 0; i < s.size(); ++i) { + s2.push_back(worstScore-s[i]+1); + reqSum += (worstScore-s[i]+1); + } + for (int i = 0; i < s2.size(); ++i) { + r.push_back(prevProb + (s2[i] / reqSum)); + prevProb += s2[i] / reqSum; + } +} + + +int SpinWheel(vector& r) { + int i; + double randomNumber = rand() / (double)RAND_MAX; + for (i = 0; i < r.size(); ++i) { + if (randomNumber < r[i]) { + break; + } + } + return i; +} + + +void DetermineWorstSolution(vector& v, double& s, int& x) { + double worstScore = 0; + int worstScoreIndex; + for (int i = 0; i < v.size(); ++i) { + if (v[i] > worstScore) { + worstScore = v[i]; + worstScoreIndex = i; + } + } + s = worstScore; + x = worstScoreIndex; +} + + +void CrossoverPMX(vector >& p, vector& r, vector& s, vector >& chrXY, map& chrXAlleles, map& chrYAlleles) { + + // define crossover points + int xoverA = rand() % p[r[0]].size(); + int xoverB = rand() % p[r[0]].size(); + if (xoverA > xoverB) { + int t = xoverA; + xoverA = xoverB; + xoverB = t; + } + + // Record elements in crossover region in maps: id -> index and map IDs across chromosomes within crossover region + map chrXxOverAlleleMap; + map chrYxOverAlleleMap; + map::iterator it; + + for (int i = xoverA; i <= xoverB; ++i) { + chrXxOverAlleleMap[p[r[1]][i]] = p[r[0]][i]; + chrYxOverAlleleMap[p[r[0]][i]] = p[r[1]][i]; + } + + // Perform the PMX crossover + for (int i = 0; i < p[r[0]].size(); ++i) { + + if (i < xoverA || i > xoverB) { + + // if element not in chromosomal crossover region associated with orginal chr Y + it = chrXxOverAlleleMap.find(p[r[0]][i]); + if (it == chrXxOverAlleleMap.end()) { + chrXY[0].push_back(p[r[0]][i]); + chrXAlleles[p[r[0]][i]] = 0; + } + else { + string newAllele = chrXxOverAlleleMap[p[r[0]][i]]; + it = chrXxOverAlleleMap.find(newAllele); + while (it != chrXxOverAlleleMap.end()) { + newAllele = chrXxOverAlleleMap[newAllele]; + it = chrXxOverAlleleMap.find(newAllele); + } + chrXY[0].push_back(newAllele); + chrXAlleles[newAllele] = 0; + } + + // if element not in chromosomal crossover region associated with orginal chr X + it = chrYxOverAlleleMap.find(p[r[1]][i]); + if (it == chrYxOverAlleleMap.end()) { + chrXY[1].push_back(p[r[1]][i]); + chrYAlleles[p[r[1]][i]] = 0; + } + else { + string newAllele = chrYxOverAlleleMap[p[r[1]][i]]; + it = chrYxOverAlleleMap.find(newAllele); + while (it != chrYxOverAlleleMap.end()) { + newAllele = chrYxOverAlleleMap[newAllele]; + it = chrYxOverAlleleMap.find(newAllele); + } + chrXY[1].push_back(newAllele); + chrYAlleles[newAllele] = 0; + } + } + else { + //CROSSOVER REGION + chrXY[0].push_back(p[r[1]][i]); + chrXY[1].push_back(p[r[0]][i]); + chrXAlleles[p[r[1]][i]] = 0; + chrYAlleles[p[r[0]][i]] = 0; + } + } +} + + + +void Mutate(vector >& chrXY, vector& n, double m, vector& s, map& p, map& chrXAlleles, map& chrYAlleles, map& e) { + + double min = 1/(double)chrXY[0].size(); + if (min <= m) { + + int mutationIndex1; + string mutationIndex1Allele; + int mutationIndex2; + string mutationIndex2Allele; + string newAllele; + map::iterator it; + + double random = min + ((double)rand() / RAND_MAX) * (m - min); + int mutations = round((double)chrXY[0].size() * random); + + // for each mutation to be performed // mutate each chromosome in turn + for (int j = 0; j < mutations; ++j) { + + // CHROMOSOME X (0) + // pick indicies at random in which alleles are to be swapper + mutationIndex1 = rand() % chrXY[0].size(); + mutationIndex1Allele = chrXY[0][mutationIndex1]; + // pick and allele at random to place into this from the full set of IDs. This will help get round local maxima + do { + mutationIndex2 = rand() % s.size(); + mutationIndex2Allele = s[mutationIndex2]; + } while (e.find(mutationIndex2Allele) != e.end()); + + // Determine if this is a new allele in the chromosome or whether this will need to be a swap if in chromosome + it = chrXAlleles.find(mutationIndex2Allele); + // if not found then insert new allele into preselected mutation site - this should be the default first check + if (it == chrXAlleles.end()) { + chrXY[0][mutationIndex1] = mutationIndex2Allele; + chrXAlleles.erase(mutationIndex1Allele); + chrXAlleles[mutationIndex2Allele] = 0; + + } + // else mutate by performing simple swap but dont put contraint on index range to chose from as below + else { + mutationIndex2 = rand() % chrXY[0].size(); + mutationIndex2Allele = chrXY[0][mutationIndex2]; + chrXY[0][mutationIndex1] = mutationIndex2Allele; + chrXY[0][mutationIndex2] = mutationIndex1Allele; + } + + // CHROMOSOME Y (1) + // pick indicies at random in which alleles are to be swapper + mutationIndex1 = rand() % chrXY[1].size(); + mutationIndex1Allele = chrXY[1][mutationIndex1]; + // pick and allele at random to place into this from the full set of IDs. This will help get round local maxima + do { + mutationIndex2 = rand() % s.size(); + mutationIndex2Allele = s[mutationIndex2]; + } while (e.find(mutationIndex2Allele) != e.end()); + + // Determine if this is a new allele in the chromosome or whether this will need to be a swap if in chromosome + it = chrYAlleles.find(mutationIndex2Allele); + // if not found then insert new allele into preselected mutation site + if (it == chrYAlleles.end()) { + chrXY[1][mutationIndex1] = mutationIndex2Allele; + chrYAlleles.erase(mutationIndex1Allele); + chrYAlleles[mutationIndex2Allele] = 0; + + } + // else mutate similar to below performing simple swap but dont put contraint on index range to chose from as below + else { + mutationIndex2 = rand() % chrXY[1].size(); + mutationIndex2Allele = chrXY[1][mutationIndex2]; + chrXY[1][mutationIndex1] = mutationIndex2Allele; + chrXY[1][mutationIndex2] = mutationIndex1Allele; + } + } + } + else { + cout << "Cannot mutate at user rate - not enough samples in chromosome" << endl; + } +} + + +void DetermineX2LeastFit(vector& c, std::vector s) { + int index; + vector::iterator it; + it = max_element(s.begin(), s.end()); + it = find(s.begin(), s.end(), *it); + index = distance(s.begin(), it); + c.push_back(index); + s[index] = -1; + it = max_element(s.begin(), s.end()); + it = find(s.begin(), s.end(), *it); + index = distance(s.begin(), it); + c.push_back(index); +} + + + + + + + + + + diff --git a/GA.h b/GA.h new file mode 100644 index 0000000..eeee48f --- /dev/null +++ b/GA.h @@ -0,0 +1,19 @@ +#ifndef __GA__ +#define __GA__ + +#include +#include +#include + +void InitializeSolutions(int, std::vector&, std::vector >&, std::map& ); +double ScoreSolution(std::vector&, std::map&, std::vector&, std::vector&, std::vector&); +void DetermineBestSolution(std::vector&, double&, int&); +bool TestBestSolution(std::vector&, std::map&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&); +void GenerateRouletteWheel(std::vector&, std::vector&); +int SpinWheel(std::vector&); +void DetermineWorstSolution(std::vector&, double&, int&); +void CrossoverPMX(std::vector >&, std::vector&, std::vector&, std::vector >&, std::map&, std::map&) ; +void Mutate(std::vector >&, std::vector&, double, std::vector&, std::map&, std::map&, std::map&, std::map&); +void DetermineX2LeastFit(std::vector&, std::vector); + +#endif diff --git a/InputParser.cpp b/InputParser.cpp new file mode 100644 index 0000000..3ae0beb --- /dev/null +++ b/InputParser.cpp @@ -0,0 +1,196 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include +#include +#include +#include +#include "InputParser.h" +#include "StringOperations.h" + + +using namespace std; + +void ShowHeader() { + cout << endl << "==================== GAGS ===============" << endl; + cout << " Genetic Algorithm Group Selection " << endl; + cout << " Written by Andrew R Wood " << endl; + cout << " a.r.wood@exeter.ac.uk " << endl; + cout << " University of Exeter Medical School " << endl; + cout << " Version 0.1: 20-09-2018 " << endl; + cout << "=======================================" << endl; + cout << endl; +} + +void ShowOptions() { + cout << "Usage:" << endl; + cout << " --pheno -p [phenotype file]" << endl; + cout << " --ns -n [list of group sizes]" << endl; + cout << " --means -m [list of respective means]" << endl; + cout << " --sds -s [list od respective SDs]" << endl; + cout << " --seed -r [seed - default 10000]" << endl; + cout << " --mrate -y [max mutation rate - default 0.01]" << endl; + cout << " --popsize -z [initial solutions - default 10]" << endl; + cout << " --exclusions -e [optional: file of IDs to exclude]" << endl; + cout << " --iterations -i [max iterations - default 50000]" << endl; + cout << " --out -o [output file prefix]" << endl; + cout << endl; +} + +void ParseCommand(int argc, char** argv, ARGS *a) { + + if (argc > 1) { + + // defaults + a->seed = 10000; + a->popsize = 10; + a->mRate = 0.01; + a->iterations = 50000; + a->ok = true; + + for (int i = 1; i != argc; ++i) { + if (string(argv[i]) == "--pheno" || string(argv[i]) == "-p") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->phenoString = argv[i+1]; + } + else { + cout << "Not entered --pheno (-p) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--ns" || string(argv[i]) == "-n") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->nsString = argv[i+1]; + } + else { + cout << "Not entered --ns (-n) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--means" || string(argv[i]) == "-m") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->meansString = argv[i+1]; + } + else { + cout << "Not entered --means (-m) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--sds" || string(argv[i]) == "-s") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->sdsString = argv[i+1]; + } + else { + cout << "Not entered --sds (-s) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--seed" || string(argv[i]) == "-r") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->seed = atoi(argv[i+1]); + } + else { + cout << "Not entered --seed (-r) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--mrate" || string(argv[i]) == "-y") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->mRate = atof(argv[i+1]); + } + else { + cout << "Not entered --mrate (-y) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--popsize" || string(argv[i]) == "-z") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->popsize = atoi(argv[i+1]); + } + else { + cout << "Not entered --popsize (-z) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--exclusions" || string(argv[i]) == "-e") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->exclusionsString = argv[i+1]; + } + else { + cout << "Not entered --exclusions (-e) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--iterations" || string(argv[i]) == "-i") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->iterations = atoi(argv[i+1]); + } + else { + cout << "Not entered --iterations (-i) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--out" || string(argv[i]) == "-o") { + if (i+1 <= argc-1 && string(argv[i+1]).substr(0,2) != "--") { + a->outString = argv[i+1]; + } + else { + cout << "Not entered --out (-o) argument" << endl << endl; + a->ok = false; + } + } + else if (string(argv[i]) == "--help" || string(argv[i]) == "-h") { + a->ok = false; + } + } + + if (a->phenoString.empty() || a->meansString.empty() || a->sdsString.empty() || a->outString.empty() ) { + a->ok = false; + } + + } + else { + a->ok = false; + } + +} + + +void SplitStringArgs(string& s, vector& v) { + split(s, ",", v); +} + + +void SplitDoubleArgs(string& s, vector& v) { + vector vec; + split(s, ",", vec); + for (int i = 0; i < vec.size(); ++i) { + v.push_back(stringToDouble(vec[i])); + } +} + + +void SplitIntArgs(string& s, vector& v) { + vector vec; + split(s, ",", vec); + for (int i = 0; i < vec.size(); ++i) { + v.push_back(stringToInt(vec[i])); + } +} + + +void DetermineDoubleArgsPrecision(string& s, vector& v) { + vector vec; + split(s, ",", vec); + for (int i = 0; i < vec.size(); ++i) { + if (vec[i].find(".") != -1) { + v.push_back(vec[i].size() - (vec[i].find(".")+1)); + } + else { + v.push_back(0); + } + } +} + + diff --git a/InputParser.h b/InputParser.h new file mode 100644 index 0000000..33f09c3 --- /dev/null +++ b/InputParser.h @@ -0,0 +1,27 @@ +#ifndef __INPUTPARSER__ +#define __INPUTPARSER__ + +struct ARGS +{ + std::string phenoString; + std::string nsString; + std::string meansString; + std::string sdsString; + std::string exclusionsString; + std::string outString; + int seed; + int popsize; + int iterations; + double mRate; + bool ok; +}; + +void ShowHeader(); +void ShowOptions(); +void ParseCommand(int, char**, ARGS*); +void SplitStringArgs(std::string&, std::vector&); +void SplitDoubleArgs(std::string&, std::vector&); +void SplitIntArgs(std::string&, std::vector&); +void DetermineDoubleArgsPrecision(std::string&, std::vector&); + +#endif diff --git a/Main.cpp b/Main.cpp new file mode 100644 index 0000000..365b2fc --- /dev/null +++ b/Main.cpp @@ -0,0 +1,304 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include +#include +#include +#include +#include "StringOperations.h" +#include "InputParser.h" +#include "ExclusionListProcessor.h" +#include "PhenotypeProcessor.h" +#include "GA.h" +#include "FileWriter.h" +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char** argv) { + + // 1. Show header + ShowHeader(); + + // 2. Parse command line + ARGS theseArgs; + ParseCommand(argc, argv, &theseArgs); + if(!theseArgs.ok) { + ShowOptions(); + exit(EXIT_FAILURE); + } + cout << "Arguments: " << endl; + cout << " --pheno -p " << theseArgs.phenoString << endl; + cout << " --ns -n " << theseArgs.nsString << endl; + cout << " --means -m " << theseArgs.meansString << endl; + cout << " --sd -s " << theseArgs.sdsString << endl; + cout << " --seed -r " << theseArgs.seed << endl; + cout << " --mrate -y " << theseArgs.mRate << endl; + cout << " --popsize -z " << theseArgs.popsize << endl; + cout << " --exclusions -e " << theseArgs.exclusionsString << endl; + cout << " --iterations -i " << theseArgs.iterations << endl; + cout << " --out -o " << theseArgs.outString << endl << endl; + srand(theseArgs.seed); + + // Create vectors for list arguments + vector ns; + SplitIntArgs(theseArgs.nsString, ns); + vector means; + SplitDoubleArgs(theseArgs.meansString, means); + vector sds; + SplitDoubleArgs(theseArgs.sdsString, sds); + + // Get required precision input by user + vector t1; + DetermineDoubleArgsPrecision(theseArgs.meansString, t1); + vector t2; + DetermineDoubleArgsPrecision(theseArgs.sdsString, t2); + + // read exclusions file + map exclusions; + if (theseArgs.exclusionsString.length() > 0) { + ReadExclusionsFile(theseArgs.exclusionsString, exclusions); + } + + // Load the phenotypes + cout << "Reading phenotypes..." << endl; + vector idVector; + vector phenoVector; + map phenoMap; + ReadPhenotypeFileMinusExclusions(theseArgs.phenoString, idVector, phenoVector, phenoMap, exclusions); + + // Check number of individuals required to formulate the distribution is more than the number with pheneotype + unsigned int total_n = 0; + for (unsigned int i = 0; i phenoVector.size()) { + cout << "Unable to process this data as required sample sizes for distributions larger than N in phenotype file" << endl; + } + + // At this point we want to order the Ns and adjust ordering of means and SDs in vecto + vector nsSorted; + vector meansSorted; + vector sdsSorted; + vector meansPrecision; + vector sdsPrecision; + + map jsUsed; + + nsSorted = ns; + sort(nsSorted.begin(), nsSorted.end()); + for (unsigned int i = 0; i < nsSorted.size(); ++i) { + for (unsigned int j = 0; j < ns.size(); ++j) { + if (ns[j] == nsSorted[i] && jsUsed.find(j) == jsUsed.end()) { + meansSorted.push_back(means[j]); + sdsSorted.push_back(sds[j]); + meansPrecision.push_back(t1[j]); + sdsPrecision.push_back(t2[j]); + jsUsed[j] = 0; + break; + } + } + } + ns.clear(); + means.clear(); + sds.clear(); + + // If the smallest N >= 10% of total_n then we carry on as usual + bool solveIndependently; + bool solveIndependentlyEverSet = false; + int independentGroupIndex = 0; + int smallestOffset = 0; + int nSubtract = 0; + int n; + + + do { + + if ( ((double) nsSorted[smallestOffset] / (double) (total_n-nSubtract)) > 0.1 ) { + solveIndependently = false; + n = total_n-nSubtract; + for (unsigned int i = smallestOffset; i < nsSorted.size(); ++i) { + ns.push_back(nsSorted[i]); + means.push_back(meansSorted[i]); + sds.push_back(sdsSorted[i]); + } + } + // else we need to solve as seperate problem + else { + solveIndependently = true; + solveIndependentlyEverSet = true; + n = nsSorted[smallestOffset]; + ns.push_back(nsSorted[smallestOffset]); + means.push_back(meansSorted[smallestOffset]); + sds.push_back(sdsSorted[smallestOffset]); + nSubtract += nsSorted[smallestOffset]; + smallestOffset++; + } + + + // 1. Initialise solutions + cout << "Initialising solutions..." << endl; + vector > population(theseArgs.popsize); + cout << "Exclusion list size " << exclusions.size() << endl; + InitializeSolutions(n, idVector, population, exclusions); + + // 2. Score solutions + cout << "Scoring solutions..." << endl; + vector scores(theseArgs.popsize); + //double thiscore; + for (int i = 0; i < theseArgs.popsize; ++i) { + scores[i] = ScoreSolution(population[i], phenoMap, ns, means, sds); + } + + // 3. Determine the lowest (best) score and solution with lowest score + double lowestScore; + int bestSolutionIndex; + DetermineBestSolution(scores, lowestScore, bestSolutionIndex); + cout << "Best score and solution ID presently: " << lowestScore << " " << bestSolutionIndex << endl; + + // 4. Determine if we have found a solution that is good enough by chance using the best one we have in the population + bool solutionFound; + solutionFound = TestBestSolution(population[bestSolutionIndex], phenoMap, ns, means, sds, meansPrecision, sdsPrecision); + + // 5. If solution found by chance then output it + if (solutionFound) { + cout << "Solution found" << endl; + if (!solveIndependentlyEverSet) { + independentGroupIndex++; + OutputSolution(population[bestSolutionIndex], phenoMap, ns, theseArgs.outString, independentGroupIndex); + } + else { + independentGroupIndex++; + OutputIndependentSolution(population[bestSolutionIndex], phenoMap, ns, theseArgs.outString, independentGroupIndex); + // add samples from last solution to exclusion list so we don't include them again + for (vector::iterator it = population[bestSolutionIndex].begin(); it != population[bestSolutionIndex].end(); ++it) { + exclusions[*it] = 0; + } + } + } + + + // 6. else we need tocyce up to max iterations to try and find solution + else { + + // Determine highest (worst) score and solution with worst score + bool carryOn = true; + double highestScore; + int worstSolutionIndex; + DetermineWorstSolution(scores, highestScore, worstSolutionIndex); + + cout << "Iteration 1 "; + + // Iterate through for MAX iterations + for (int i = 0; i < theseArgs.iterations; ++i) { + + int iter = (i+1)*100; + if (100 % iter == 0) { + cout << iter << " "; + } + + // [re]generate Roulette Wheel + vector rWheel; + GenerateRouletteWheel(rWheel, scores); + + // Spin the wheel x2 to select solutions + vector randomIndicies; + int ri1 = SpinWheel(rWheel); + randomIndicies.push_back(ri1); + int ri2; + do { + ri2 = SpinWheel(rWheel); + } while (ri1 == ri2); + randomIndicies.push_back(ri2); + + // Crossover + vector > chrXY(2); + map chrXAlleles; + map chrYAlleles; + CrossoverPMX(population, randomIndicies, idVector, chrXY, chrXAlleles, chrYAlleles); + + // Mutate + Mutate(chrXY, ns, theseArgs.mRate, idVector, phenoMap, chrXAlleles, chrYAlleles, exclusions); + + // Identify the worst 2 scores in the population frist + vector worst2Solutions; + DetermineX2LeastFit(worst2Solutions, scores); + + // 1. First see if either provide us with a good enough solution // + for (int j = 0; j < 2; ++j) { + + // get score for current child + double thisScore = ScoreSolution(chrXY[j], phenoMap, ns, means, sds); + + // if this score lower (i.e. better) than lowest score in the population check whether solution good enough + if (thisScore < lowestScore) { + + lowestScore = thisScore; + + // check if we have found a solution + solutionFound = TestBestSolution(chrXY[j], phenoMap, ns, means, sds, meansPrecision, sdsPrecision); + + // if it is - print out and stop + if (solutionFound) { + + cout << thisScore << endl; + cout << "Solution found! " << endl; + + if (!solveIndependentlyEverSet) { + independentGroupIndex++; + OutputSolution(chrXY[j], phenoMap, ns, theseArgs.outString, independentGroupIndex); + } + else { + independentGroupIndex++; + OutputIndependentSolution(chrXY[j], phenoMap, ns, theseArgs.outString, independentGroupIndex); + // add samples from last solution to exclusion list so we don't include them again + for (vector::iterator it = chrXY[j].begin(); it != chrXY[j].end(); ++it) { + exclusions[*it] = 0; + } + } + + carryOn = false; + break; + } + else { + // place this solution in the population and update respective score + population[worst2Solutions[j]] = chrXY[j]; + scores[worst2Solutions[j]] = thisScore; + } + } + else { + // place this solution in the population + population[worst2Solutions[j]] = chrXY[j]; + scores[worst2Solutions[j]] = thisScore; + } + } + if (!carryOn) { + break; + } + // cout << lowestScore << endl; + randomIndicies.clear(); + } + } + + // clear out the vectors + ns.clear(); + means.clear(); + sds.clear(); + population.clear(); + scores.clear(); + + if (!solutionFound) { + cout << theseArgs.iterations << endl; + cout << "Sorry - no solution not found this time." << endl; + cout << "Error score: " << lowestScore << endl; + } + + } while (solveIndependently); + + return 0; +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..25c25cc --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +.PHONY: all +all: gags + +gags: Main.cpp Calculator.cpp ExclusionListProcessor.cpp FileWriter.cpp GA.cpp InputParser.cpp PhenotypeProcessor.cpp StringOperations.cpp + g++ Main.cpp Calculator.cpp ExclusionListProcessor.cpp FileWriter.cpp GA.cpp InputParser.cpp PhenotypeProcessor.cpp StringOperations.cpp -o gags -O2 + +# creating it if necessary +.PHONY: install +install: + cp gags /usr/local/bin/ + +.PHONY: clean +clean: + rm -f gags + + diff --git a/PhenotypeProcessor.cpp b/PhenotypeProcessor.cpp new file mode 100644 index 0000000..d10d581 --- /dev/null +++ b/PhenotypeProcessor.cpp @@ -0,0 +1,39 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include "PhenotypeProcessor.h" +#include "StringOperations.h" +#include +#include +#include +#include + +using namespace std; + +void ReadPhenotypeFileMinusExclusions(string f, vector& i, vector& v, map& m, map& e) { + string line, id, pheno; + ifstream inFile(f.c_str()); + if (inFile.is_open()) { + getline (inFile, line); // skip header + while (inFile.good()) { + getline (inFile, line); + if (!line.empty()) { + istringstream iss; + iss.str(line); + iss >> id; + iss >> pheno; + if (pheno != "NA" && e.find(id) == e.end()) { + i.push_back(id); + v.push_back(stringToDouble(pheno)); + m[id] = stringToDouble(pheno); + } + } + } + inFile.close(); + } + else { + cout << "Unable to open " << f << " for reading" << endl; + exit(EXIT_FAILURE); + } +} diff --git a/PhenotypeProcessor.h b/PhenotypeProcessor.h new file mode 100644 index 0000000..3b0e892 --- /dev/null +++ b/PhenotypeProcessor.h @@ -0,0 +1,8 @@ +#ifndef __PhenotypeProcessor__ +#define __PhenotypeProcessor__ + +#include +#include +#include +void ReadPhenotypeFileMinusExclusions(std::string, std::vector&, std::vector&, std::map&, std::map&); +#endif diff --git a/StringOperations.cpp b/StringOperations.cpp new file mode 100644 index 0000000..7ccc767 --- /dev/null +++ b/StringOperations.cpp @@ -0,0 +1,93 @@ +// GAGS +// Developed by Andrew R Wood +// a.r.wood@exeter.ac.uk + +#include +#include +#include +#include +#include +#include "StringOperations.h" + +using namespace std; + + +// function to convert string to double +double stringToDouble(string s) { + double d; + stringstream ss(s); //turn the string into a stream + ss >> d; //convert + return d; +} + + +// function to convert string to int +int stringToInt(string s) { + int i; + stringstream ss(s); //turn the string into a stream + ss >> i; //convert + return i; +} + + +// function to convert int to string +string intToString(int i) { + string s; + ostringstream convert; + convert << i; + s = convert.str(); + return s; +} + + +// function to convert double to string +string doubleToString(double d) { + string s; + ostringstream convert; + convert << d; + s = convert.str(); + return s; +} + + +// function to split a string based on a delimiter +// +//void split(string& s, char c, vector& v) { +void split(string& s, string sc, vector& v) { + + string::size_type i = 0; + string::size_type j = s.find(sc); + + while (j != string::npos) { + v.push_back(s.substr(i,j-i)); + i = ++j; + j = s.find(sc,j); + if (j == string::npos) { + v.push_back(s.substr(i, s.length())); + } + } + if (v.size() == 0) { + v.push_back(s); + } +} + + +/* +// function to split a string based on a delimiter +template +void split(const basic_string& s, T c, + vector >& v) { + basic_string::size_type i = 0; + basic_string::size_type j = s.find(c); + + while (j != basic_string::npos) { + v.push_back(s.substr(i, j-i)); + i = ++j; + j = s.find(c, j); + + if (j == basic_string::npos) + v.push_back(s.substr(i, s.length())); + } +} + +*/ diff --git a/StringOperations.h b/StringOperations.h new file mode 100644 index 0000000..39bf119 --- /dev/null +++ b/StringOperations.h @@ -0,0 +1,21 @@ +#ifndef _STRINGOPERATIONS +#define _STRINGOPERATIONS + +#include +#include + +double stringToDouble(std::string); +int stringToInt(std::string); +std::string intToString(int); +std::string doubleToString(double); + +//void split(std::string&, char, std::vector&); +void split(std::string&, std::string, std::vector&); + + +//template +//void split(const std::basic_string&, T, std::vector >&) { + +#endif + +