diff --git a/README b/README index 84f943f..9690348 100644 --- a/README +++ b/README @@ -22,6 +22,8 @@ PC Sabeti, et al. (2002) Detecting recent positive selection in the human USAGE: +Data must be phased and have no missing genotypes. + To calculate EHH: ./selscan --ehh --hap --map --out @@ -82,6 +84,16 @@ selscan --xpehh --hap example2.pop1.hap --ref example2.pop2.hap --map example2.m population for XP-EHH calculations. Ignored otherwise. Default: __hapfile2 +--tped : A TPED file containing haplotype and map data. + Variants should be coded 0/1 + Default: __hapfile1 + +--tped-ref : A TPED file containing haplotype and map data. + Variants should be coded 0/1. This is the 'reference' + population for XP-EHH calculations and should contain the same number + of loci as the query population. Ignored otherwise. + Default: __hapfile2 + --out : The basename for all output files. Default: outfile @@ -114,6 +126,13 @@ selscan --xpehh --hap example2.pop1.hap --ref example2.pop2.hap --map example2.m --max-gap : Maximum allowed gap in bp between two snps. Default: 200000 +--max-extend : The maximum distance an EHH decay curve is allowed to extend from the core. + Set <= 0 for no restriction. + Default: 1000000 + +--skip-low-freq : Do not include low frequency variants in the construction of haplotypes (iHS only). + Default: false + --threads : The number of threads to spawn during the calculation. Partitions loci across threads. Default: 1 diff --git a/bin/linux/norm b/bin/linux/norm index b993125..41e2a8a 100755 Binary files a/bin/linux/norm and b/bin/linux/norm differ diff --git a/bin/linux/selscan b/bin/linux/selscan index a1e166b..190c4db 100755 Binary files a/bin/linux/selscan and b/bin/linux/selscan differ diff --git a/bin/osx/norm b/bin/osx/norm index 69ca72d..23272d9 100755 Binary files a/bin/osx/norm and b/bin/osx/norm differ diff --git a/bin/osx/selscan b/bin/osx/selscan index fe36954..7a2a747 100755 Binary files a/bin/osx/selscan and b/bin/osx/selscan differ diff --git a/bin/win/norm.exe b/bin/win/norm.exe index 45a1c5a..bf7c739 100644 Binary files a/bin/win/norm.exe and b/bin/win/norm.exe differ diff --git a/bin/win/selscan.exe b/bin/win/selscan.exe index 550a24b..5736ea9 100644 Binary files a/bin/win/selscan.exe and b/bin/win/selscan.exe differ diff --git a/src/norm.cpp b/src/norm.cpp index f9ef665..bfd7bc0 100644 --- a/src/norm.cpp +++ b/src/norm.cpp @@ -279,7 +279,7 @@ int main(int argc, char *argv[]) cerr << "Calculating mean and variance per frequency bin:\n\n"; getMeanVarBins(freq, score, totalLoci, mean, variance, n, numBins, threshold); - /* + gsl_sort(score, 1, totalLoci); double upperCutoff = gsl_stats_quantile_from_sorted_data (score, 1, totalLoci, 0.995); double lowerCutoff = gsl_stats_quantile_from_sorted_data (score, 1, totalLoci, 0.005); @@ -288,9 +288,9 @@ int main(int argc, char *argv[]) cerr << "Bottom 0.5% cutoff: " << lowerCutoff << "\n\n"; flog << "\nTop 0.5% cutoff: " << upperCutoff << endl; flog << "Bottom 0.5% cutoff: " << lowerCutoff << "\n\n"; - */ - double upperCutoff = 2; - double lowerCutoff = -2; + + //double upperCutoff = 2; + //double lowerCutoff = -2; delete [] freq; delete [] score; diff --git a/src/selscan-data.cpp b/src/selscan-data.cpp index 62cbc83..5fc6b9e 100644 --- a/src/selscan-data.cpp +++ b/src/selscan-data.cpp @@ -5,12 +5,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -75,6 +75,62 @@ MapData *readMapData(string filename, int expected_loci) return data; } +MapData *readMapDataTPED(string filename, int expected_loci, int expected_haps) +{ + ifstream fin; + cerr << "Opening " << filename << "...\n"; + fin.open(filename.c_str()); + + if (fin.fail()) + { + cerr << "ERROR: Failed to open " << filename << " for reading.\n"; + throw 0; + } + + int fileStart = fin.tellg(); + string line; + int nloci = 0; + int num_cols = 4; + int current_cols = 0; + while (getline(fin, line)) + { + nloci++; + current_cols = countFields(line); + if (current_cols != num_cols + expected_haps) + { + cerr << "ERROR: line " << nloci << " of " << filename << " has " << current_cols + << ", but expected " << num_cols + expected_haps << ".\n"; + throw 0; + } + } + + if (nloci != expected_loci) + { + cerr << "ERROR: Expected " << expected_loci << " loci in map file but found " << nloci << ".\n"; + throw 0; + } + + fin.clear(); // clear error flags + fin.seekg(fileStart); + + cerr << "Loading map data for " << nloci << " loci\n"; + + MapData *data = initMapData(nloci); + + string chr; + for (int locus = 0; locus < data->nloci; locus++) + { + fin >> data->chr; + fin >> data->locusName[locus]; + fin >> data->geneticPos[locus]; + fin >> data->physicalPos[locus]; + getline(fin, line); + } + + fin.close(); + return data; +} + //allocates the arrays and populates them with MISSING or "--" depending on type MapData *initMapData(int nloci) { @@ -168,7 +224,79 @@ HaplotypeData *readHaplotypeData(string filename) for (int locus = 0; locus < data->nloci; locus++) { fin >> data->data[hap][locus]; - if(data->data[hap][locus] != 0 && data->data[hap][locus] != 1) + if (data->data[hap][locus] != '0' && data->data[hap][locus] != '1') + { + cerr << "ERROR: Alleles must be coded 0/1 only.\n"; + throw 0; + } + } + } + + fin.close(); + + return data; +} + +HaplotypeData *readHaplotypeDataTPED(string filename) +{ + ifstream fin; + cerr << "Opening " << filename << "...\n"; + fin.open(filename.c_str()); + + if (fin.fail()) + { + cerr << "ERROR: Failed to open " << filename << " for reading.\n"; + throw 0; + } + + int numMapCols = 4; + int fileStart = fin.tellg(); + string line; + int nloci = 0; + int previous_nhaps = -1; + int current_nhaps = 0; + //Counts number of haps (cols) and number of loci (rows) + //if any lines differ, send an error message and throw an exception + while (getline(fin, line)) + { + //getline(fin,line); + //if(fin.eof()) break; + nloci++; + current_nhaps = countFields(line); + //cout << "nloci: " << current_nhaps << endl; + if (previous_nhaps < 0) + { + previous_nhaps = current_nhaps; + continue; + } + else if (previous_nhaps != current_nhaps) + { + cerr << "ERROR: line " << nloci << " of " << filename << " has " << current_nhaps + << " fields, but the previous line has " << previous_nhaps << " fields.\n"; + throw 0; + } + previous_nhaps = current_nhaps; + } + + fin.clear(); // clear error flags + fin.seekg(fileStart); + + cerr << "Loading " << current_nhaps - numMapCols << " haplotypes and " << nloci << " loci...\n"; + + HaplotypeData *data = initHaplotypeData(current_nhaps - numMapCols, nloci); + + string junk; + + for (int locus = 0; locus < data->nloci; locus++) + { + for (int i = 0; i < numMapCols; i++) + { + fin >> junk; + } + for (int hap = 0; hap < data->nhaps; hap++) + { + fin >> data->data[hap][locus]; + if (data->data[hap][locus] != '0' && data->data[hap][locus] != '1') { cerr << "ERROR: Alleles must be coded 0/1 only.\n"; throw 0; @@ -193,13 +321,13 @@ HaplotypeData *initHaplotypeData(unsigned int nhaps, unsigned int nloci) data->nhaps = nhaps; data->nloci = nloci; - data->data = new short*[nhaps]; + data->data = new char *[nhaps]; for (unsigned int i = 0; i < nhaps; i++) { - data->data[i] = new short[nloci]; + data->data[i] = new char[nloci]; for (unsigned int j = 0; j < nloci; j++) { - data->data[i][j] = MISSING; + data->data[i][j] = MISSING_CHAR; } } diff --git a/src/selscan-data.h b/src/selscan-data.h index 52cff87..563e644 100644 --- a/src/selscan-data.h +++ b/src/selscan-data.h @@ -25,10 +25,11 @@ using namespace std; const int MISSING = -9999; +const char MISSING_CHAR = '9'; struct HaplotypeData { - short **data; + char **data; int nhaps; int nloci; }; @@ -50,6 +51,7 @@ void releaseMapData(MapData *data); //returns a populated MapData structure if successful //throws an exception otherwise MapData *readMapData(string filename, int expected_loci); +MapData *readMapDataTPED(string filename, int expected_loci, int expected_haps); //allocates the 2-d array and populated it with -9 HaplotypeData *initHaplotypeData(unsigned int nhaps, unsigned int nloci); @@ -59,6 +61,7 @@ void releaseHapData(HaplotypeData *data); //returns a populated HaplotypeData structure if successful //throws an exception otherwise HaplotypeData *readHaplotypeData(string filename); +HaplotypeData *readHaplotypeDataTPED(string filename); //counts the number of "fields" in a string //where a field is defined as a contiguous set of non whitespace diff --git a/src/selscan-main.cpp b/src/selscan-main.cpp index 54f1ceb..e8e13b4 100644 --- a/src/selscan-main.cpp +++ b/src/selscan-main.cpp @@ -59,6 +59,18 @@ const int DEFAULT_THREAD = 1; const string HELP_THREAD = "The number of threads to spawn during the calculation.\n\ \tPartitions loci across threads."; +const string ARG_FILENAME_POP1_TPED = "--tped"; +const string DEFAULT_FILENAME_POP1_TPED = "__hapfile1"; +const string HELP_FILENAME_POP1_TPED = "A TPED file containing haplotype and map data.\n\ +\tVariants should be coded 0/1"; + +const string ARG_FILENAME_POP2_TPED = "--tped-ref"; +const string DEFAULT_FILENAME_POP2_TPED = "__hapfile2"; +const string HELP_FILENAME_POP2_TPED = "A TPED file containing haplotype and map data.\n\ +\tVariants should be coded 0/1. This is the 'reference'\n\ +\tpopulation for XP-EHH calculations and should contain the same number\n\ +\tof loci as the query population. Ignored otherwise."; + const string ARG_FILENAME_POP1 = "--hap"; const string DEFAULT_FILENAME_POP1 = "__hapfile1"; const string HELP_FILENAME_POP1 = "A hapfile with one row per haplotype, and one column per\n\ @@ -130,14 +142,21 @@ const int DEFAULT_QWIN = 100000; const string HELP_QWIN = "When calculating EHH, this is the length of the window in bp\n\ \tin each direction from the query locus."; +const string ARG_MAX_EXTEND = "--max-extend"; +const int DEFAULT_MAX_EXTEND = 1000000; +const string HELP_MAX_EXTEND = "The maximum distance an EHH decay curve is allowed to extend from the core.\n\ +\tSet <= 0 for no restriction."; + +const string ARG_SKIP = "--skip-low-freq"; +const bool DEFAULT_SKIP = false; +const string HELP_SKIP = "Do not include low frequency variants in the construction of haplotypes (iHS only)."; pthread_mutex_t mutex_log = PTHREAD_MUTEX_INITIALIZER; struct work_order_t { - int first_index; - int last_index; int queryLoc; + int id; string filename; @@ -159,8 +178,6 @@ struct work_order_t double *ihh2; double *freq2; - bool *pass; - ofstream *flog; ofstream *fout; Bar *bar; @@ -207,6 +224,8 @@ int main(int argc, char *argv[]) params.addFlag(ARG_THREAD, DEFAULT_THREAD, "", HELP_THREAD); params.addFlag(ARG_FILENAME_POP1, DEFAULT_FILENAME_POP1, "", HELP_FILENAME_POP1); params.addFlag(ARG_FILENAME_POP2, DEFAULT_FILENAME_POP2, "", HELP_FILENAME_POP2); + params.addFlag(ARG_FILENAME_POP1_TPED, DEFAULT_FILENAME_POP1_TPED, "", HELP_FILENAME_POP1_TPED); + params.addFlag(ARG_FILENAME_POP2_TPED, DEFAULT_FILENAME_POP2_TPED, "", HELP_FILENAME_POP2_TPED); params.addFlag(ARG_FILENAME_MAP, DEFAULT_FILENAME_MAP, "", HELP_FILENAME_MAP); params.addFlag(ARG_OUTFILE, DEFAULT_OUTFILE, "", HELP_OUTFILE); params.addFlag(ARG_CUTOFF, DEFAULT_CUTOFF, "", HELP_CUTOFF); @@ -220,6 +239,8 @@ int main(int argc, char *argv[]) params.addFlag(ARG_EHH, DEFAULT_EHH, "", HELP_EHH); params.addFlag(ARG_QWIN, DEFAULT_QWIN, "", HELP_QWIN); params.addFlag(ARG_SOFT_K, DEFAULT_SOFT_K, "SILENT", HELP_SOFT_K); + params.addFlag(ARG_MAX_EXTEND, DEFAULT_MAX_EXTEND, "", HELP_MAX_EXTEND); + params.addFlag(ARG_SKIP, DEFAULT_SKIP, "", HELP_SKIP); try { @@ -233,6 +254,13 @@ int main(int argc, char *argv[]) string hapFilename = params.getStringFlag(ARG_FILENAME_POP1); string hapFilename2 = params.getStringFlag(ARG_FILENAME_POP2); string mapFilename = params.getStringFlag(ARG_FILENAME_MAP); + string tpedFilename = params.getStringFlag(ARG_FILENAME_POP1_TPED); + string tpedFilename2 = params.getStringFlag(ARG_FILENAME_POP2_TPED); + + bool TPED = false; + + if (tpedFilename.compare(DEFAULT_FILENAME_POP1_TPED) != 0) TPED = true; + string outFilename = params.getStringFlag(ARG_OUTFILE); string query = params.getStringFlag(ARG_EHH); @@ -249,6 +277,7 @@ int main(int argc, char *argv[]) bool CALC_XP = params.getBoolFlag(ARG_XP); bool CALC_SOFT = params.getBoolFlag(ARG_SOFT); bool SINGLE_EHH = false; + bool SKIP = params.getBoolFlag(ARG_SKIP); int EHH1K = params.getIntFlag(ARG_SOFT_K); @@ -296,13 +325,22 @@ int main(int argc, char *argv[]) cerr << "ERROR: EHH cut off must be > 0 and < 1."; return 1; } - - if ((CALC_IHS) && hapFilename2.compare(DEFAULT_FILENAME_POP2) != 0) + if (TPED) { - cerr << "ERROR: You are calculating iHS for " << hapFilename << ", but have also given a second data file (" << hapFilename2 << ").\n"; - return 1; + if ((CALC_IHS) && hapFilename2.compare(DEFAULT_FILENAME_POP2_TPED) != 0) + { + cerr << "ERROR: You are calculating iHS for " << tpedFilename << ", but have also given a second data file (" << tpedFilename2 << ").\n"; + return 1; + } + } + else + { + if ((CALC_IHS) && hapFilename2.compare(DEFAULT_FILENAME_POP2) != 0) + { + cerr << "ERROR: You are calculating iHS for " << hapFilename << ", but have also given a second data file (" << hapFilename2 << ").\n"; + return 1; + } } - if (EHH1K < 1) { cerr << "ERROR: EHH1K must be > 0.\n"; @@ -314,17 +352,34 @@ int main(int argc, char *argv[]) try { - hapData = readHaplotypeData(hapFilename); - if (CALC_XP) + if (TPED) + { + hapData = readHaplotypeDataTPED(tpedFilename); + if (CALC_XP) + { + hapData2 = readHaplotypeDataTPED(tpedFilename2); + if (hapData->nloci != hapData2->nloci) + { + cerr << "ERROR: Haplotypes from " << tpedFilename << " and " << tpedFilename2 << " do not have the same number of loci.\n"; + return 1; + } + } + mapData = readMapDataTPED(tpedFilename, hapData->nloci, hapData->nhaps); + } + else { - hapData2 = readHaplotypeData(hapFilename2); - if (hapData->nloci != hapData2->nloci) + hapData = readHaplotypeData(hapFilename); + if (CALC_XP) { - cerr << "ERROR: Haplotypes from " << hapFilename << " and " << hapFilename2 << " do not have the same number of loci.\n"; - return 1; + hapData2 = readHaplotypeData(hapFilename2); + if (hapData->nloci != hapData2->nloci) + { + cerr << "ERROR: Haplotypes from " << hapFilename << " and " << hapFilename2 << " do not have the same number of loci.\n"; + return 1; + } } + mapData = readMapData(mapFilename, hapData->nloci); } - mapData = readMapData(mapFilename, hapData->nloci); } catch (...) { @@ -384,10 +439,18 @@ int main(int argc, char *argv[]) flog << "\n\nCalculating "; if (CALC_XP) flog << "XP-EHH.\n"; else flog << " iHS.\n"; - flog << "Haplotypes filename: " << hapFilename << "\n"; - if (CALC_XP) flog << "Reference haplotypes filename: " << hapFilename2 << "\n"; - flog << "Map filename: " << mapFilename << "\n"; - flog << "Output file: " << outFilename << "\n"; + if (TPED) + { + flog << "Input filename: " << tpedFilename << "\n"; + if (CALC_XP) flog << "Reference input filename: " << tpedFilename2 << "\n"; + flog << "Output file: " << outFilename << "\n"; + } + { + flog << "Haplotypes filename: " << hapFilename << "\n"; + if (CALC_XP) flog << "Reference haplotypes filename: " << hapFilename2 << "\n"; + flog << "Map filename: " << mapFilename << "\n"; + flog << "Output file: " << outFilename << "\n"; + } flog << "Threads: " << numThreads << "\n"; flog << "Scale parameter: " << SCALE_PARAMETER << "\n"; flog << "Max gap parameter: " << MAX_GAP << "\n"; @@ -398,7 +461,6 @@ int main(int argc, char *argv[]) flog.flush(); Bar pbar; - barInit(pbar, mapData->nloci, 78); double *ihs, *ihh1, *ihh2; double *freq, *freq1, *freq2; @@ -409,21 +471,6 @@ int main(int argc, char *argv[]) cerr << "WARNING: there are fewer loci than threads requested. Running with " << numThreads << " thread instead.\n"; } - //Partition loci amongst the specified threads - unsigned long int *NUM_PER_THREAD = new unsigned long int[numThreads]; - unsigned long int div = mapData->nloci / numThreads; - - for (int i = 0; i < numThreads; i++) - { - NUM_PER_THREAD[i] = 0; - NUM_PER_THREAD[i] += div; - } - - for (int i = 0; i < (mapData->nloci) % numThreads; i++) - { - NUM_PER_THREAD[i]++; - } - if (SINGLE_EHH) { work_order_t *order = new work_order_t; @@ -459,14 +506,22 @@ int main(int argc, char *argv[]) return 0; } - ihh1 = new double[mapData->nloci]; - ihh2 = new double[mapData->nloci]; - if (CALC_XP) { + ihh1 = new double[mapData->nloci]; + ihh2 = new double[mapData->nloci]; + freq1 = new double[hapData->nloci]; freq2 = new double[hapData2->nloci]; + for (int i = 0; i < hapData->nloci; i++) + { + freq1[i] = calcFreq(hapData, i); + freq2[i] = calcFreq(hapData2, i); + } + + barInit(pbar, mapData->nloci, 78); + cerr << "Starting XP-EHH calculations.\n"; work_order_t *order; pthread_t *peer = new pthread_t[numThreads]; @@ -474,9 +529,7 @@ int main(int argc, char *argv[]) for (int i = 0; i < numThreads; i++) { order = new work_order_t; - order->first_index = prev_index; - order->last_index = prev_index + NUM_PER_THREAD[i]; - prev_index += NUM_PER_THREAD[i]; + order->id = i; order->hapData1 = hapData; order->hapData2 = hapData2; order->mapData = mapData; @@ -521,8 +574,85 @@ int main(int argc, char *argv[]) } else if (CALC_IHS) { - ihs = new double[hapData->nloci]; + freq = new double[hapData->nloci]; + + MapData *newMapData; + HaplotypeData *newHapData; + double *newfreq; + + int count = 0; + for (int i = 0; i < hapData->nloci; i++) + { + freq[i] = calcFreq(hapData, i); + if (freq[i] > MAF && 1 - freq[i] > MAF) count++; + } + + if (SKIP) //prefilter all sites < MAF + { + cerr << ARG_SKIP << " set. Removing all variants < " << MAF << ".\n"; + flog << ARG_SKIP << " set. Removing all variants < " << MAF << ".\n"; + newfreq = new double [count]; + newMapData = initMapData(count); + newMapData->chr = mapData->chr; + int j = 0; + for (int locus = 0; locus < mapData->nloci; locus++) + { + if (freq[locus] <= MAF || 1 - freq[locus] <= MAF) + { + continue; + } + else + { + newMapData->physicalPos[j] = mapData->physicalPos[locus]; + newMapData->geneticPos[j] = mapData->geneticPos[locus]; + newMapData->locusName[j] = mapData->locusName[locus]; + newfreq[j] = freq[locus]; + j++; + } + } + + newHapData = initHaplotypeData(hapData->nhaps, count); + + for (int hap = 0; hap < newHapData->nhaps; hap++) + { + j = 0; + for (int locus = 0; locus < mapData->nloci; locus++) + { + if (freq[locus] <= MAF || 1 - freq[locus] <= MAF) + { + continue; + } + else + { + newHapData->data[hap][j] = hapData->data[hap][locus]; + j++; + } + } + } + + cerr << "Removed " << mapData->nloci - count << " low frequency variants.\n"; + flog << "Removed " << mapData->nloci - count << " low frequency variants.\n"; + + delete [] freq; + freq = newfreq; + newfreq = NULL; + + releaseHapData(hapData); + hapData = newHapData; + newHapData = NULL; + + releaseMapData(mapData); + mapData = newMapData; + newMapData = NULL; + } + + ihh1 = new double[mapData->nloci]; + ihh2 = new double[mapData->nloci]; + ihs = new double[hapData->nloci]; + + barInit(pbar, mapData->nloci, 78); + cerr << "Starting iHS calculations with alt flag "; if (!ALT) cerr << "not "; cerr << "set.\n"; @@ -533,9 +663,7 @@ int main(int argc, char *argv[]) for (int i = 0; i < numThreads; i++) { order = new work_order_t; - order->first_index = prev_index; - order->last_index = prev_index + NUM_PER_THREAD[i]; - prev_index += NUM_PER_THREAD[i]; + order->id = i; order->hapData = hapData; order->mapData = mapData; order->ihh1 = ihh1; @@ -589,9 +717,9 @@ int main(int argc, char *argv[]) for (int i = 0; i < numThreads; i++) { order = new work_order_t; - order->first_index = prev_index; - order->last_index = prev_index + NUM_PER_THREAD[i]; - prev_index += NUM_PER_THREAD[i]; + //order->first_index = prev_index; + //order->last_index = prev_index + NUM_PER_THREAD[i]; + //prev_index += NUM_PER_THREAD[i]; order->hapData = hapData; order->mapData = mapData; order->ihh1 = ihh1; @@ -658,9 +786,9 @@ double calcFreq(HaplotypeData *hapData, int locus) for (int hap = 0; hap < hapData->nhaps; hap++) { - if (hapData->data[hap][locus] != -9) + if (hapData->data[hap][locus] != MISSING_CHAR) { - freq += hapData->data[hap][locus]; + freq += ( hapData->data[hap][locus] == '1' ) ? 1 : 0; total++; } } @@ -670,7 +798,7 @@ double calcFreq(HaplotypeData *hapData, int locus) void query_locus(void *order) { work_order_t *p = (work_order_t *)order; - short **data = p->hapData->data; + char **data = p->hapData->data; int nloci = p->hapData->nloci; int nhaps = p->hapData->nhaps; int *physicalPos = p->mapData->physicalPos; @@ -708,10 +836,10 @@ void query_locus(void *order) string *haplotypeList = new string[nhaps]; for (int hap = 0; hap < nhaps; hap++) { - derivedCount += data[hap][locus]; - char digit[2]; - sprintf(digit, "%d", data[hap][locus]); - haplotypeList[hap] = digit; + derivedCount += ( data[hap][locus] == '1' ) ? 1 : 0; + //char digit[2]; + //sprintf(digit, "%d", data[hap][locus]); + haplotypeList[hap] = data[hap][locus]; } if (derivedCount == 0 || derivedCount == nhaps) @@ -752,11 +880,11 @@ void query_locus(void *order) for (int hap = 0; hap < nhaps; hap++) { - bool isDerived = data[hap][locus]; + bool isDerived = ( data[hap][locus] == '1') ? 1 : 0; //build haplotype string - char digit[2]; - sprintf(digit, "%d", data[hap][i]); - haplotypeList[hap] += digit; + //char digit[2]; + //sprintf(digit, "%d", data[hap][i]); + haplotypeList[hap] += data[hap][i]; string hapStr = haplotypeList[hap]; if (isDerived) @@ -809,9 +937,9 @@ void query_locus(void *order) haplotypeList = new string[nhaps]; for (int hap = 0; hap < nhaps; hap++) { - char digit[2]; - sprintf(digit, "%d", data[hap][locus]); - haplotypeList[hap] = digit; + //char digit[2]; + //sprintf(digit, "%d", data[hap][locus]); + haplotypeList[hap] = data[hap][locus]; } derivedCurrentColor = 0; @@ -826,11 +954,11 @@ void query_locus(void *order) map derivedHapCount; for (int hap = 0; hap < nhaps; hap++) { - bool isDerived = data[hap][locus]; + bool isDerived = ( data[hap][locus] == '1' ) ? 1 : 0; //build haplotype string - char digit[2]; - sprintf(digit, "%d", data[hap][i]); - haplotypeList[hap] += digit; + //char digit[2]; + //sprintf(digit, "%d", data[hap][i]); + haplotypeList[hap] += data[hap][i]; string hapStr = haplotypeList[hap]; if (isDerived) @@ -905,7 +1033,7 @@ void query_locus(void *order) void query_locus_soft(void *order) { work_order_t *p = (work_order_t *)order; - short **data = p->hapData->data; + char **data = p->hapData->data; int nloci = p->hapData->nloci; int nhaps = p->hapData->nhaps; int *physicalPos = p->mapData->physicalPos; @@ -1266,14 +1394,13 @@ bool familyDidSplit(const string &hapStr, const int hapCount, int **hapColor, co void calc_ihs(void *order) { work_order_t *p = (work_order_t *)order; - short **data = p->hapData->data; + char **data = p->hapData->data; int nloci = p->hapData->nloci; int nhaps = p->hapData->nhaps; int *physicalPos = p->mapData->physicalPos; double *geneticPos = p->mapData->geneticPos; string *locusName = p->mapData->locusName; - int start = p->first_index; - int stop = p->last_index; + int id = p->id; double *ihs = p->ihs; double *ihh1 = p->ihh1; double *ihh2 = p->ihh2; @@ -1286,22 +1413,39 @@ void calc_ihs(void *order) double EHH_CUTOFF = p->params->getDoubleFlag(ARG_CUTOFF); bool ALT = p->params->getBoolFlag(ARG_ALT); double MAF = p->params->getDoubleFlag(ARG_MAF); - - int MAX_EXTEND = 1000000; + int numThreads = p->params->getIntFlag(ARG_THREAD); + int MAX_EXTEND = ( p->params->getIntFlag(ARG_MAX_EXTEND) <= 0 ) ? physicalPos[nloci - 1] - physicalPos[0] : p->params->getIntFlag(ARG_MAX_EXTEND); + //bool SKIP = p->params->getBoolFlag(ARG_SKIP); double (*calc)(map &, int, bool) = p->calc; - int step = (stop - start) / (pbar->totalTicks); + //int step = (stop - start) / (pbar->totalTicks); + int step = (nloci / numThreads) / (pbar->totalTicks); if (step == 0) step = 1; - for (int locus = start; locus < stop; locus++) + bool isDerived; + string hapStr; + + for (int locus = id; locus < nloci; locus += numThreads) { if (locus % step == 0) advanceBar(*pbar, double(step)); ihs[locus] = 0; - freq[locus] = MISSING; + //freq[locus] = MISSING; ihh1[locus] = MISSING; ihh2[locus] = MISSING; + bool skipLocus = 0; + //If the focal snp has MAF < MAF, then skip this locus + if (freq[locus] < MAF || freq[locus] > 1 - MAF) + { + pthread_mutex_lock(&mutex_log); + (*flog) << "WARNING: Locus " << locusName[locus] + << " has MAF < " << MAF << ". Skipping calculation at " << locusName[locus] << "\n"; + pthread_mutex_unlock(&mutex_log); + ihs[locus] = MISSING; + skipLocus = 0; + continue; + } //EHH to the left of the core snp double current_derived_ehh = 1; @@ -1310,32 +1454,16 @@ void calc_ihs(void *order) double previous_ancestral_ehh = 1; int currentLocus = locus; int nextLocus = locus - 1; - bool skipLocus = 0; double derived_ihh = 0; double ancestral_ihh = 0; - double derivedCount = 0; + //double derivedCount = 0; //A list of all the haplotypes //Starts with just the focal snp and grows outward string *haplotypeList = new string[nhaps]; for (int hap = 0; hap < nhaps; hap++) { - derivedCount += data[hap][locus]; - char digit[2]; - sprintf(digit, "%d", data[hap][locus]); - haplotypeList[hap] = digit; - } - - //If the focal snp has MAF < MAF, then skip this locus - double alleleFreq = double(derivedCount) / double(nhaps); - if (alleleFreq < MAF || alleleFreq > 1 - MAF) - { - pthread_mutex_lock(&mutex_log); - (*flog) << "WARNING: Locus " << locusName[locus] - << " has MAF < " << MAF << ". Skipping calculation at " << locusName[locus] << "\n"; - pthread_mutex_unlock(&mutex_log); - ihs[locus] = MISSING; - skipLocus = 0; - continue; + //derivedCount += ( data[hap][locus] == '1' ) ? 1 : 0; + haplotypeList[hap] = data[hap][locus]; } while (current_derived_ehh > EHH_CUTOFF || current_ancestral_ehh > EHH_CUTOFF) @@ -1374,12 +1502,9 @@ void calc_ihs(void *order) for (int hap = 0; hap < nhaps; hap++) { - bool isDerived = data[hap][locus]; - //build haplotype string - char digit[2]; - sprintf(digit, "%d", data[hap][currentLocus]); - haplotypeList[hap] += digit; - string hapStr = haplotypeList[hap]; + isDerived = ( data[hap][locus] == '1' ) ? 1 : 0; + haplotypeList[hap] += data[hap][currentLocus]; + hapStr = haplotypeList[hap]; if (isDerived) { @@ -1456,9 +1581,9 @@ void calc_ihs(void *order) haplotypeList = new string[nhaps]; for (int hap = 0; hap < nhaps; hap++) { - char digit[2]; - sprintf(digit, "%d", data[hap][locus]); - haplotypeList[hap] = digit; + //char digit[2]; + //sprintf(digit, "%d", data[hap][locus]); + haplotypeList[hap] = data[hap][locus]; } while (current_ancestral_ehh > EHH_CUTOFF || current_derived_ehh > EHH_CUTOFF) @@ -1494,12 +1619,9 @@ void calc_ihs(void *order) map derivedHapCount; for (int hap = 0; hap < nhaps; hap++) { - bool isDerived = data[hap][locus]; - //build haplotype string - char digit[2]; - sprintf(digit, "%d", data[hap][currentLocus]); - haplotypeList[hap] += digit; - string hapStr = haplotypeList[hap]; + isDerived = ( data[hap][locus] == '1') ? 1 : 0; + haplotypeList[hap] += data[hap][currentLocus]; + hapStr = haplotypeList[hap]; if (isDerived) { @@ -1565,7 +1687,7 @@ void calc_ihs(void *order) ihh1[locus] = derived_ihh; ihh2[locus] = ancestral_ihh; ihs[locus] = log(derived_ihh / ancestral_ihh); - freq[locus] = double(derivedCount) / double(nhaps); + //freq[locus] = double(derivedCount) / double(nhaps); } } } @@ -1573,14 +1695,15 @@ void calc_ihs(void *order) void calc_soft_ihs(void *order) { work_order_t *p = (work_order_t *)order; - short **data = p->hapData->data; + char **data = p->hapData->data; int nloci = p->hapData->nloci; int nhaps = p->hapData->nhaps; int *physicalPos = p->mapData->physicalPos; double *geneticPos = p->mapData->geneticPos; string *locusName = p->mapData->locusName; - int start = p->first_index; - int stop = p->last_index; + //int start = p->first_index; + //int stop = p->last_index; + int id = p->id; double *h1 = p->ihh1; double *h2dh1 = p->ihh2; double *h12 = p->ihs; @@ -1593,11 +1716,12 @@ void calc_soft_ihs(void *order) double EHH_CUTOFF = p->params->getDoubleFlag(ARG_CUTOFF); bool ALT = p->params->getBoolFlag(ARG_ALT); double MAF = p->params->getDoubleFlag(ARG_MAF); + int numThreads = p->params->getIntFlag(ARG_THREAD); - int step = (stop - start) / (pbar->totalTicks); + int step = (nloci / numThreads) / (pbar->totalTicks); if (step == 0) step = 1; - for (int locus = start; locus < stop; locus++) + for (int locus = id; locus < nloci; locus += numThreads) { if (locus % step == 0) advanceBar(*pbar, double(step)); @@ -1838,12 +1962,12 @@ void calc_xpihh(void *order) { work_order_t *p = (work_order_t *)order; - short **data1 = p->hapData1->data; + char **data1 = p->hapData1->data; int nhaps1 = p->hapData1->nhaps; double *ihh1 = p->ihh1; double *freq1 = p->freq1; - short **data2 = p->hapData2->data; + char **data2 = p->hapData2->data; int nhaps2 = p->hapData2->nhaps; double *ihh2 = p->ihh2; double *freq2 = p->freq2; @@ -1853,8 +1977,7 @@ void calc_xpihh(void *order) double *geneticPos = p->mapData->geneticPos; string *locusName = p->mapData->locusName; - int start = p->first_index; - int stop = p->last_index; + int id = p->id; ofstream *flog = p->flog; Bar *pbar = p->bar; @@ -1863,25 +1986,24 @@ void calc_xpihh(void *order) int MAX_GAP = p->params->getIntFlag(ARG_MAX_GAP); double EHH_CUTOFF = p->params->getDoubleFlag(ARG_CUTOFF); bool ALT = p->params->getBoolFlag(ARG_ALT); + int numThreads = p->params->getIntFlag(ARG_THREAD); - int MAX_EXTEND = 1000000; + int MAX_EXTEND = ( p->params->getIntFlag(ARG_MAX_EXTEND) <= 0 ) ? physicalPos[nloci - 1] - physicalPos[0] : p->params->getIntFlag(ARG_MAX_EXTEND); - //Weird offset thing mentioned in Sabetti et al. (2007) that is apparently used to calculate iHH - //subtract this from EHH when integrating - //double offset = EHH_CUTOFF-(1.0/double(nhaps)); - - int step = (stop - start) / (pbar->totalTicks); + int step = (nloci / numThreads) / (pbar->totalTicks); if (step == 0) step = 1; - for (int locus = start; locus < stop; locus++) + string hapStr; + + for (int locus = id; locus < nloci; locus += numThreads) { if (locus % step == 0) advanceBar(*pbar, double(step)); ihh1[locus] = 0; ihh2[locus] = 0; - freq1[locus] = MISSING; - freq2[locus] = MISSING; + //freq1[locus] = MISSING; + //freq2[locus] = MISSING; //EHH to the left of the core snp double current_pooled_ehh = 1; @@ -1910,24 +2032,23 @@ void calc_xpihh(void *order) haplotypeListPooled = new string[nhaps1 + nhaps2]; for (int hap = 0; hap < nhaps1 + nhaps2; hap++) { - char digit[2]; + //char digit[2]; //Pop1 if (hap < nhaps1) { - sprintf(digit, "%d", data1[hap][locus]); - haplotypeList1[hap] = digit; - derivedCount1 += data1[hap][locus]; + //sprintf(digit, "%d", data1[hap][locus]); + haplotypeList1[hap] = data1[hap][locus]; + haplotypeListPooled[hap] = data1[hap][locus]; + derivedCount1 += ( data1[hap][locus] == '1') ? 1 : 0; } //Pop2 else { - sprintf(digit, "%d", data2[hap - nhaps1][locus]); - haplotypeList2[hap - nhaps1] = digit; - derivedCount2 += data2[hap - nhaps1][locus]; + //sprintf(digit, "%d", data2[hap - nhaps1][locus]); + haplotypeList2[hap - nhaps1] = data2[hap - nhaps1][locus]; + haplotypeListPooled[hap] = data2[hap - nhaps1][locus]; + derivedCount2 += ( data2[hap - nhaps1][locus] == '1' ) ? 1 : 0; } - - //Pooled - haplotypeListPooled[hap] = digit; } derivedCountPooled = derivedCount1 + derivedCount2; @@ -1997,30 +2118,35 @@ void calc_xpihh(void *order) //build haplotype strings for (int hap = 0; hap < nhaps1 + nhaps2; hap++) { - char digit[2]; - string hapStr; + //char digit[2]; if (hap < nhaps1) //Pop1 { - sprintf(digit, "%d", data1[hap][currentLocus]); - haplotypeList1[hap] += digit; + //sprintf(digit, "%d", data1[hap][currentLocus]); + haplotypeList1[hap] += data1[hap][currentLocus]; hapStr = haplotypeList1[hap]; if (hapCount1.count(hapStr) == 0) hapCount1[hapStr] = 1; else hapCount1[hapStr]++; + + //Pooled + haplotypeListPooled[hap] += data1[hap][currentLocus]; + hapStr = haplotypeListPooled[hap]; + if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; + else hapCountPooled[hapStr]++; } else //Pop2 { - sprintf(digit, "%d", data2[hap - nhaps1][currentLocus]); - haplotypeList2[hap - nhaps1] += digit; + //sprintf(digit, "%d", data2[hap - nhaps1][currentLocus]); + haplotypeList2[hap - nhaps1] += data2[hap - nhaps1][currentLocus]; hapStr = haplotypeList2[hap - nhaps1]; if (hapCount2.count(hapStr) == 0) hapCount2[hapStr] = 1; else hapCount2[hapStr]++; - } - //Pooled - haplotypeListPooled[hap] += digit; - hapStr = haplotypeListPooled[hap]; - if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; - else hapCountPooled[hapStr]++; + //Pooled + haplotypeListPooled[hap] += data2[hap - nhaps1][currentLocus]; + hapStr = haplotypeListPooled[hap]; + if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; + else hapCountPooled[hapStr]++; + } } current_pop1_ehh = calculateHomozygosity(hapCount1, nhaps1, ALT); @@ -2078,24 +2204,26 @@ void calc_xpihh(void *order) haplotypeListPooled = new string[nhaps1 + nhaps2]; for (int hap = 0; hap < nhaps1 + nhaps2; hap++) { - char digit[2]; //Pop1 if (hap < nhaps1) { - sprintf(digit, "%d", data1[hap][locus]); - haplotypeList1[hap] = digit; + //sprintf(digit, "%d", data1[hap][locus]); + haplotypeList1[hap] = data1[hap][locus]; derivedCount1 += data1[hap][locus]; + + //Pooled + haplotypeListPooled[hap] = data1[hap][locus]; } //Pop2 else { - sprintf(digit, "%d", data2[hap - nhaps1][locus]); - haplotypeList2[hap - nhaps1] = digit; + //sprintf(digit, "%d", data2[hap - nhaps1][locus]); + haplotypeList2[hap - nhaps1] = data2[hap - nhaps1][locus]; derivedCount2 += data2[hap - nhaps1][locus]; - } - //Pooled - haplotypeListPooled[hap] = digit; + //Pooled + haplotypeListPooled[hap] = data2[hap - nhaps1][locus]; + } } derivedCountPooled = derivedCount1 + derivedCount2; @@ -2164,32 +2292,37 @@ void calc_xpihh(void *order) //build haplotype strings for (int hap = 0; hap < nhaps1 + nhaps2; hap++) { - char digit[2]; - string hapStr; + //char digit[2]; + //string hapStr; //pop1 if (hap < nhaps1) { - sprintf(digit, "%d", data1[hap][currentLocus]); - haplotypeList1[hap] += digit; + haplotypeList1[hap] += data1[hap][currentLocus]; hapStr = haplotypeList1[hap]; if (hapCount1.count(hapStr) == 0) hapCount1[hapStr] = 1; else hapCount1[hapStr]++; + + //Pooled + haplotypeListPooled[hap] += data1[hap][currentLocus]; + hapStr = haplotypeListPooled[hap]; + if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; + else hapCountPooled[hapStr]++; } //Pop2 else { - sprintf(digit, "%d", data2[hap - nhaps1][currentLocus]); - haplotypeList2[hap - nhaps1] += digit; + haplotypeList2[hap - nhaps1] += data2[hap - nhaps1][currentLocus]; hapStr = haplotypeList2[hap - nhaps1]; if (hapCount2.count(hapStr) == 0) hapCount2[hapStr] = 1; else hapCount2[hapStr]++; + + //Pooled + haplotypeListPooled[hap] += data2[hap - nhaps1][currentLocus]; + hapStr = haplotypeListPooled[hap]; + if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; + else hapCountPooled[hapStr]++; } - //Pooled - haplotypeListPooled[hap] += digit; - hapStr = haplotypeListPooled[hap]; - if (hapCountPooled.count(hapStr) == 0) hapCountPooled[hapStr] = 1; - else hapCountPooled[hapStr]++; } current_pop1_ehh = calculateHomozygosity(hapCount1, nhaps1, ALT); @@ -2225,13 +2358,13 @@ void calc_xpihh(void *order) if (ihh1[locus] != MISSING) { ihh1[locus] = ihhPop1; - freq1[locus] = double(derivedCount1) / double(nhaps1); + //freq1[locus] = double(derivedCount1) / double(nhaps1); } if (ihh2[locus] != MISSING) { ihh2[locus] = ihhPop2; - freq2[locus] = double(derivedCount2) / double(nhaps2); + //freq2[locus] = double(derivedCount2) / double(nhaps2); } } }