Skip to content

Commit

Permalink
Merge branch 'devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
szpiech committed Jun 17, 2014
2 parents c1c24ba + e098e9f commit 4250a3c
Show file tree
Hide file tree
Showing 11 changed files with 466 additions and 183 deletions.
19 changes: 19 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ PC Sabeti, et al. (2002) Detecting recent positive selection in the human

USAGE:

Data must be phased and have no missing genotypes.

To calculate EHH:

./selscan --ehh <locusID> --hap <haplotypes> --map <mapfile> --out <outfile>
Expand Down Expand Up @@ -82,6 +84,16 @@ selscan --xpehh --hap example2.pop1.hap --ref example2.pop2.hap --map example2.m
population for XP-EHH calculations. Ignored otherwise.
Default: __hapfile2

--tped <string>: A TPED file containing haplotype and map data.
Variants should be coded 0/1
Default: __hapfile1

--tped-ref <string>: A TPED file containing haplotype and map data.
Variants should be coded 0/1. This is the 'reference'
population for XP-EHH calculations and should contain the same number
of loci as the query population. Ignored otherwise.
Default: __hapfile2

--out <string>: The basename for all output files.
Default: outfile

Expand Down Expand Up @@ -114,6 +126,13 @@ selscan --xpehh --hap example2.pop1.hap --ref example2.pop2.hap --map example2.m
--max-gap <int>: Maximum allowed gap in bp between two snps.
Default: 200000

--max-extend <int>: The maximum distance an EHH decay curve is allowed to extend from the core.
Set <= 0 for no restriction.
Default: 1000000

--skip-low-freq <bool>: Do not include low frequency variants in the construction of haplotypes (iHS only).
Default: false

--threads <int>: The number of threads to spawn during the calculation.
Partitions loci across threads.
Default: 1
Expand Down
Binary file modified bin/linux/norm
Binary file not shown.
Binary file modified bin/linux/selscan
Binary file not shown.
Binary file modified bin/osx/norm
Binary file not shown.
Binary file modified bin/osx/selscan
Binary file not shown.
Binary file modified bin/win/norm.exe
Binary file not shown.
Binary file modified bin/win/selscan.exe
Binary file not shown.
8 changes: 4 additions & 4 deletions src/norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ int main(int argc, char *argv[])

cerr << "Calculating mean and variance per frequency bin:\n\n";
getMeanVarBins(freq, score, totalLoci, mean, variance, n, numBins, threshold);
/*

gsl_sort(score, 1, totalLoci);
double upperCutoff = gsl_stats_quantile_from_sorted_data (score, 1, totalLoci, 0.995);
double lowerCutoff = gsl_stats_quantile_from_sorted_data (score, 1, totalLoci, 0.005);
Expand All @@ -288,9 +288,9 @@ int main(int argc, char *argv[])
cerr << "Bottom 0.5% cutoff: " << lowerCutoff << "\n\n";
flog << "\nTop 0.5% cutoff: " << upperCutoff << endl;
flog << "Bottom 0.5% cutoff: " << lowerCutoff << "\n\n";
*/
double upperCutoff = 2;
double lowerCutoff = -2;

//double upperCutoff = 2;
//double lowerCutoff = -2;
delete [] freq;
delete [] score;

Expand Down
140 changes: 134 additions & 6 deletions src/selscan-data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Expand Down Expand Up @@ -75,6 +75,62 @@ MapData *readMapData(string filename, int expected_loci)
return data;
}

MapData *readMapDataTPED(string filename, int expected_loci, int expected_haps)
{
ifstream fin;
cerr << "Opening " << filename << "...\n";
fin.open(filename.c_str());

if (fin.fail())
{
cerr << "ERROR: Failed to open " << filename << " for reading.\n";
throw 0;
}

int fileStart = fin.tellg();
string line;
int nloci = 0;
int num_cols = 4;
int current_cols = 0;
while (getline(fin, line))
{
nloci++;
current_cols = countFields(line);
if (current_cols != num_cols + expected_haps)
{
cerr << "ERROR: line " << nloci << " of " << filename << " has " << current_cols
<< ", but expected " << num_cols + expected_haps << ".\n";
throw 0;
}
}

if (nloci != expected_loci)
{
cerr << "ERROR: Expected " << expected_loci << " loci in map file but found " << nloci << ".\n";
throw 0;
}

fin.clear(); // clear error flags
fin.seekg(fileStart);

cerr << "Loading map data for " << nloci << " loci\n";

MapData *data = initMapData(nloci);

string chr;
for (int locus = 0; locus < data->nloci; locus++)
{
fin >> data->chr;
fin >> data->locusName[locus];
fin >> data->geneticPos[locus];
fin >> data->physicalPos[locus];
getline(fin, line);
}

fin.close();
return data;
}

//allocates the arrays and populates them with MISSING or "--" depending on type
MapData *initMapData(int nloci)
{
Expand Down Expand Up @@ -168,7 +224,79 @@ HaplotypeData *readHaplotypeData(string filename)
for (int locus = 0; locus < data->nloci; locus++)
{
fin >> data->data[hap][locus];
if(data->data[hap][locus] != 0 && data->data[hap][locus] != 1)
if (data->data[hap][locus] != '0' && data->data[hap][locus] != '1')
{
cerr << "ERROR: Alleles must be coded 0/1 only.\n";
throw 0;
}
}
}

fin.close();

return data;
}

HaplotypeData *readHaplotypeDataTPED(string filename)
{
ifstream fin;
cerr << "Opening " << filename << "...\n";
fin.open(filename.c_str());

if (fin.fail())
{
cerr << "ERROR: Failed to open " << filename << " for reading.\n";
throw 0;
}

int numMapCols = 4;
int fileStart = fin.tellg();
string line;
int nloci = 0;
int previous_nhaps = -1;
int current_nhaps = 0;
//Counts number of haps (cols) and number of loci (rows)
//if any lines differ, send an error message and throw an exception
while (getline(fin, line))
{
//getline(fin,line);
//if(fin.eof()) break;
nloci++;
current_nhaps = countFields(line);
//cout << "nloci: " << current_nhaps << endl;
if (previous_nhaps < 0)
{
previous_nhaps = current_nhaps;
continue;
}
else if (previous_nhaps != current_nhaps)
{
cerr << "ERROR: line " << nloci << " of " << filename << " has " << current_nhaps
<< " fields, but the previous line has " << previous_nhaps << " fields.\n";
throw 0;
}
previous_nhaps = current_nhaps;
}

fin.clear(); // clear error flags
fin.seekg(fileStart);

cerr << "Loading " << current_nhaps - numMapCols << " haplotypes and " << nloci << " loci...\n";

HaplotypeData *data = initHaplotypeData(current_nhaps - numMapCols, nloci);

string junk;

for (int locus = 0; locus < data->nloci; locus++)
{
for (int i = 0; i < numMapCols; i++)
{
fin >> junk;
}
for (int hap = 0; hap < data->nhaps; hap++)
{
fin >> data->data[hap][locus];
if (data->data[hap][locus] != '0' && data->data[hap][locus] != '1')
{
cerr << "ERROR: Alleles must be coded 0/1 only.\n";
throw 0;
Expand All @@ -193,13 +321,13 @@ HaplotypeData *initHaplotypeData(unsigned int nhaps, unsigned int nloci)
data->nhaps = nhaps;
data->nloci = nloci;

data->data = new short*[nhaps];
data->data = new char *[nhaps];
for (unsigned int i = 0; i < nhaps; i++)
{
data->data[i] = new short[nloci];
data->data[i] = new char[nloci];
for (unsigned int j = 0; j < nloci; j++)
{
data->data[i][j] = MISSING;
data->data[i][j] = MISSING_CHAR;
}
}

Expand Down
5 changes: 4 additions & 1 deletion src/selscan-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@
using namespace std;

const int MISSING = -9999;
const char MISSING_CHAR = '9';

struct HaplotypeData
{
short **data;
char **data;
int nhaps;
int nloci;
};
Expand All @@ -50,6 +51,7 @@ void releaseMapData(MapData *data);
//returns a populated MapData structure if successful
//throws an exception otherwise
MapData *readMapData(string filename, int expected_loci);
MapData *readMapDataTPED(string filename, int expected_loci, int expected_haps);

//allocates the 2-d array and populated it with -9
HaplotypeData *initHaplotypeData(unsigned int nhaps, unsigned int nloci);
Expand All @@ -59,6 +61,7 @@ void releaseHapData(HaplotypeData *data);
//returns a populated HaplotypeData structure if successful
//throws an exception otherwise
HaplotypeData *readHaplotypeData(string filename);
HaplotypeData *readHaplotypeDataTPED(string filename);

//counts the number of "fields" in a string
//where a field is defined as a contiguous set of non whitespace
Expand Down
Loading

0 comments on commit 4250a3c

Please sign in to comment.