diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000..40e9f40 --- /dev/null +++ b/.pydevproject @@ -0,0 +1,5 @@ + + +Default +python 2.7 + diff --git a/PL_to_GL_reorder.py b/PL_to_GL_reorder.py new file mode 100755 index 0000000..6e53133 --- /dev/null +++ b/PL_to_GL_reorder.py @@ -0,0 +1,77 @@ +#!/bin/env python3 + +import argparse +from Bio.bgzf import BgzfWriter +import gzip +import sys + +def main(): + + args = parse_args() + with gzip.open(args.out, 'wt') as fd_out: + for i_vcf, vcf in enumerate(args.vcf): + with gzip.open(vcf, 'rt') as fd_vcf: + ## loop metadata lines and header line + for line in fd_vcf: + ## end of metadata lines + if line[0:2] != '##': + ## print header line + print(line, end='', file=fd_out) + ## break loop at the header line + break + if i_vcf == 0: + ## print metadata line + print(line, end='', file=fd_out) + ## loop records + for line in fd_vcf: + l = line.rstrip().split('\t') + field = l[8].split(':') + ## index PL + i_PL = l[8].split(':').index('PL') + field.append('GL') + GT = field.pop(0) + field.sort() + field.insert(0, GT) + l[8] = ':'.join(field) + ## index GL + i_GL = l[8].split(':').index('GL') + s = '\t'.join(l[:9]) + print(s, sep='\t', file=fd_out, end='') + for i_GT in range(9, len(l)): + lGT = l[i_GT] + info = l[i_GT].split(':') + if lGT == './.:0,0': + toSplit = './.:0,0:0' + info = toSplit.split(':') + if info[0] == './.': + s_GL = '.,.,.' + else: + s_PL = l[i_GT].split(':')[i_PL] + if s_PL == '.': + s_GL = '.,.,.' + else: + s_GL = ','.join( + str(-float(PL)/10) for PL in s_PL.split(',')) + if s_GL==".": + print(line) + exit() + info.insert(i_GL, s_GL) + l[i_GT] = info + print('\t' + + ':'.join(l[i_GT]), sep='\t', end='', + file=fd_out) + print(end='\n', file=fd_out) + + return + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--vcf', nargs = '+') + parser.add_argument('--out', required=True) + args = parser.parse_args() + + return args + +if __name__ == '__main__': + main() diff --git a/prepareGenFromBeagle4_modified20160601/bin/prepareGenFromBeagle4 b/prepareGenFromBeagle4_modified20160601/bin/prepareGenFromBeagle4 new file mode 100755 index 0000000..6ec9ef0 Binary files /dev/null and b/prepareGenFromBeagle4_modified20160601/bin/prepareGenFromBeagle4 differ diff --git a/prepareGenFromBeagle4_modified20160601/makefile b/prepareGenFromBeagle4_modified20160601/makefile new file mode 100644 index 0000000..5cf3579 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/makefile @@ -0,0 +1,90 @@ +#compiler +CXX=g++ + +#compiler flags +COPTI= -O3 +CDEBG= -g +CWARN= -Wall -Wextra -Wno-sign-compare +CBAMH= -D_LIBBAM +CSHV1= + +#linker flags +LOPTI= -O3 +LDEBG= -g +LSTDD= -lm -lpthread -lboost_iostreams -lboost_program_options +LSTDS= -Wl,-Bstatic -lboost_iostreams -lz -lbz2 -Wl,-Bdynamic -lm -lpthread +LCL3S= -Wl,-Bstatic -lz -lbz2 -Wl,-Bdynamic -lm -lpthread +LBAMD= -lbam -lz + +#executable file +EFILE= bin/prepareGenFromBeagle4 + +#header files +HFILE= $(shell find src -name *.h) + +#source files +CFILE= $(shell find src -name *.cpp) + +#source path +VPATH= $(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done) + +#include path +ISTDP= -Isrc +IBAMP= -Ilib +#ICL3P= -I/apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/include +#ICL3P= -I$EBROOTBOOST/include +ICL3P= -I/apps/software/Boost/1.60.0-foss-2015b-Python-2.7.9/include/ + +#library path +LSTDP= -Llib + +#object files +OFILE= $(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done) +OBOST= + +#default +all: cluster + +#dynamic release +dynamic: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +dynamic: LFLAG=$(LOPTI) $(LSTDD) +dynamic: IFLAG=$(ISTDP) +dynamic: $(EFILE) + +#debug release +debug: CFLAG=$(CDEBG) $(CWARN) $(CSHV1) +debug: LFLAG=$(LDEBG) $(LSTDD) +debug: IFLAG=$(ISTDP) +debug: $(EFILE) + +#static release +static: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +static: LFLAG=$(LOPTI) $(LSTDS) +static: IFLAG=$(ISTDP) +static: $(EFILE) + +#cluster release +cluster: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +cluster: LFLAG=$(LOPTI) $(LCL3S) +cluster: IFLAG=$(ISTDP) $(ICL3P) +#cluster: OBOST=/apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/lib/libboost_iostreams.a /apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/lib/libboost_program_options.a +OBOST=-lz /apps/software/Boost/1.60.0-foss-2015b-Python-2.7.9/lib/libboost_iostreams.a +cluster: $(EFILE) + +$(EFILE): $(OFILE) + $(CXX) $^ $(OBOST) -o $@ $(LFLAG) + +obj/%.o: %.cpp $(HFILE) + $(CXX) -o $@ -c $< $(CFLAG) $(IFLAG) + +clean: + rm -f obj/*.o $(EFILE) + +test: + cp $(EFILE) ~/$(EFILE).v10 + +oxford: + cp $(EFILE) ~/bin/. + +install: + cp $(EFILE) ./tmp/ diff --git a/prepareGenFromBeagle4_modified20160601/makefile.bak b/prepareGenFromBeagle4_modified20160601/makefile.bak new file mode 100644 index 0000000..3182d38 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/makefile.bak @@ -0,0 +1,87 @@ +#compiler +CXX=g++ + +#compiler flags +COPTI= -O3 +CDEBG= -g +CWARN= -Wall -Wextra -Wno-sign-compare +CBAMH= -D_LIBBAM +CSHV1= + +#linker flags +LOPTI= -O3 +LDEBG= -g +LSTDD= -lm -lpthread -lboost_iostreams -lboost_program_options +LSTDS= -Wl,-Bstatic -lboost_iostreams -lz -lbz2 -Wl,-Bdynamic -lm -lpthread +LCL3S= -Wl,-Bstatic -lz -lbz2 -Wl,-Bdynamic -lm -lpthread +LBAMD= -lbam -lz + +#executable file +EFILE= bin/prepareGenFromBeagle4 + +#header files +HFILE= $(shell find src -name *.h) + +#source files +CFILE= $(shell find src -name *.cpp) + +#source path +VPATH= $(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done) + +#include path +ISTDP= -Isrc +IBAMP= -Ilib +ICL3P= -I/users/delaneau/BOOST/include + +#library path +LSTDP= -Llib + +#object files +OFILE= $(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done) +OBOST= + +#default +all: dynamic + +#dynamic release +dynamic: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +dynamic: LFLAG=$(LOPTI) $(LSTDD) +dynamic: IFLAG=$(ISTDP) +dynamic: $(EFILE) + +#debug release +debug: CFLAG=$(CDEBG) $(CWARN) $(CSHV1) +debug: LFLAG=$(LDEBG) $(LSTDD) +debug: IFLAG=$(ISTDP) +debug: $(EFILE) + +#static release +static: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +static: LFLAG=$(LOPTI) $(LSTDS) +static: IFLAG=$(ISTDP) +static: $(EFILE) + +#cluster release +cluster: CFLAG=$(COPTI) $(CWARN) $(CSHV1) +cluster: LFLAG=$(LOPTI) $(LCL3S) +cluster: IFLAG=$(ISTDP) $(ICL3P) +cluster: OBOST=~/BOOST/lib/libboost_iostreams.a ~/BOOST/lib/libboost_program_options.a +cluster: $(EFILE) + +$(EFILE): $(OFILE) + $(CXX) $^ $(OBOST) -o $@ $(LFLAG) + +obj/%.o: %.cpp $(HFILE) + $(CXX) -o $@ -c $< $(CFLAG) $(IFLAG) + +clean: + rm -f obj/*.o $(EFILE) + +test: + cp $(EFILE) ~/$(EFILE).v10 + +oxford: + cp $(EFILE) ~/bin/. + +install: + cp $(EFILE) /usr/local/bin/. diff --git a/prepareGenFromBeagle4_modified20160601/obj/chunk.o b/prepareGenFromBeagle4_modified20160601/obj/chunk.o new file mode 100644 index 0000000..3ab7ecc Binary files /dev/null and b/prepareGenFromBeagle4_modified20160601/obj/chunk.o differ diff --git a/prepareGenFromBeagle4_modified20160601/obj/data.o b/prepareGenFromBeagle4_modified20160601/obj/data.o new file mode 100644 index 0000000..b7f5355 Binary files /dev/null and b/prepareGenFromBeagle4_modified20160601/obj/data.o differ diff --git a/prepareGenFromBeagle4_modified20160601/obj/prepareGenFromBeagle4.o b/prepareGenFromBeagle4_modified20160601/obj/prepareGenFromBeagle4.o new file mode 100644 index 0000000..73749df Binary files /dev/null and b/prepareGenFromBeagle4_modified20160601/obj/prepareGenFromBeagle4.o differ diff --git a/prepareGenFromBeagle4_modified20160601/obj/utils.o b/prepareGenFromBeagle4_modified20160601/obj/utils.o new file mode 100644 index 0000000..9d7c0f7 Binary files /dev/null and b/prepareGenFromBeagle4_modified20160601/obj/utils.o differ diff --git a/prepareGenFromBeagle4_modified20160601/old/chunk.cpp b/prepareGenFromBeagle4_modified20160601/old/chunk.cpp new file mode 100644 index 0000000..4efdb35 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/old/chunk.cpp @@ -0,0 +1,76 @@ +#include "classes.h" + + +int hamming (vector < bool > & h1, vector < bool > & h2) { + assert(h1.size() == h2.size()); + int c = 0; + for(int l = 0 ; l < h1.size(); l++) if (h1[l] != h2[l]) c++; + return c; +} + +chunk::chunk(char * id) { + this->id = string(id); +} + +chunk::~chunk() { + mappingS.clear(); + H.clear(); + P.clear(); + switches.clear(); +} + +bool chunk::operator < (const chunk & c) const { + return (mappingS[0] < c.mappingS[0]); +} + +void chunk::assemble(chunk & pc) { + //mapping starting point of the overlap + int b1 = -1; + int e1 = pc.mappingS.size() - 1; + for (int l = 0 ; l < pc.mappingS.size() && b1 < 0 ; l ++ ) if (pc.mappingS[l] == mappingS[0]) b1 = l; + if (b1 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc.id << "], code 1" << endl; exit(1);} + + //mapping starting point of the overlap + int b2 = 0; + int e2 = -1; + for (int l = 0 ; l < mappingS.size() && e2 < 0 ; l ++ ) if (mappingS[l] == pc.mappingS.back()) e2 = l; + if (e2 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc.id << "], code 2" << endl; exit(1);} + + //overlap size + int osize = e1 - b1 + 1; + int osize2 = e2 - b2 + 1; + if (osize != osize2) {cout << "Overlaps are differents between [" << id << "] does not overlap chunk [" << pc.id << "] o1=" << osize << " o2=" << osize2 <chr = chr; + this->id = id; + this->pos = pos; + this->a0 = a0; + this->a1 = a1; + this->idx = idx; + done = false; + } +}; + +class snp_set { +public: + vector < snp * > vec_snp; + multimap < int, snp *> map_snp; + + //C/D + snp_set() {}; + ~snp_set() {}; + + //M + int size() { + return vec_snp.size(); + } + + void push(snp * s) { + vec_snp.push_back(s); + map_snp.insert ( std::pair< int, snp * >( s->pos , s)); + } + + snp * get(int p, string & r0, string & r1) { + pair < multimap < int, snp *>::iterator, multimap < int, snp *>::iterator > seqM = map_snp.equal_range(p); + for (multimap < int, snp *>::iterator itM = seqM.first; itM != seqM.second; ++itM) + if (r0 == itM->second->a0 && r1 == itM->second->a1) return itM->second; + return NULL; + } +}; + + +class genotype { +public: + string id; + int idx; + + vector < float > L; + vector < float > P; + vector < char > G; + vector < bool > h1; + vector < bool > h2; + + //C/D + genotype(string & id, int idx){ + this->id = id; + this->idx = idx; + } + + ~genotype () { + L.clear(); P.clear(); G.clear(); h1.clear(); h2.clear(); + } + + int call(float threshold) { + int n = 0; + for (int l = 0 ; l < h1.size() ; l ++) { + G[l] = -1; + if (P[3 * l + 0] >= threshold) G[l] = 0; + if (P[3 * l + 1] >= threshold) G[l] = 1; + if (P[3 * l + 2] >= threshold) G[l] = 2; + if (G[l] >= 0) n ++; + } + return n; + } +}; + +class chunk { +public: + string id; + vector < int > mappingS; + vector < vector < bool > > H; + vector < vector < float > > P; + vector < bool > switches; + + chunk(char *) ; + ~chunk(); + + bool operator < (const chunk & c) const; + + void assemble(chunk & pc); +}; + +class data { +public: + vector < genotype * > G; + snp_set M; + vector < chunk * > C; + + data(); + ~data(); + + void readLikelihoods(char *); + void readPosteriors(char *); + void assemble(); + void call(double); + void writeHaplotypes(char *); + void writeGenotypes(char *); +}; + +#endif diff --git a/prepareGenFromBeagle4_modified20160601/old/data.cpp b/prepareGenFromBeagle4_modified20160601/old/data.cpp new file mode 100644 index 0000000..ec532a6 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/old/data.cpp @@ -0,0 +1,183 @@ +#include "classes.h" + +data::data() { +} + +data::~data() { + for (int i = 0 ; i < G.size() ; i ++) delete G[i]; + G.clear(); + for (int i = 0 ; i < C.size() ; i ++) delete C[i]; + C.clear(); +} + +void data::readLikelihoods (char * vcf) { + string buffer; + vector < string > tok, field, prob; + + cout << "Reading likelihoods from [" << vcf << "]" << endl; + int line = 0; + ifile fd(vcf); + while(getline(fd, buffer, '\n')) { + tok = sutils::tokenize(buffer, "\t"); + if (tok[0] == "#CHROM") { + for (int i = 9 ; i < tok.size() ; i ++) G.push_back(new genotype(tok[i], G.size())); + cout << " * #inds = " << G.size() << endl; + } else if (tok[0][0] != '#') { + M.push(new snp (atoi(tok[0].c_str()), tok[2], atoi(tok[1].c_str()), tok[3], tok[4], M.size())); + int idx_gl = -1; + field = sutils::tokenize(tok[8], ":"); + for(int f = 0 ; f < field.size() && idx_gl < 0 ; f++) if (field[f] == "GL") idx_gl = f; + if (idx_gl < 0) { cout << "No GL field in VCF file at line = " << line << endl; exit(1); } + for (int i = 9 ; i < tok.size() ; i ++) { + field = sutils::tokenize(tok[i], ":"); + prob = sutils::tokenize(field[idx_gl], ","); + double l0 = atof(prob[0].c_str()); + double l1 = atof(prob[1].c_str()); + double l2 = atof(prob[2].c_str()); + G[i - 9]->L.push_back(l0); + G[i - 9]->L.push_back(l1); + G[i - 9]->L.push_back(l2); + G[i - 9]->P.push_back(0.0); + G[i - 9]->P.push_back(0.0); + G[i - 9]->P.push_back(0.0); + G[i - 9]->G.push_back(-1); + G[i - 9]->h1.push_back(false); + G[i - 9]->h2.push_back(false); + } + } + + if (line % 1000 == 0) { cout << "\r" << line ;cout.flush();} + line ++; + } + cout << "\r * #sites = " << M.size() << endl; +} + +void data::readPosteriors (char * vcf) { + string buffer; + vector < string > tok, field, prob; + chunk * c = new chunk(vcf); + + cout << "Reading posteriors from [" << vcf << "]" << endl; + int line = 0; + ifile fd(vcf); + while(getline(fd, buffer, '\n')) { + sutils::tokenize(buffer, "\t"); + if (tok[0] == "#CHROM") { + for (int i = 9 ; i < tok.size() ; i ++) + if (G[i-9]->id != tok[i]) { cout << "Sample inconsistency like=" << G[i-9]->id << " post=" << tok[i] << endl;exit(1); } + c->switches = vector < bool > (G.size(), false); + cout << " * All " << G.size() << " samples found" << endl; + } else if (tok[0][0] != '#') { + snp * s = M.get(atoi(tok[1].c_str()), tok[3], tok[4]); + if (s == NULL) { cout << "Site unfound in likelihoods file! pos = " << atoi(tok[1].c_str()) << endl; exit(1); } + c->mappingS.push_back(s->idx); + c->H.push_back(vector < bool > (2 * G.size(), false)); + c->P.push_back(vector < float > (3 * G.size(), 0.0)); + for (int i = 9 ; i < tok.size() ; i ++) { + field = sutils::tokenize(tok[i], ":"); + prob = sutils::tokenize(field[2], ","); + c->H.back()[(i-9) * 2 + 0] = (tok[i][0] == '1'); + c->H.back()[(i-9) * 2 + 1] = (tok[i][2] == '1'); + c->P.back()[(i-9) * 3 + 0] = atof(prob[0].c_str()); + c->P.back()[(i-9) * 3 + 1] = atof(prob[1].c_str()); + c->P.back()[(i-9) * 3 + 2] = atof(prob[2].c_str()); + } + } + + if (line % 1000 == 0) { cout << "\r" << line ;cout.flush();} + line ++; + } + cout << "\r * #sites = " << c->mappingS.size() << endl; + C.push_back(c); +} + +void data::assemble() { + cout << "Sorting the " << C.size() << " chunks of data" << endl; + sort(C.begin(), C.end()); + for (int c = 1 ; c < C.size() ; c++) { + cout << "Assembling chunk " << c -1 << " witch chunk " << c << endl; + C[c]->assemble(*C[c-1]); + } + + cout << "Copying chunk genotype & haplotype data" << endl; + for (int c = 0 ; c < C.size() ; c++) { + for (int l = 0 ; l < C[c]->mappingS.size() ; l ++) { + if (C[c]->mappingS[l] >= 0) { + //copying posteriors + for (int i = 0 ; i < G.size() ; i ++) { + G[i]->P[3 * C[c]->mappingS[l] + 0] = C[c]->P[l][3 * i + 0]; + G[i]->P[3 * C[c]->mappingS[l] + 1] = C[c]->P[l][3 * i + 1]; + G[i]->P[3 * C[c]->mappingS[l] + 2] = C[c]->P[l][3 * i + 2]; + } + //copying haplotypes + for (int i = 0 ; i < G.size() ; i ++) { + if (!C[c]->switches[i]) { + G[i]->h1[C[c]->mappingS[l]] = C[c]->H[l][2 * i + 0]; + G[i]->h2[C[c]->mappingS[l]] = C[c]->H[l][2 * i + 1]; + } else { + G[i]->h1[C[c]->mappingS[l]] = C[c]->H[l][2 * i + 1]; + G[i]->h2[C[c]->mappingS[l]] = C[c]->H[l][2 * i + 0]; + } + } + } + } + } +} + +void data::call(double threshold) { + int n_called = 0; + int n_total = 0; + cout << "Calling genotypes from posteriors" << endl; + for (int i = 0 ; i < G.size() ; i ++) { + n_called += G[i]->call(threshold); + n_total += M.size(); + } + cout << " * #called = " << sutils::double2str(n_called * 100 / n_total, 1) << "%" << endl; +} + +void data::writeHaplotypes(char * hap) { + string fhap = string(hap) + ".hap.gz"; + string fsam = string(hap) + ".hap.sample"; + + cout << "Writing samples in [" << fsam << "]" << endl; + ofile fds (fhap); + fds << "ID_1 ID_2 missing" << endl << "0 0 0" << endl; + for (int i = 0 ; i < G.size() ; i ++) fds << G[i]->id << " " << G[i]->id << " 0" << endl; + fds.close(); + + cout << "Writing haplotypes in [" << fhap << "]" << endl; + ofile fdh (fsam); + for (int s = 0 ; s < M.size() ; s ++) { + fdh << M.vec_snp[s]->chr << " " << M.vec_snp[s]->id << " " << M.vec_snp[s]->pos << " " << M.vec_snp[s]->a0 << " " << M.vec_snp[s]->a1; + for (int i = 0 ; i < G.size() ; i ++) fdh << " " << G[i]->h1[s] << " " << G[i]->h2[s]; + fdh << endl; + } + fdh.close(); +} + +void data::writeGenotypes(char * gen) { + string fgen = string(gen) + ".gen.gz"; + string fsam = string(gen) + ".gen.sample"; + + cout << "Writing samples in [" << fsam << "]" << endl; + ofile fds (fsam); + fds << "ID_1 ID_2 missing" << endl << "0 0 0" << endl; + for (int i = 0 ; i < G.size() ; i ++) fds << G[i]->id << " " << G[i]->id << " 0" << endl; + fds.close(); + + cout << "Writing genotypes in [" << fgen << "]" << endl; + ofile fdg (fgen); + for (int s = 0 ; s < M.size() ; s ++) { + fdg << M.vec_snp[s]->chr << " " << M.vec_snp[s]->id << " " << M.vec_snp[s]->pos << " " << M.vec_snp[s]->a0 << " " << M.vec_snp[s]->a1; + for (int i = 0 ; i < G.size() ; i ++) { + switch (G[i]->G[s]) { + case 0: fdg << " 1 0 0"; break; + case 1: fdg << " 0 1 0"; break; + case 2: fdg << " 0 0 1"; break; + default: fdg << " " << G[i]->P[3 * i + 0] << " " << G[i]->P[3 * i + 1] << " " << G[i]->P[3 * i + 2]; break; + } + } + fdg << endl; + } + fdg.close(); +} diff --git a/prepareGenFromBeagle4_modified20160601/old/prepareGenFromBeagle4.cpp b/prepareGenFromBeagle4_modified20160601/old/prepareGenFromBeagle4.cpp new file mode 100644 index 0000000..eadad29 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/old/prepareGenFromBeagle4.cpp @@ -0,0 +1,55 @@ +#include "classes.h" + +int main(int argc, char ** argv) { + string buffer; + vector < string > tok; + + char * likelihoods = NULL; + vector < char * > posteriors; + char * haplotypes = NULL; + char * genotypes = NULL; + double threshold = 0.995; + + //Reading parameters + for (int a =1 ; a < argc ; a++) { + if (strcmp(argv[a], "--likelihoods") == 0) {likelihoods = argv[a+1];} + if (strcmp(argv[a], "--haplotypes") == 0) {haplotypes = argv[a+1];} + if (strcmp(argv[a], "--genotypes") == 0) {genotypes = argv[a+1];} + if (strcmp(argv[a], "--threshold") == 0) {threshold= atoi(argv[a+1]);} + if (strcmp(argv[a], "--posteriors") == 0) { + int cpt = a + 1; + while (cpt >= 0) { + if (argv[cpt][0] == '-') cpt = -1; + else { + posteriors.push_back(argv[cpt]); + cpt ++; + } + } + } + } + + //Checking parameters + if (likelihoods == NULL) { cout << "Missing --likelihoods arguments!" << endl; exit(1);} + else cout << "Likelihoods:\t[" << likelihoods << "]" << endl; + + if (posteriors.size() == 0) { cout << "Missing --posteriors arguments!" << endl; exit(1);} + else { + cout << "Posteriors:\n" << endl; + for (int p = 0 ; p < posteriors.size() ; p++) cout << "\t[" << posteriors[p] << "]" << endl; + } + + if (haplotypes == NULL) { cout << "Missing --haplotypes arguments!" << endl; exit(1);} + else cout << "Haplotypes:\t[" << haplotypes << "]" << endl; + if (genotypes == NULL) { cout << "Missing --haplotypes arguments!" << endl; exit(1);} + else cout << "Genotypes:\t[" << genotypes << "]" << endl; + cout << "Threshold:\t" << threshold << endl; + + cout << endl; + data D; + D.readLikelihoods(likelihoods); + for (int p = 0 ; p < posteriors.size() ; p++) D.readPosteriors(posteriors[p]); + D.assemble(); + D.call(threshold); + D.writeHaplotypes(haplotypes); + D.writeGenotypes(genotypes); +} diff --git a/prepareGenFromBeagle4_modified20160601/old/utils.cpp b/prepareGenFromBeagle4_modified20160601/old/utils.cpp new file mode 100644 index 0000000..18c63b4 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/old/utils.cpp @@ -0,0 +1,646 @@ +#include "utils.h" + +/******************************************************/ +/* UTILS STATISTICS */ +/******************************************************/ +long seed = -123456789; + +namespace putils { + void initRandom(long s) { + if (s == -1) seed = - time(NULL); + else seed = - s; + } + + double getRandom() { + int j; + long k; + static long iy=0; + static long iv[NTAB]; + double temp; + if (seed <= 0 || !iy) { + if (-(seed) < 1) seed=1; + else seed = -(seed); + for (j=NTAB+7;j>=0;j--) { + k=(seed)/IQ; + seed=IA*(seed-k*IQ)-IR*k; + if (seed < 0) seed += IM; + if (j < NTAB) iv[j] = seed; + } + iy=iv[0]; + } + k=(seed)/IQ; + seed=IA*(seed-k*IQ)-IR*k; + if (seed < 0) seed += IM; + j=iy/NDIV; + iy=iv[j]; + iv[j] = seed; + if ((temp=AM*iy) > RNMX) return RNMX; + else return temp; + } + + long getSeed() { + return seed; + } + + int getRandom(int n) { + return (int)floor(getRandom() * n); + } + + void normalise(vector < double > & v) { + double sum = 0.0; + for (int i = 0 ; i < v.size() ; i++) sum += v[i]; + if (sum != 0.0) for (int i = 0 ; i < v.size() ; i++) v[i] /= sum; + } + + int sample(vector< double > & v, double sum) { + double csum = v[0]; + double u = getRandom() * sum; + for (int i = 0; i < v.size() - 1; ++i) { + if ( u < csum ) return i; + csum += v[i+1]; + } + return v.size() - 1; + } + + double median (vector < double> & V) { + sort(V.begin(), V.end()); + return V[V.size() / 2]; + } + + double mean (vector < double> & V) { + double m = 0.0; + for (int i = 0 ; i < V.size() ; i ++) m += V[i]; + return m / V.size(); + } +}; + +/******************************************************/ +/* UTILS ALGORITHM */ +/******************************************************/ +namespace autils { + int max(vector < double > & v) { + double max = -1e300; + int index_max = 0; + for (int i = 0 ; i < v.size() ; i ++) + if (v[i] > max) { + max = v[i]; + index_max = i; + } + return index_max; + } + + int max(vector < int > & v) { + int max = -1000000000; + int index_max = 0; + for (int i = 0 ; i < v.size() ; i ++) + if (v[i] > max) { + max = v[i]; + index_max = i; + } + return index_max; + } + + int count(vector < bool > &v) { + int c=0; + for (int i = 0 ; i < v.size() ; i ++) if (v[i]) c++; + return c; + } + + void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB) { + if (B.size() < 2 * min || B.size() == 2) { + BB.push_back(B); + return; + } + + int l = putils::getRandom(B.size() - 2 * min) + min; + vector < vector < int > > LB = vector < vector < int > > (B.begin(), B.begin() + l + 1); + vector < vector < int > > RB = vector < vector < int > > (B.begin() + l - 1, B.end()); + vector < vector < vector < int > > > LBB; + vector < vector < vector < int > > > RBB; + decompose(min, LB, LBB); + decompose(min, RB, RBB); + BB = LBB; + BB.insert(BB.end(), RBB.begin(), RBB.end()); + } + + int checkDuo (int pa1, int pa2, int ca1, int ca2) { + if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return -1; } + if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return 1; } + if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 1; } + if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 1; } + if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return 1; } + if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return -1; } + if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 0; } + return 0; + } + + int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2) { + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 0; } + return 0; + } + + + int solveDuoF(int & fa1, int & fa2, int & ca1, int & ca2) { + int gf = fa1 + fa2; + int gc = ca1 + ca2; + + if (gf == 0 && gc == 2) { return -1;}; + if (gf == 2 && gc == 0) { return -1;}; + + if (gf == 0 && gc == 1) {fa1 = 0; fa2 = 0; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gc == 0) {fa1 = 0; fa2 = 1; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gc == 2) {fa1 = 1; fa2 = 0; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 2 && gc == 1) {fa1 = 1; fa2 = 1; ca1 = 1; ca2 = 0; return 1;}; + return 0; + } + + int solveDuoM(int & ma1, int & ma2, int & ca1, int & ca2) { + int gm = ma1 + ma2; + int gc = ca1 + ca2; + + if (gm == 0 && gc == 2) { return -1; }; + if (gm == 2 && gc == 0) { return -1; }; + + if (gm == 0 && gc == 1) {ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gm == 1 && gc == 0) {ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gm == 1 && gc == 2) {ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gm == 2 && gc == 1) {ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + return 0; + } + + + int solveTrio (int & fa1, int & fa2, int & ma1, int & ma2, int & ca1, int & ca2) { + int gf = fa1 + fa2; + int gm = ma1 + ma2; + int gc = ca1 + ca2; + + if (gf == 0 && gm == 0 && gc == 1) { return -1;}; + if (gf == 0 && gm == 0 && gc == 2) { return -1;}; + if (gf == 0 && gm == 1 && gc == 2) { return -1;}; + if (gf == 0 && gm == 2 && gc == 0) { return -1;}; + if (gf == 0 && gm == 2 && gc == 2) { return -1;}; + if (gf == 1 && gm == 0 && gc == 2) { return -1;}; + if (gf == 1 && gm == 2 && gc == 0) { return -1;}; + if (gf == 2 && gm == 0 && gc == 0) { return -1;}; + if (gf == 2 && gm == 0 && gc == 2) { return -1;}; + if (gf == 2 && gm == 1 && gc == 0) { return -1;}; + if (gf == 2 && gm == 2 && gc == 0) { return -1;}; + if (gf == 2 && gm == 2 && gc == 1) { return -1;}; + + if (gf == 0 && gm == 1 && gc == 0) {fa1 = 0; fa2 = 0; ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 0 && gm == 1 && gc == 1) {fa1 = 0; fa2 = 0; ma1 = 0; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 0 && gm == 2 && gc == 1) {fa1 = 0; fa2 = 0; ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gm == 0 && gc == 0) {fa1 = 0; fa2 = 1; ma1 = 0; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gm == 0 && gc == 1) {fa1 = 1; fa2 = 0; ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 1 && gm == 1 && gc == 0) {fa1 = 0; fa2 = 1; ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gm == 1 && gc == 2) {fa1 = 1; fa2 = 0; ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 1 && gm == 2 && gc == 1) {fa1 = 0; fa2 = 1; ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gm == 2 && gc == 2) {fa1 = 1; fa2 = 0; ma1 = 1; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 2 && gm == 0 && gc == 1) {fa1 = 1; fa2 = 1; ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 2 && gm == 1 && gc == 1) {fa1 = 1; fa2 = 1; ma1 = 1; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 2 && gm == 1 && gc == 2) {fa1 = 1; fa2 = 1; ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + return 0; + } + + int switch_error(vector < bool > & t1, vector < bool > & t2, vector < bool > & m12, vector < bool > & h1, vector < bool > & h2) { + int error = 0; + + //cerr << utils::bool2str(t1) << endl; + //cerr << utils::bool2str(t2) << endl; + //cerr << utils::bool2str(h1) << endl; + //cerr << utils::bool2str(h2) << endl; + + assert(t1.size() == h1.size()); + assert(t1.size() == h2.size()); + assert(t1.size() == t2.size()); + assert(t1.size() == m12.size()); + + int locus = -1; + + for (int l = 0 ; l < t1.size() && locus < 0 ; l ++) if (t1[l] != t2[l] && m12[l]) locus = l; + + if (locus < 0) return -1; + + bool swap = (t1[locus] == h2[locus]); + for (int l = locus + 1 ; l < t1.size() ; l ++) + if (m12[l]) { + if (!swap && t1[l] != h1[l]) { + swap = true; + error++; + } else if (swap && t1[l] != h2[l]) { + swap = false; + error++; + } + } + + return error; + } + +}; + +/******************************************************/ +/* UTILS STRING */ +/******************************************************/ +namespace sutils { + vector tokenize(string & str,string d) { + vector tokens; + string::size_type lastPos = str.find_first_not_of(d, 0); + string::size_type pos = str.find_first_of(d, lastPos); + while (string::npos != pos || string::npos != lastPos) { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(d, pos); + pos = str.find_first_of(d, lastPos); + } + return tokens; + } + + vector tokenize(string & str,string d, int n) { + vector tokens; + int cpt = 0; + string::size_type lastPos = str.find_first_not_of(d, 0); + string::size_type pos = str.find_first_of(d, lastPos); + while ((string::npos != pos || string::npos != lastPos) && cpt < n) { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(d, pos); + pos = str.find_first_of(d, lastPos); + cpt++; + } + return tokens; + } + + string int2str(int n) { + ostringstream s2( stringstream::out ); + s2 << n; + return s2.str(); + } + + string int2str(vector < int > & v) { + ostringstream s2( stringstream::out ); + for (int l = 0 ; l < v.size() ; l++) { + if (v[l] < 0) s2 << "-"; + else s2 << v[l] ; + } + return s2.str(); + } + + string long2str(long int n) { + ostringstream s2( stringstream::out ); + s2 << n; + return s2.str(); + } + + string double2str(double n, int prc) { + ostringstream s2; + s2 << setiosflags( ios::fixed ); + if ( prc > 0 ) s2.precision(prc); + s2 << n; + return s2.str(); + } + + string double2str(vector < double > &v, int prc) { + ostringstream s2; + s2 << setiosflags( ios::fixed ); + if ( prc >= 0 ) s2.precision(prc); + for (int l = 0 ; l < v.size() ; l++) { + s2 << v[l] << " "; + } + return s2.str(); + } + + string bool2str(vector & v) { + ostringstream s2( stringstream::out ); + for (int l = 0 ; l < v.size() ; l++) { + if (v[l]) s2 << "1"; + else s2 << "0"; + } + return s2.str(); + } + + string date2str(time_t * t, string format) { + struct tm * timeinfo = localtime(t); + char buffer[128]; + strftime(buffer, 128, format.c_str(), timeinfo); + ostringstream s2( stringstream::out ); + s2 << buffer; + return s2.str(); + } +}; + +/******************************************************/ +/* UTILS FILE */ +/******************************************************/ +namespace futils { + bool isFile(string f) { + ifstream inp; + inp.open(f.c_str(), ifstream::in); + if(inp.fail()) { + inp.clear(ios::failbit); + inp.close(); + return false; + } + inp.close(); + return true; + } + + bool createFile(string f) { + ofstream out; + out.open(f.c_str(), ofstream::out); + if(out.fail()) { + out.clear(ios::failbit); + out.close(); + return false; + } + out.close(); + return true; + } + + string extensionFile(string & filename) { + if (filename.find_last_of(".") != string::npos) + return filename.substr(filename.find_last_of(".") + 1); + return ""; + } + + + void bool2binary(vector < bool > & V, ostream &fd) { + int nb = V.size(); + fd.write((char*)&nb, sizeof(int)); + int cpt_byte = 0; + int cpt_bin = 0; + int nb_byte = (int)ceil( (V.size() * 1.0) / 8); + while (cpt_byte < nb_byte) { + bitset<8> byte_bitset; + for (int l = 0; l < 8 && cpt_bin < V.size() ; l++) { + byte_bitset[l] = V[cpt_bin]; + cpt_bin ++; + } + char byte_char = (char)byte_bitset.to_ulong(); + fd.write(&byte_char, 1); + cpt_byte++; + } + } + + bool binary2bool(vector < bool > & V, istream & fd) { + int nb; + fd.read((char*)&nb, sizeof(int)); + if (!fd) return false; + int cpt_byte = 0; + int cpt_bin = 0; + int nb_byte = (int)ceil( (nb * 1.0) / 8); + V = vector < bool >(nb); + while (cpt_byte < nb_byte) { + char byte_char; + fd.read(&byte_char, 1); + if (!fd) return false; + bitset<8> byte_bitset = byte_char; + for (int l = 0; l < 8 && cpt_bin < nb ; l++) { + V[cpt_bin] = byte_bitset[l]; + cpt_bin++; + } + cpt_byte++; + } + return true; + } +}; + +/******************************************************/ +/* INPUT FILE */ +/******************************************************/ +ifile::ifile() { +} + +ifile::ifile(string filename , bool binary) { + open(filename, binary); +} + +ifile::~ifile() { + close(); +} + +string ifile::name() { + return file; +} + +bool ifile::open(string filename, bool binary) { + file = filename; + string ext = futils::extensionFile(filename); + if (ext == "gz") { + fd.open(file.c_str(), ios::in | ios::binary); + push(gzip_decompressor()); + } else if (ext == "bz2") { + fd.open(file.c_str(), ios::in | ios::binary); + push(bzip2_decompressor()); + } else if (binary) { + fd.open(file.c_str(), ios::in | ios::binary); + } else { + fd.open(file.c_str()); + } + if (fd.fail()) return false; + push(fd); + return true; +} + +void ifile::close() { + if (!empty()) reset(); + fd.close(); +} + +/******************************************************/ +/* OUTPUT FILE */ +/******************************************************/ +ofile::ofile() { +} + +ofile::ofile(string filename , bool binary) { + open(filename, binary); +} + +ofile::~ofile() { + close(); +} + +string ofile::name() { + return file; +} + +bool ofile::open(string filename, bool binary) { + file = filename; + string ext = futils::extensionFile(filename); + if (ext == "gz") { + fd.open(file.c_str(), ios::out | ios::binary); + push(gzip_compressor()); + } else if (ext == "bz2") { + fd.open(file.c_str(), ios::out | ios::binary); + push(bzip2_compressor()); + } else if (binary) { + fd.open(file.c_str(), ios::out | ios::binary); + } else { + fd.open(file.c_str()); + } + if (fd.fail()) return false; + push(fd); + return true; +} + +void ofile::close() { + if (!empty()) reset(); + fd.close(); +} + +/******************************************************/ +/* LOG FILE */ +/******************************************************/ +lfile::lfile() { + verboseC = true; + verboseL = true; +} + +lfile::~lfile() { + close(); +} + +string lfile::name() { + return file; +} + +bool lfile::open(string filename) { + file = filename; + if (futils::extensionFile(file) != "log") file += ".log"; + fd.open(file.c_str()); + if (fd.fail()) return false; + return true; +} + +void lfile::close() { + fd.close(); +} + +string lfile::getPrefix() { + return file.substr(0, file.find_last_of(".")); +} + +void lfile::muteL() { + verboseL = false; +} + +void lfile::unmuteL() { + verboseL = true; +} + +void lfile::muteC() { + verboseC = false; +} + +void lfile::unmuteC() { + verboseC = true; +} + + +void lfile::print(string s) { + if (verboseL) { fd << s; fd.flush(); } + if (verboseC) { cerr << s; cerr.flush(); } +} + +void lfile::printC(string s) { + if (verboseC) { cerr << s; cerr.flush(); } +} + +void lfile::printL(string s) { + if (verboseL) { fd << s; fd.flush(); } +} + +void lfile::println(string s) { + if (verboseL) { fd << s << endl; } + if (verboseC) { cerr << s << endl; } +} + +void lfile::printlnC(string s) { + if (verboseC) { cerr << s << endl; } +} + +void lfile::printlnL(string s) { + if (verboseL) { fd << s << endl; } +} + +void lfile::warning(string s) { + cerr << "\33[33mWARNING:\33[0m " << s << endl; + if (verboseL) fd << "WARNING: " << s << endl; +} + +void lfile::error(string s) { + cerr << "\33[33mERROR:\33[0m " << s << endl; + if (verboseL) fd << "ERROR: " << s << endl; + close(); + exit(1); +} diff --git a/prepareGenFromBeagle4_modified20160601/old/utils.h b/prepareGenFromBeagle4_modified20160601/old/utils.h new file mode 100644 index 0000000..6fdf61a --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/old/utils.h @@ -0,0 +1,165 @@ +#ifndef _UTILS_H +#define _UTILS_H + +#define IA 16807 +#define IM 2147483647 +#define AM (1.0/IM) +#define IQ 127773 +#define IR 2836 +#define NTAB 32 +#define NDIV (1+(IM-1)/NTAB) +#define EPS 1.2e-7 +#define RNMX (1.0-EPS) +#define PI 3.14159265358979323846 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace boost::iostreams; + +/******************************************************/ +/* UTILS STATISTICS */ +/******************************************************/ +namespace putils { + void initRandom(long s); + double getRandom(); + int getRandom(int); + long getSeed(); + void normalise(vector < double > & v); + int sample(vector< double > & v, double sum); + double median (vector < double> &); + double mean (vector < double> &); +}; + +/******************************************************/ +/* UTILS ALGORITHM */ +/******************************************************/ +namespace autils { + int max(vector < double > & v); + int max(vector < int > & v); + int count(vector < bool > &); + void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB); + int checkDuo (int pa1, int pa2, int ca1, int ca2); + int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2) ; + + int solveDuoF(int & fa1, int & fa2, int & ca1, int & ca2); + int solveDuoM(int & ma1, int & ma2, int & ca1, int & ca2); + int solveTrio (int & fa1, int & fa2, int & ma1, int & ma2, int & ca1, int & ca2); + int switch_error(vector < bool > & t1, vector < bool > & t2, vector < bool > & m12, vector < bool > & h1, vector < bool > & h2); + + +}; + +/******************************************************/ +/* UTILS STRING */ +/******************************************************/ +namespace sutils { + vector tokenize(string & str,string d); + vector tokenize(string & str,string d, int n); + string int2str(int n); + string int2str(vector < int > & v); + string long2str(long int n); + string double2str(double n, int prc = 4); + string double2str(vector < double > &v, int prc = 4); + string bool2str(vector & v); + string date2str(time_t * t, string format); +}; + +/******************************************************/ +/* UTILS FILE */ +/******************************************************/ +namespace futils { + bool isFile(string f); + bool createFile(string f); + string extensionFile(string & filename); + void bool2binary(vector < bool > & V, ostream &fd); + bool binary2bool(vector < bool > & V, istream & fd); +}; + +/******************************************************/ +/* INPUT FILE */ +/******************************************************/ +class ifile : public filtering_istream { +private: + string file; + ifstream fd; + +public: + ifile(); + ifile(string filename , bool binary = false); + ~ifile(); + string name(); + bool open(string filename, bool binary = false); + void close(); +}; + +/******************************************************/ +/* OUTPUT FILE */ +/******************************************************/ +class ofile : public filtering_ostream { +private: + string file; + ofstream fd; + +public: + ofile(); + ofile(string filename , bool binary = false); + ~ofile(); + string name(); + bool open(string filename, bool binary = false); + void close(); +}; + +/******************************************************/ +/* LOG FILE */ +/******************************************************/ +class lfile { +private: + string file; + ofstream fd; + bool verboseC; + bool verboseL; + +public: + lfile(); + ~lfile(); + string name(); + bool open(string filename = "file.log"); + void close(); + string getPrefix(); + void muteL(); + void unmuteL(); + void muteC(); + void unmuteC(); + void print(string s); + void printC(string s); + void printL(string s); + void println(string s); + void printlnC(string s); + void printlnL(string s); + void warning(string s); + void error(string s); +}; + +#endif diff --git a/prepareGenFromBeagle4_modified20160601/src/chunk.cpp b/prepareGenFromBeagle4_modified20160601/src/chunk.cpp new file mode 100644 index 0000000..0e21fc8 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/src/chunk.cpp @@ -0,0 +1,88 @@ +#include "classes.h" + + +int hamming (vector < bool > & h1, vector < bool > & h2) { + assert(h1.size() == h2.size()); + int c = 0; + for(int l = 0 ; l < h1.size(); l++) if (h1[l] != h2[l]) c++; + return c; +} + +int discordance (vector < bool > & h11, vector < bool > & h12, vector < bool > & h21, vector < bool > & h22) { + int c = 0; + for(int l = 0 ; l < h11.size(); l++) { + int g1 = h11[l] + h12[l]; + int g2 = h21[l] + h22[l]; + if (g1 != g2) c++; + } + return c; +} + + +chunk::chunk(char * id) { + this->id = string(id); +} + +chunk::~chunk() { + mappingS.clear(); +} + +void chunk::assemble(chunk * pc) { + //mapping starting point of the overlap + int b1 = -1; + int e1 = pc->mappingS.size() - 1; + for (int l = 0 ; l < pc->mappingS.size() && b1 < 0 ; l ++ ) if (pc->mappingS[l] == mappingS[0]) b1 = l; + if (b1 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc->id << "], code 1" << endl; exit(1);} + + //mapping starting point of the overlap + int b2 = 0; + int e2 = -1; + for (int l = 0 ; l < mappingS.size() && e2 < 0 ; l ++ ) if (mappingS[l] == pc->mappingS.back()) e2 = l; + if (e2 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc->id << "], code 2" << endl; exit(1);} + + //overlap size + int osize = e1 - b1 + 1; + int osize2 = e2 - b2 + 1; + if (osize != osize2) {cout << "Overlaps are differents between [" << id << "] does not overlap chunk [" << pc->id << "] o1=" << osize << " o2=" << osize2 <chr = chr; + this->id = id; + this->pos = pos; + this->a0 = a0; + this->a1 = a1; + this->idx = idx; + chunk_id = -1; + chunk_idx = -1; + } +}; + + +class snp_set { +public: + vector < snp * > vec_snp; + multimap < int, snp *> map_snp; + + //C/D + snp_set() {}; + ~snp_set() {}; + + //M + int size() { + return vec_snp.size(); + } + + void push(snp * s) { + vec_snp.push_back(s); + map_snp.insert ( std::pair< int, snp * >( s->pos , s)); + } + + snp * get(int p, string & r0, string & r1) { + pair < multimap < int, snp *>::iterator, multimap < int, snp *>::iterator > seqM = map_snp.equal_range(p); + for (multimap < int, snp *>::iterator itM = seqM.first; itM != seqM.second; ++itM) + if (r0 == itM->second->a0 && r1 == itM->second->a1) return itM->second; + return NULL; + } +}; + +class chunk { +public: + string id; + vector < int > mappingS; + vector < vector < bool > > H; + vector < bool > switches; + + chunk(char *) ; + ~chunk(); + + void assemble(chunk *); +}; + +class data { +public: + string id; + double threshold; + snp_set M; + vector < chunk * > C; + vector < string > I; + vector < bool > males; + + data(string, double threshold); + ~data(); + + void readLikelihoodsMap(); + void readHaploidGuys(char *); + void readPosteriorsMapAndHap(char *); + void assembleMap(); + void writeHaplotypes(char *); + void writeGenotypes(char *); +}; + +#endif diff --git a/prepareGenFromBeagle4_modified20160601/src/data.cpp b/prepareGenFromBeagle4_modified20160601/src/data.cpp new file mode 100644 index 0000000..731c87b --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/src/data.cpp @@ -0,0 +1,305 @@ +#include "classes.h" +#include + +data::data(string id, double threshold) { + this->id = id; + this->threshold = threshold; +} + +data::~data() { + for (int i = 0 ; i < C.size() ; i ++) delete C[i]; + C.clear(); +} + +void data::readLikelihoodsMap () { + string buffer; + vector < string > tok, field, prob; + + cout << "Reading Likelihoods Map in [" << id << "]" << endl; + int line = 0; + ifile fd(id); + while(getline(fd, buffer, '\n')) { + if (buffer[0] == '#' && buffer[1] == 'C' && buffer[2] == 'H' && buffer[3] == 'R' && buffer[4] == 'O') { + tok = sutils::tokenize(buffer, "\t"); + for (int i = 9 ; i < tok.size() ; i ++) { + I.push_back(tok[i]); + males.push_back(false); + } + cout << " * #inds = " << I.size() << endl; + } else if (buffer[0] != '#') { + tok = sutils::tokenize(buffer, "\t", 5); + vector < string > alleles_alt = sutils::tokenize(tok[4], ","); + if (alleles_alt[0] != "." && alleles_alt[0] != "0" && alleles_alt.size() == 1) + M.push(new snp (atoi(tok[0].c_str()), tok[2], atoi(tok[1].c_str()), tok[3], tok[4], M.size())); + } + + if (line > 0 && line % 1000 == 0) { cout << "\r" << line ;cout.flush();} + line ++; + } + cout << "\r * #sites = " << M.size() << endl; +} + +void data::readHaploidGuys(char * haploid) { + string buffer; + vector < string > tok; + ifile fd(haploid); + int n_set = 0; + while(getline(fd, buffer, '\n')) { + bool done = false; + for (int i = 0 ; i < I.size() && !done ; i ++) if (I[i] == buffer) { + males[i] = true; + done = true; + n_set++; + } + } + cerr << " * #males=" << n_set << endl; + fd.close(); +} + +void data::readPosteriorsMapAndHap (char * vcf) { + string buffer; + vector < string > tok, field, prob; + chunk * c = new chunk(vcf); + + cout << "Reading Posteriors Map in [" << vcf << "]"; + ifile fd(vcf); + while(getline(fd, buffer)) { + if (buffer[0] == '#' && buffer[1] == 'C' && buffer[2] == 'H' && buffer[3] == 'R' && buffer[4] == 'O') { + tok = sutils::tokenize(buffer, "\t"); + for (int i = 9 ; i < tok.size() ; i ++) if (I[i-9] != tok[i]) { cout << "Sample inconsistency like=" << I[i-9] << " post=" << tok[i] << endl;exit(1); } + c->switches = vector < bool > (I.size(), false); + } else if (buffer[0] != '#') { + tok = sutils::tokenize(buffer, "\t", 5); + vector < string > alleles_alt = sutils::tokenize(tok[4], ","); + if (alleles_alt[0] != "." && alleles_alt[0] != "0" && alleles_alt.size() == 1) { + snp * s = M.get(atoi(tok[1].c_str()), tok[3], tok[4]); + if (s == NULL) { cout << "\nSite unfound in likelihoods file! pos = " << atoi(tok[1].c_str()) << endl; exit(1); } + c->mappingS.push_back(s->idx); + + int n_tok = 9 + I.size(); + int n_cha = buffer.size(); + int i_cha = 0; + c->H.push_back(vector < bool > (2 * I.size(), false)); + for (int t = 0 ; t < n_tok ; t ++) { + if (t >= 9) { + if (buffer[i_cha+0] == '0') c->H.back()[(t-9) * 2 + 0] = false; + else c->H.back()[(t-9) * 2 + 0] = true; + if (buffer[i_cha+2] == '0') c->H.back()[(t-9) * 2 + 1] = false; + else c->H.back()[(t-9) * 2 + 1] = true; + } + for (; buffer[i_cha] != '\t' && i_cha < n_cha ; i_cha ++); + i_cha ++; + } + } + } + } + cout << "\t[#sites = " << c->mappingS.size() << ", first=" << M.vec_snp[c->mappingS[0]]->pos << ", last =" << M.vec_snp[c->mappingS.back()]->pos << "]" << endl; + C.push_back(c); +} + +struct less_than_chunk { + inline bool operator() (const chunk * c1, const chunk * c2) { + return (c1->mappingS[0] < c2->mappingS[0]); + } +}; + +void data::assembleMap() { + cout << "Sorting the " << C.size() << " chunks of data" << endl; + sort(C.begin(), C.end(), less_than_chunk()); + for (int c = 1 ; c < C.size() ; c++) { + cout << "Assembling [" << C[c -1]->id << "] with [" << C[c]->id << "]" << endl; + C[c]->assemble(C[c-1]); + } + cout << "Mapping sites to chunks" << endl; + for (int c = 0 ; c < C.size() ; c++) { + for (int l = 0 ; l < C[c]->mappingS.size() ; l ++) { + if (C[c]->mappingS[l] >= 0) { + M.vec_snp[C[c]->mappingS[l]]->chunk_id = c; + M.vec_snp[C[c]->mappingS[l]]->chunk_idx = l; + } + } + } + cout << "Checking mapping sites to chunks" << endl; + for (int l = 0 ; l < M.vec_snp.size() ; l ++) + if (M.vec_snp[l]->chunk_id < 0) + cout << "Site " << l << " with pos=" << M.vec_snp[l]->pos << " is unmapped" << endl; +} + + +void data::writeHaplotypes(char * output) { + string fhap = string(output) + ".hap.gz"; + string fsam = string(output) + ".hap.sample"; + + cout << "Writing samples in [" << fsam << "]" << endl; + ofile fds (fsam); + fds << "ID_1 ID_2 missing" << endl << "0 0 0" << endl; + for (int i = 0 ; i < I.size() ; i ++) fds << I[i] << " " << I[i] << " 0" << endl; + fds.close(); + + cout << "Writing haplotypes in [" << fhap << "]" << endl; + ofile fdh (fhap); + for (int s = 0 ; s < M.size() ; s ++) { + fdh << M.vec_snp[s]->chr << " " << M.vec_snp[s]->id << " " << M.vec_snp[s]->pos << " " << M.vec_snp[s]->a0 << " " << M.vec_snp[s]->a1; + for (int i = 0 ; i < I.size() ; i ++) { + int chunk_id = M.vec_snp[s]->chunk_id; + int chunk_idx = M.vec_snp[s]->chunk_idx; + if (C[chunk_id]->switches[i]) fdh << " " << C[chunk_id]->H[chunk_idx][2 * i + 1] << " " << C[chunk_id]->H[chunk_idx][2 * i + 0]; + else fdh << " " << C[chunk_id]->H[chunk_idx][2 * i + 0] << " " << C[chunk_id]->H[chunk_idx][2 * i + 1]; + } + fdh << endl; + } + fdh.close(); +} + +void data::writeGenotypes(char * output) { + string fgen = string(output) + ".gen.gz"; + string fsam = string(output) + ".gen.sample"; + + cout << "Writing samples in [" << fsam << "]" << endl; + ofile fds (fsam); + fds << "ID_1 ID_2 missing" << endl << "0 0 0" << endl; + for (int i = 0 ; i < I.size() ; i ++) fds << I[i] << " " << I[i] << " 0" << endl; + fds.close(); + + cout << "Writing genotypes in [" << fgen << "]" << endl; + string like_buffer = "#"; + int prev_chunk = -1; + long geno_post = 0; + long geno_like = 0; + + ofile fdg (fgen); + ifile fdl (id); + ifile fdc; + + for (int s = 0 ; s < M.size();) { + getline(fdl, like_buffer, '\n'); + if (like_buffer[0] != '#') { + vector < string > like_tok = sutils::tokenize(like_buffer, "\t"); + vector < string > alleles_alt = sutils::tokenize(like_tok[4], ","); + +//cout << like_tok[1] << "\n"; + if (alleles_alt[0] != "0" && alleles_alt[0] != "." && alleles_alt.size() == 1) { + //get current site + snp * site = M.vec_snp[s]; + + //get corresponding chunk + int curr_chunk = site->chunk_id; + int curr_chunk_idx = site->chunk_idx; + + //if chunk not opened, open it + string chunk_buffer = "#"; + if (curr_chunk != prev_chunk) { + if (prev_chunk >= 0) fdc.close(); + fdc.open(C[curr_chunk]->id, false); + prev_chunk = curr_chunk; + } + + //parse chunk to relevant site + bool done = false; + while (! done) { + getline(fdc, chunk_buffer, '\n'); + vector < string > chunk_tok2 = sutils::tokenize(chunk_buffer, "\t", 5); + if (chunk_buffer[0] != '#' && atoi(chunk_tok2[1].c_str()) == site->pos && chunk_tok2[3] == site->a0 && chunk_tok2[4] == site->a1) done = true; + } + + //tokenize line of chunk + vector < string > chunk_tok = sutils::tokenize(chunk_buffer, "\t"); + + assert(chunk_tok[1] == like_tok[1]); + assert(chunk_tok[3] == like_tok[3]); + assert(chunk_tok[4] == like_tok[4]); + assert(chunk_tok.size() == like_tok.size()); + assert(atoi(like_tok[1].c_str()) == site->pos); + assert(like_tok[3] == site->a0); + assert(like_tok[4] == site->a1); + + //find GL field in like + int like_idx_gl = -1; + vector < string > like_field = sutils::tokenize(like_tok[8], ":"); + for(int f = 0 ; f < like_field.size() && like_idx_gl < 0 ; f++) if (like_field[f] == "GL") like_idx_gl = f; + assert(like_idx_gl >= 0); + + //find GP field in chunk + int chunk_idx_gp = -1; + vector < string > chunk_field = sutils::tokenize(chunk_tok[8], ":"); + for(int f = 0 ; f < chunk_field.size() && chunk_idx_gp < 0 ; f++) if (chunk_field[f] == "GP") chunk_idx_gp = f; + assert(chunk_idx_gp >= 0); + + //write site to output + fdg << M.vec_snp[s]->chr << " " << M.vec_snp[s]->id << " " << M.vec_snp[s]->pos << " " << M.vec_snp[s]->a0 << " " << M.vec_snp[s]->a1; + + //parse data + for (int i = 9 ; i < like_tok.size() ; i ++) { + chunk_field = sutils::tokenize(chunk_tok[i], ":"); + + vector < string > chunk_prob = sutils::tokenize(chunk_field[chunk_idx_gp], ","); + + double p0 = atof(chunk_prob[0].c_str()); + double p1 = atof(chunk_prob[1].c_str()); + double p2 = atof(chunk_prob[2].c_str()); + + char str[80]; + int g = -1; + threshold = 0.995; + +// printf("%f %f",p2,threshold); +//puts(str); + if (p0 >= threshold) g = 0; + if (p1 >= threshold && !males[i-9]) g = 1; + if (p2 >= threshold) g = 2; + + + if (g >= 0) { + switch (g) { + case 0 : fdg << " 1 0 0"; break; + case 1 : fdg << " 0 1 0"; break; + case 2 : fdg << " 0 0 1"; break; + } + geno_post ++; + } else if (like_tok[i] == ".") { + fdg << " 0.333 0.333 0.333"; + geno_like++; + + + } else { + like_field = sutils::tokenize(like_tok[i], ":"); + + vector < string > like_prob = sutils::tokenize(like_field[like_idx_gl], ","); +//sprintf(str,"test2"); +//cout << like_field[like_idx_gl] << "\n"; + +//puts(str); + double l0 = pow(10, atof(like_prob[0].c_str())); + + double l1 = pow(10, atof(like_prob[1].c_str())); + + double l2 = pow(10, atof(like_prob[2].c_str())); +//printf("%f $f",l0,l2); +//puts(str); + double sum = l0 + l1 + l2; +// sprintf(str,"test6"); +//puts(str); + fdg << " " << sutils::double2str(l0/sum, 5) << " " << sutils::double2str(l1/sum, 5) << " " << sutils::double2str(l2/sum, 5); + +// sprintf(str,"test7"); +//puts(str); + geno_like++; + } + } + + fdg << endl; + s++; + + } + if (s % 100 == 0) { cout << "\r" << s << " / " << M.size(); cout.flush(); } + } + } + cout << "\r * #fixd = " << geno_post << " (" << sutils::double2str(geno_post * 100.0 / (geno_post + geno_like), 2) << "%)" << endl; + cout << " * #miss = " << geno_like << " (" << sutils::double2str(geno_like * 100.0 / (geno_post + geno_like), 2) << "%)" << endl; + + + fdl.close(); + fdg.close(); + fdc.close(); +} + diff --git a/prepareGenFromBeagle4_modified20160601/src/prepareGenFromBeagle4.cpp b/prepareGenFromBeagle4_modified20160601/src/prepareGenFromBeagle4.cpp new file mode 100644 index 0000000..961b1b8 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/src/prepareGenFromBeagle4.cpp @@ -0,0 +1,51 @@ +#include "classes.h" + +int main(int argc, char ** argv) { + string buffer; + vector < string > tok; + + char * likelihoods = NULL; + vector < char * > posteriors; + char * output = NULL; + double threshold = 0.995; + char * haploid = NULL; + + //Reading parameters + for (int a =1 ; a < argc ; a++) { + if (strcmp(argv[a], "--likelihoods") == 0) {likelihoods = argv[a+1];} + if (strcmp(argv[a], "--output") == 0) {output = argv[a+1];} + if (strcmp(argv[a], "--threshold") == 0) {threshold= atoi(argv[a+1]);} + if (strcmp(argv[a], "--posteriors") == 0) { + int cpt = a + 1; + while (cpt >= 0) { + if (argv[cpt][0] == '-') cpt = -1; + else { + posteriors.push_back(argv[cpt]); + cpt ++; + } + } + } + if (strcmp(argv[a], "--haploid") == 0) {haploid = argv[a+1];} + } + + + //Checking parameters + if (likelihoods == NULL) { cout << "Missing --likelihoods argument!" << endl; exit(1);} + else cout << "Likelihoods:\t[" << likelihoods << "]" << endl; + + if (posteriors.size() == 0) { cout << "Missing --posteriors argument!" << endl; exit(1);} + else cout << "Posteriors:\t" << posteriors.size() << " VCF files" << endl; + + if (output == NULL) { cout << "Missing --output argument!" << endl; exit(1);} + else cout << "Output Prefix:\t[" << output << "]" << endl; + cout << "Threshold:\t" << threshold << endl; + + cout << endl; + data D = data(string(likelihoods), threshold); + D.readLikelihoodsMap(); + if (haploid != NULL) D.readHaploidGuys(haploid); + for (int p = 0 ; p < posteriors.size() ; p++) D.readPosteriorsMapAndHap(posteriors[p]); + D.assembleMap(); + D.writeHaplotypes(output); + D.writeGenotypes(output); +} diff --git a/prepareGenFromBeagle4_modified20160601/src/utils.cpp b/prepareGenFromBeagle4_modified20160601/src/utils.cpp new file mode 100644 index 0000000..18c63b4 --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/src/utils.cpp @@ -0,0 +1,646 @@ +#include "utils.h" + +/******************************************************/ +/* UTILS STATISTICS */ +/******************************************************/ +long seed = -123456789; + +namespace putils { + void initRandom(long s) { + if (s == -1) seed = - time(NULL); + else seed = - s; + } + + double getRandom() { + int j; + long k; + static long iy=0; + static long iv[NTAB]; + double temp; + if (seed <= 0 || !iy) { + if (-(seed) < 1) seed=1; + else seed = -(seed); + for (j=NTAB+7;j>=0;j--) { + k=(seed)/IQ; + seed=IA*(seed-k*IQ)-IR*k; + if (seed < 0) seed += IM; + if (j < NTAB) iv[j] = seed; + } + iy=iv[0]; + } + k=(seed)/IQ; + seed=IA*(seed-k*IQ)-IR*k; + if (seed < 0) seed += IM; + j=iy/NDIV; + iy=iv[j]; + iv[j] = seed; + if ((temp=AM*iy) > RNMX) return RNMX; + else return temp; + } + + long getSeed() { + return seed; + } + + int getRandom(int n) { + return (int)floor(getRandom() * n); + } + + void normalise(vector < double > & v) { + double sum = 0.0; + for (int i = 0 ; i < v.size() ; i++) sum += v[i]; + if (sum != 0.0) for (int i = 0 ; i < v.size() ; i++) v[i] /= sum; + } + + int sample(vector< double > & v, double sum) { + double csum = v[0]; + double u = getRandom() * sum; + for (int i = 0; i < v.size() - 1; ++i) { + if ( u < csum ) return i; + csum += v[i+1]; + } + return v.size() - 1; + } + + double median (vector < double> & V) { + sort(V.begin(), V.end()); + return V[V.size() / 2]; + } + + double mean (vector < double> & V) { + double m = 0.0; + for (int i = 0 ; i < V.size() ; i ++) m += V[i]; + return m / V.size(); + } +}; + +/******************************************************/ +/* UTILS ALGORITHM */ +/******************************************************/ +namespace autils { + int max(vector < double > & v) { + double max = -1e300; + int index_max = 0; + for (int i = 0 ; i < v.size() ; i ++) + if (v[i] > max) { + max = v[i]; + index_max = i; + } + return index_max; + } + + int max(vector < int > & v) { + int max = -1000000000; + int index_max = 0; + for (int i = 0 ; i < v.size() ; i ++) + if (v[i] > max) { + max = v[i]; + index_max = i; + } + return index_max; + } + + int count(vector < bool > &v) { + int c=0; + for (int i = 0 ; i < v.size() ; i ++) if (v[i]) c++; + return c; + } + + void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB) { + if (B.size() < 2 * min || B.size() == 2) { + BB.push_back(B); + return; + } + + int l = putils::getRandom(B.size() - 2 * min) + min; + vector < vector < int > > LB = vector < vector < int > > (B.begin(), B.begin() + l + 1); + vector < vector < int > > RB = vector < vector < int > > (B.begin() + l - 1, B.end()); + vector < vector < vector < int > > > LBB; + vector < vector < vector < int > > > RBB; + decompose(min, LB, LBB); + decompose(min, RB, RBB); + BB = LBB; + BB.insert(BB.end(), RBB.begin(), RBB.end()); + } + + int checkDuo (int pa1, int pa2, int ca1, int ca2) { + if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return -1; } + if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return 1; } + if (pa1 == 0 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 0 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 1; } + if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 0) { return 1; } + if (pa1 == 1 && pa2 == 0 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 1 && pa2 == 0 && ca1 == 1 && ca2 == 1) { return 1; } + if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 0) { return -1; } + if (pa1 == 1 && pa2 == 1 && ca1 == 0 && ca2 == 1) { return 0; } + if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 0) { return 0; } + if (pa1 == 1 && pa2 == 1 && ca1 == 1 && ca2 == 1) { return 0; } + return 0; + } + + int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2) { + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 0 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 2; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 0 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 0; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 0; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 0 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 0 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 0 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 0 && ca1 == 1 && ca2 == 1 ) { return 1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 0 && ca2 == 1 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 0 ) { return -1; } + if (fa1 == 1 && fa2 == 1 && ma1 == 1 && ma2 == 1 && ca1 == 1 && ca2 == 1 ) { return 0; } + return 0; + } + + + int solveDuoF(int & fa1, int & fa2, int & ca1, int & ca2) { + int gf = fa1 + fa2; + int gc = ca1 + ca2; + + if (gf == 0 && gc == 2) { return -1;}; + if (gf == 2 && gc == 0) { return -1;}; + + if (gf == 0 && gc == 1) {fa1 = 0; fa2 = 0; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gc == 0) {fa1 = 0; fa2 = 1; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gc == 2) {fa1 = 1; fa2 = 0; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 2 && gc == 1) {fa1 = 1; fa2 = 1; ca1 = 1; ca2 = 0; return 1;}; + return 0; + } + + int solveDuoM(int & ma1, int & ma2, int & ca1, int & ca2) { + int gm = ma1 + ma2; + int gc = ca1 + ca2; + + if (gm == 0 && gc == 2) { return -1; }; + if (gm == 2 && gc == 0) { return -1; }; + + if (gm == 0 && gc == 1) {ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gm == 1 && gc == 0) {ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gm == 1 && gc == 2) {ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gm == 2 && gc == 1) {ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + return 0; + } + + + int solveTrio (int & fa1, int & fa2, int & ma1, int & ma2, int & ca1, int & ca2) { + int gf = fa1 + fa2; + int gm = ma1 + ma2; + int gc = ca1 + ca2; + + if (gf == 0 && gm == 0 && gc == 1) { return -1;}; + if (gf == 0 && gm == 0 && gc == 2) { return -1;}; + if (gf == 0 && gm == 1 && gc == 2) { return -1;}; + if (gf == 0 && gm == 2 && gc == 0) { return -1;}; + if (gf == 0 && gm == 2 && gc == 2) { return -1;}; + if (gf == 1 && gm == 0 && gc == 2) { return -1;}; + if (gf == 1 && gm == 2 && gc == 0) { return -1;}; + if (gf == 2 && gm == 0 && gc == 0) { return -1;}; + if (gf == 2 && gm == 0 && gc == 2) { return -1;}; + if (gf == 2 && gm == 1 && gc == 0) { return -1;}; + if (gf == 2 && gm == 2 && gc == 0) { return -1;}; + if (gf == 2 && gm == 2 && gc == 1) { return -1;}; + + if (gf == 0 && gm == 1 && gc == 0) {fa1 = 0; fa2 = 0; ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 0 && gm == 1 && gc == 1) {fa1 = 0; fa2 = 0; ma1 = 0; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 0 && gm == 2 && gc == 1) {fa1 = 0; fa2 = 0; ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gm == 0 && gc == 0) {fa1 = 0; fa2 = 1; ma1 = 0; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gm == 0 && gc == 1) {fa1 = 1; fa2 = 0; ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 1 && gm == 1 && gc == 0) {fa1 = 0; fa2 = 1; ma1 = 1; ma2 = 0; ca1 = 0; ca2 = 0; return 1;}; + if (gf == 1 && gm == 1 && gc == 2) {fa1 = 1; fa2 = 0; ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 1 && gm == 2 && gc == 1) {fa1 = 0; fa2 = 1; ma1 = 1; ma2 = 1; ca1 = 0; ca2 = 1; return 1;}; + if (gf == 1 && gm == 2 && gc == 2) {fa1 = 1; fa2 = 0; ma1 = 1; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + if (gf == 2 && gm == 0 && gc == 1) {fa1 = 1; fa2 = 1; ma1 = 0; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 2 && gm == 1 && gc == 1) {fa1 = 1; fa2 = 1; ma1 = 1; ma2 = 0; ca1 = 1; ca2 = 0; return 1;}; + if (gf == 2 && gm == 1 && gc == 2) {fa1 = 1; fa2 = 1; ma1 = 0; ma2 = 1; ca1 = 1; ca2 = 1; return 1;}; + return 0; + } + + int switch_error(vector < bool > & t1, vector < bool > & t2, vector < bool > & m12, vector < bool > & h1, vector < bool > & h2) { + int error = 0; + + //cerr << utils::bool2str(t1) << endl; + //cerr << utils::bool2str(t2) << endl; + //cerr << utils::bool2str(h1) << endl; + //cerr << utils::bool2str(h2) << endl; + + assert(t1.size() == h1.size()); + assert(t1.size() == h2.size()); + assert(t1.size() == t2.size()); + assert(t1.size() == m12.size()); + + int locus = -1; + + for (int l = 0 ; l < t1.size() && locus < 0 ; l ++) if (t1[l] != t2[l] && m12[l]) locus = l; + + if (locus < 0) return -1; + + bool swap = (t1[locus] == h2[locus]); + for (int l = locus + 1 ; l < t1.size() ; l ++) + if (m12[l]) { + if (!swap && t1[l] != h1[l]) { + swap = true; + error++; + } else if (swap && t1[l] != h2[l]) { + swap = false; + error++; + } + } + + return error; + } + +}; + +/******************************************************/ +/* UTILS STRING */ +/******************************************************/ +namespace sutils { + vector tokenize(string & str,string d) { + vector tokens; + string::size_type lastPos = str.find_first_not_of(d, 0); + string::size_type pos = str.find_first_of(d, lastPos); + while (string::npos != pos || string::npos != lastPos) { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(d, pos); + pos = str.find_first_of(d, lastPos); + } + return tokens; + } + + vector tokenize(string & str,string d, int n) { + vector tokens; + int cpt = 0; + string::size_type lastPos = str.find_first_not_of(d, 0); + string::size_type pos = str.find_first_of(d, lastPos); + while ((string::npos != pos || string::npos != lastPos) && cpt < n) { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(d, pos); + pos = str.find_first_of(d, lastPos); + cpt++; + } + return tokens; + } + + string int2str(int n) { + ostringstream s2( stringstream::out ); + s2 << n; + return s2.str(); + } + + string int2str(vector < int > & v) { + ostringstream s2( stringstream::out ); + for (int l = 0 ; l < v.size() ; l++) { + if (v[l] < 0) s2 << "-"; + else s2 << v[l] ; + } + return s2.str(); + } + + string long2str(long int n) { + ostringstream s2( stringstream::out ); + s2 << n; + return s2.str(); + } + + string double2str(double n, int prc) { + ostringstream s2; + s2 << setiosflags( ios::fixed ); + if ( prc > 0 ) s2.precision(prc); + s2 << n; + return s2.str(); + } + + string double2str(vector < double > &v, int prc) { + ostringstream s2; + s2 << setiosflags( ios::fixed ); + if ( prc >= 0 ) s2.precision(prc); + for (int l = 0 ; l < v.size() ; l++) { + s2 << v[l] << " "; + } + return s2.str(); + } + + string bool2str(vector & v) { + ostringstream s2( stringstream::out ); + for (int l = 0 ; l < v.size() ; l++) { + if (v[l]) s2 << "1"; + else s2 << "0"; + } + return s2.str(); + } + + string date2str(time_t * t, string format) { + struct tm * timeinfo = localtime(t); + char buffer[128]; + strftime(buffer, 128, format.c_str(), timeinfo); + ostringstream s2( stringstream::out ); + s2 << buffer; + return s2.str(); + } +}; + +/******************************************************/ +/* UTILS FILE */ +/******************************************************/ +namespace futils { + bool isFile(string f) { + ifstream inp; + inp.open(f.c_str(), ifstream::in); + if(inp.fail()) { + inp.clear(ios::failbit); + inp.close(); + return false; + } + inp.close(); + return true; + } + + bool createFile(string f) { + ofstream out; + out.open(f.c_str(), ofstream::out); + if(out.fail()) { + out.clear(ios::failbit); + out.close(); + return false; + } + out.close(); + return true; + } + + string extensionFile(string & filename) { + if (filename.find_last_of(".") != string::npos) + return filename.substr(filename.find_last_of(".") + 1); + return ""; + } + + + void bool2binary(vector < bool > & V, ostream &fd) { + int nb = V.size(); + fd.write((char*)&nb, sizeof(int)); + int cpt_byte = 0; + int cpt_bin = 0; + int nb_byte = (int)ceil( (V.size() * 1.0) / 8); + while (cpt_byte < nb_byte) { + bitset<8> byte_bitset; + for (int l = 0; l < 8 && cpt_bin < V.size() ; l++) { + byte_bitset[l] = V[cpt_bin]; + cpt_bin ++; + } + char byte_char = (char)byte_bitset.to_ulong(); + fd.write(&byte_char, 1); + cpt_byte++; + } + } + + bool binary2bool(vector < bool > & V, istream & fd) { + int nb; + fd.read((char*)&nb, sizeof(int)); + if (!fd) return false; + int cpt_byte = 0; + int cpt_bin = 0; + int nb_byte = (int)ceil( (nb * 1.0) / 8); + V = vector < bool >(nb); + while (cpt_byte < nb_byte) { + char byte_char; + fd.read(&byte_char, 1); + if (!fd) return false; + bitset<8> byte_bitset = byte_char; + for (int l = 0; l < 8 && cpt_bin < nb ; l++) { + V[cpt_bin] = byte_bitset[l]; + cpt_bin++; + } + cpt_byte++; + } + return true; + } +}; + +/******************************************************/ +/* INPUT FILE */ +/******************************************************/ +ifile::ifile() { +} + +ifile::ifile(string filename , bool binary) { + open(filename, binary); +} + +ifile::~ifile() { + close(); +} + +string ifile::name() { + return file; +} + +bool ifile::open(string filename, bool binary) { + file = filename; + string ext = futils::extensionFile(filename); + if (ext == "gz") { + fd.open(file.c_str(), ios::in | ios::binary); + push(gzip_decompressor()); + } else if (ext == "bz2") { + fd.open(file.c_str(), ios::in | ios::binary); + push(bzip2_decompressor()); + } else if (binary) { + fd.open(file.c_str(), ios::in | ios::binary); + } else { + fd.open(file.c_str()); + } + if (fd.fail()) return false; + push(fd); + return true; +} + +void ifile::close() { + if (!empty()) reset(); + fd.close(); +} + +/******************************************************/ +/* OUTPUT FILE */ +/******************************************************/ +ofile::ofile() { +} + +ofile::ofile(string filename , bool binary) { + open(filename, binary); +} + +ofile::~ofile() { + close(); +} + +string ofile::name() { + return file; +} + +bool ofile::open(string filename, bool binary) { + file = filename; + string ext = futils::extensionFile(filename); + if (ext == "gz") { + fd.open(file.c_str(), ios::out | ios::binary); + push(gzip_compressor()); + } else if (ext == "bz2") { + fd.open(file.c_str(), ios::out | ios::binary); + push(bzip2_compressor()); + } else if (binary) { + fd.open(file.c_str(), ios::out | ios::binary); + } else { + fd.open(file.c_str()); + } + if (fd.fail()) return false; + push(fd); + return true; +} + +void ofile::close() { + if (!empty()) reset(); + fd.close(); +} + +/******************************************************/ +/* LOG FILE */ +/******************************************************/ +lfile::lfile() { + verboseC = true; + verboseL = true; +} + +lfile::~lfile() { + close(); +} + +string lfile::name() { + return file; +} + +bool lfile::open(string filename) { + file = filename; + if (futils::extensionFile(file) != "log") file += ".log"; + fd.open(file.c_str()); + if (fd.fail()) return false; + return true; +} + +void lfile::close() { + fd.close(); +} + +string lfile::getPrefix() { + return file.substr(0, file.find_last_of(".")); +} + +void lfile::muteL() { + verboseL = false; +} + +void lfile::unmuteL() { + verboseL = true; +} + +void lfile::muteC() { + verboseC = false; +} + +void lfile::unmuteC() { + verboseC = true; +} + + +void lfile::print(string s) { + if (verboseL) { fd << s; fd.flush(); } + if (verboseC) { cerr << s; cerr.flush(); } +} + +void lfile::printC(string s) { + if (verboseC) { cerr << s; cerr.flush(); } +} + +void lfile::printL(string s) { + if (verboseL) { fd << s; fd.flush(); } +} + +void lfile::println(string s) { + if (verboseL) { fd << s << endl; } + if (verboseC) { cerr << s << endl; } +} + +void lfile::printlnC(string s) { + if (verboseC) { cerr << s << endl; } +} + +void lfile::printlnL(string s) { + if (verboseL) { fd << s << endl; } +} + +void lfile::warning(string s) { + cerr << "\33[33mWARNING:\33[0m " << s << endl; + if (verboseL) fd << "WARNING: " << s << endl; +} + +void lfile::error(string s) { + cerr << "\33[33mERROR:\33[0m " << s << endl; + if (verboseL) fd << "ERROR: " << s << endl; + close(); + exit(1); +} diff --git a/prepareGenFromBeagle4_modified20160601/src/utils.h b/prepareGenFromBeagle4_modified20160601/src/utils.h new file mode 100644 index 0000000..6fdf61a --- /dev/null +++ b/prepareGenFromBeagle4_modified20160601/src/utils.h @@ -0,0 +1,165 @@ +#ifndef _UTILS_H +#define _UTILS_H + +#define IA 16807 +#define IM 2147483647 +#define AM (1.0/IM) +#define IQ 127773 +#define IR 2836 +#define NTAB 32 +#define NDIV (1+(IM-1)/NTAB) +#define EPS 1.2e-7 +#define RNMX (1.0-EPS) +#define PI 3.14159265358979323846 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace boost::iostreams; + +/******************************************************/ +/* UTILS STATISTICS */ +/******************************************************/ +namespace putils { + void initRandom(long s); + double getRandom(); + int getRandom(int); + long getSeed(); + void normalise(vector < double > & v); + int sample(vector< double > & v, double sum); + double median (vector < double> &); + double mean (vector < double> &); +}; + +/******************************************************/ +/* UTILS ALGORITHM */ +/******************************************************/ +namespace autils { + int max(vector < double > & v); + int max(vector < int > & v); + int count(vector < bool > &); + void decompose(int min, vector < vector < int > > & B, vector < vector < vector < int > > > & BB); + int checkDuo (int pa1, int pa2, int ca1, int ca2); + int checkTrio (int fa1, int fa2, int ma1, int ma2, int ca1, int ca2) ; + + int solveDuoF(int & fa1, int & fa2, int & ca1, int & ca2); + int solveDuoM(int & ma1, int & ma2, int & ca1, int & ca2); + int solveTrio (int & fa1, int & fa2, int & ma1, int & ma2, int & ca1, int & ca2); + int switch_error(vector < bool > & t1, vector < bool > & t2, vector < bool > & m12, vector < bool > & h1, vector < bool > & h2); + + +}; + +/******************************************************/ +/* UTILS STRING */ +/******************************************************/ +namespace sutils { + vector tokenize(string & str,string d); + vector tokenize(string & str,string d, int n); + string int2str(int n); + string int2str(vector < int > & v); + string long2str(long int n); + string double2str(double n, int prc = 4); + string double2str(vector < double > &v, int prc = 4); + string bool2str(vector & v); + string date2str(time_t * t, string format); +}; + +/******************************************************/ +/* UTILS FILE */ +/******************************************************/ +namespace futils { + bool isFile(string f); + bool createFile(string f); + string extensionFile(string & filename); + void bool2binary(vector < bool > & V, ostream &fd); + bool binary2bool(vector < bool > & V, istream & fd); +}; + +/******************************************************/ +/* INPUT FILE */ +/******************************************************/ +class ifile : public filtering_istream { +private: + string file; + ifstream fd; + +public: + ifile(); + ifile(string filename , bool binary = false); + ~ifile(); + string name(); + bool open(string filename, bool binary = false); + void close(); +}; + +/******************************************************/ +/* OUTPUT FILE */ +/******************************************************/ +class ofile : public filtering_ostream { +private: + string file; + ofstream fd; + +public: + ofile(); + ofile(string filename , bool binary = false); + ~ofile(); + string name(); + bool open(string filename, bool binary = false); + void close(); +}; + +/******************************************************/ +/* LOG FILE */ +/******************************************************/ +class lfile { +private: + string file; + ofstream fd; + bool verboseC; + bool verboseL; + +public: + lfile(); + ~lfile(); + string name(); + bool open(string filename = "file.log"); + void close(); + string getPrefix(); + void muteL(); + void unmuteL(); + void muteC(); + void unmuteC(); + void print(string s); + void printC(string s); + void printL(string s); + void println(string s); + void printlnC(string s); + void printlnL(string s); + void warning(string s); + void error(string s); +}; + +#endif