Skip to content

Commit

Permalink
Merge pull request #136 from freerkvandijk/master
Browse files Browse the repository at this point in the history
Two scripts needed for RNA-seq phasing pipeline
  • Loading branch information
npklein committed Jun 2, 2016
2 parents f85ca8d + 542a0e1 commit 2780500
Show file tree
Hide file tree
Showing 21 changed files with 2,849 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .pydevproject
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
</pydev_project>
77 changes: 77 additions & 0 deletions PL_to_GL_reorder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/env python3

import argparse
from Bio.bgzf import BgzfWriter
import gzip
import sys

def main():

args = parse_args()
with gzip.open(args.out, 'wt') as fd_out:
for i_vcf, vcf in enumerate(args.vcf):
with gzip.open(vcf, 'rt') as fd_vcf:
## loop metadata lines and header line
for line in fd_vcf:
## end of metadata lines
if line[0:2] != '##':
## print header line
print(line, end='', file=fd_out)
## break loop at the header line
break
if i_vcf == 0:
## print metadata line
print(line, end='', file=fd_out)
## loop records
for line in fd_vcf:
l = line.rstrip().split('\t')
field = l[8].split(':')
## index PL
i_PL = l[8].split(':').index('PL')
field.append('GL')
GT = field.pop(0)
field.sort()
field.insert(0, GT)
l[8] = ':'.join(field)
## index GL
i_GL = l[8].split(':').index('GL')
s = '\t'.join(l[:9])
print(s, sep='\t', file=fd_out, end='')
for i_GT in range(9, len(l)):
lGT = l[i_GT]
info = l[i_GT].split(':')
if lGT == './.:0,0':
toSplit = './.:0,0:0'
info = toSplit.split(':')
if info[0] == './.':
s_GL = '.,.,.'
else:
s_PL = l[i_GT].split(':')[i_PL]
if s_PL == '.':
s_GL = '.,.,.'
else:
s_GL = ','.join(
str(-float(PL)/10) for PL in s_PL.split(','))
if s_GL==".":
print(line)
exit()
info.insert(i_GL, s_GL)
l[i_GT] = info
print('\t' +
':'.join(l[i_GT]), sep='\t', end='',
file=fd_out)
print(end='\n', file=fd_out)

return


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--vcf', nargs = '+')
parser.add_argument('--out', required=True)
args = parser.parse_args()

return args

if __name__ == '__main__':
main()
Binary file not shown.
90 changes: 90 additions & 0 deletions prepareGenFromBeagle4_modified20160601/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#compiler
CXX=g++

#compiler flags
COPTI= -O3
CDEBG= -g
CWARN= -Wall -Wextra -Wno-sign-compare
CBAMH= -D_LIBBAM
CSHV1=

#linker flags
LOPTI= -O3
LDEBG= -g
LSTDD= -lm -lpthread -lboost_iostreams -lboost_program_options
LSTDS= -Wl,-Bstatic -lboost_iostreams -lz -lbz2 -Wl,-Bdynamic -lm -lpthread
LCL3S= -Wl,-Bstatic -lz -lbz2 -Wl,-Bdynamic -lm -lpthread
LBAMD= -lbam -lz

#executable file
EFILE= bin/prepareGenFromBeagle4

#header files
HFILE= $(shell find src -name *.h)

#source files
CFILE= $(shell find src -name *.cpp)

#source path
VPATH= $(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)

#include path
ISTDP= -Isrc
IBAMP= -Ilib
#ICL3P= -I/apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/include
#ICL3P= -I$EBROOTBOOST/include
ICL3P= -I/apps/software/Boost/1.60.0-foss-2015b-Python-2.7.9/include/

#library path
LSTDP= -Llib

#object files
OFILE= $(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done)
OBOST=

#default
all: cluster

#dynamic release
dynamic: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
dynamic: LFLAG=$(LOPTI) $(LSTDD)
dynamic: IFLAG=$(ISTDP)
dynamic: $(EFILE)

#debug release
debug: CFLAG=$(CDEBG) $(CWARN) $(CSHV1)
debug: LFLAG=$(LDEBG) $(LSTDD)
debug: IFLAG=$(ISTDP)
debug: $(EFILE)

#static release
static: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
static: LFLAG=$(LOPTI) $(LSTDS)
static: IFLAG=$(ISTDP)
static: $(EFILE)

#cluster release
cluster: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
cluster: LFLAG=$(LOPTI) $(LCL3S)
cluster: IFLAG=$(ISTDP) $(ICL3P)
#cluster: OBOST=/apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/lib/libboost_iostreams.a /apps/software/Boost/1.57.0-goolf-1.7.20-Python-2.7.9/lib/libboost_program_options.a
OBOST=-lz /apps/software/Boost/1.60.0-foss-2015b-Python-2.7.9/lib/libboost_iostreams.a
cluster: $(EFILE)

$(EFILE): $(OFILE)
$(CXX) $^ $(OBOST) -o $@ $(LFLAG)

obj/%.o: %.cpp $(HFILE)
$(CXX) -o $@ -c $< $(CFLAG) $(IFLAG)

clean:
rm -f obj/*.o $(EFILE)

test:
cp $(EFILE) ~/$(EFILE).v10

oxford:
cp $(EFILE) ~/bin/.

install:
cp $(EFILE) ./tmp/
87 changes: 87 additions & 0 deletions prepareGenFromBeagle4_modified20160601/makefile.bak
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#compiler
CXX=g++

#compiler flags
COPTI= -O3
CDEBG= -g
CWARN= -Wall -Wextra -Wno-sign-compare
CBAMH= -D_LIBBAM
CSHV1=

#linker flags
LOPTI= -O3
LDEBG= -g
LSTDD= -lm -lpthread -lboost_iostreams -lboost_program_options
LSTDS= -Wl,-Bstatic -lboost_iostreams -lz -lbz2 -Wl,-Bdynamic -lm -lpthread
LCL3S= -Wl,-Bstatic -lz -lbz2 -Wl,-Bdynamic -lm -lpthread
LBAMD= -lbam -lz

#executable file
EFILE= bin/prepareGenFromBeagle4

#header files
HFILE= $(shell find src -name *.h)

#source files
CFILE= $(shell find src -name *.cpp)

#source path
VPATH= $(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)

#include path
ISTDP= -Isrc
IBAMP= -Ilib
ICL3P= -I/users/delaneau/BOOST/include

#library path
LSTDP= -Llib

#object files
OFILE= $(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done)
OBOST=

#default
all: dynamic

#dynamic release
dynamic: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
dynamic: LFLAG=$(LOPTI) $(LSTDD)
dynamic: IFLAG=$(ISTDP)
dynamic: $(EFILE)

#debug release
debug: CFLAG=$(CDEBG) $(CWARN) $(CSHV1)
debug: LFLAG=$(LDEBG) $(LSTDD)
debug: IFLAG=$(ISTDP)
debug: $(EFILE)

#static release
static: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
static: LFLAG=$(LOPTI) $(LSTDS)
static: IFLAG=$(ISTDP)
static: $(EFILE)

#cluster release
cluster: CFLAG=$(COPTI) $(CWARN) $(CSHV1)
cluster: LFLAG=$(LOPTI) $(LCL3S)
cluster: IFLAG=$(ISTDP) $(ICL3P)
cluster: OBOST=~/BOOST/lib/libboost_iostreams.a ~/BOOST/lib/libboost_program_options.a
cluster: $(EFILE)

$(EFILE): $(OFILE)
$(CXX) $^ $(OBOST) -o $@ $(LFLAG)

obj/%.o: %.cpp $(HFILE)
$(CXX) -o $@ -c $< $(CFLAG) $(IFLAG)

clean:
rm -f obj/*.o $(EFILE)

test:
cp $(EFILE) ~/$(EFILE).v10

oxford:
cp $(EFILE) ~/bin/.

install:
cp $(EFILE) /usr/local/bin/.
Binary file not shown.
Binary file added prepareGenFromBeagle4_modified20160601/obj/data.o
Binary file not shown.
Binary file not shown.
Binary file not shown.
76 changes: 76 additions & 0 deletions prepareGenFromBeagle4_modified20160601/old/chunk.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "classes.h"


int hamming (vector < bool > & h1, vector < bool > & h2) {
assert(h1.size() == h2.size());
int c = 0;
for(int l = 0 ; l < h1.size(); l++) if (h1[l] != h2[l]) c++;
return c;
}

chunk::chunk(char * id) {
this->id = string(id);
}

chunk::~chunk() {
mappingS.clear();
H.clear();
P.clear();
switches.clear();
}

bool chunk::operator < (const chunk & c) const {
return (mappingS[0] < c.mappingS[0]);
}

void chunk::assemble(chunk & pc) {
//mapping starting point of the overlap
int b1 = -1;
int e1 = pc.mappingS.size() - 1;
for (int l = 0 ; l < pc.mappingS.size() && b1 < 0 ; l ++ ) if (pc.mappingS[l] == mappingS[0]) b1 = l;
if (b1 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc.id << "], code 1" << endl; exit(1);}

//mapping starting point of the overlap
int b2 = 0;
int e2 = -1;
for (int l = 0 ; l < mappingS.size() && e2 < 0 ; l ++ ) if (mappingS[l] == pc.mappingS.back()) e2 = l;
if (e2 < 0) {cout << "Chunk [" << id << "] does not overlap chunk [" << pc.id << "], code 2" << endl; exit(1);}

//overlap size
int osize = e1 - b1 + 1;
int osize2 = e2 - b2 + 1;
if (osize != osize2) {cout << "Overlaps are differents between [" << id << "] does not overlap chunk [" << pc.id << "] o1=" << osize << " o2=" << osize2 <<endl; exit(1);}
else cout << " * overlap=" << osize << endl;

//blanking posteriors
int cpt = 0;
int i1, i2;
for (i1 = b1, i2 = b2; i1 <= e1 && i2 <= e2; i1++, i2 ++) {
if (cpt < (osize / 2)) mappingS[i2] = -1;
else pc.mappingS[i1] = -1;
}

//switching haplotypes
int n_switches = 0;
for (int i = 0 ; i < switches.size() ; i ++) {
vector < bool > h11 = vector < bool > (pc.H[2 * i + 0].begin() + b1, pc.H[2 * i + 0].end());
vector < bool > h12 = vector < bool > (pc.H[2 * i + 1].begin() + b1, pc.H[2 * i + 1].end());
vector < bool > h21 = vector < bool > (H[2 * i + 0].begin(), pc.H[2 * i + 0].begin() + e2 + 1);
vector < bool > h22 = vector < bool > (H[2 * i + 1].begin(), pc.H[2 * i + 1].begin() + e2 + 1);
int score0 = 0, score1 = 0;
if (!pc.switches[i]) {
score0 = hamming(h11, h21) + hamming(h12, h22);
score1 = hamming(h11, h22) + hamming(h12, h21);
} else {
score0 = hamming(h12, h21) + hamming(h11, h22);
score1 = hamming(h12, h22) + hamming(h11, h21);
}

if (score0 <= score1) switches[i] = false;
else {
switches[i] = true;
n_switches ++;
}
}
cout << " * #switches=" <<n_switches << " out of " << switches.size() << " possible" << endl;
}
Loading

0 comments on commit 2780500

Please sign in to comment.