Skip to content

Commit

Permalink
update API
Browse files Browse the repository at this point in the history
  • Loading branch information
umahsn committed Jan 25, 2022
1 parent 108c166 commit d5f4b11
Show file tree
Hide file tree
Showing 103 changed files with 84 additions and 70 deletions.
23 changes: 14 additions & 9 deletions scripts/NanoCaller.py → NanoCaller
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
#!/usr/bin/env python

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import time, argparse, os, shutil, sys, pysam, datetime
import multiprocessing as mp
from intervaltree import Interval, IntervalTree
from subprocess import PIPE, Popen
from utils import *
from nanocaller_src.utils import *


def run(args):
import snpCaller, indelCaller
from nanocaller_src import snpCaller, indelCaller

pool = mp.Pool(processes=args.cpu)

Expand Down Expand Up @@ -47,7 +50,7 @@ def run(args):
dirname = os.path.dirname(__file__)

if args.exclude_bed in ['hg38', 'hg19', 'mm10', 'mm39']:
args.exclude_bed=os.path.join(dirname, 'release_data/bed_files/%s_centro_telo.bed.gz' %args.exclude_bed)
args.exclude_bed=os.path.join(dirname, 'nanocaller_src/release_data/bed_files/%s_centro_telo.bed.gz' %args.exclude_bed)

if args.include_bed:
tbx = pysam.TabixFile(args.include_bed)
Expand Down Expand Up @@ -106,7 +109,7 @@ def run(args):
in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':sam_path, 'fasta_path':args.ref, \
'mincov':args.mincov, 'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \
'threshold':threshold, 'snp_model':args.snp_model,'indel_model':args.indel_model, 'cpu':args.cpu, 'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, 'seq':args.sequencing, \
'del_t':args.del_threshold,'ins_t':args.ins_threshold,'supplementary':args.supplementary, 'include_bed':args.include_bed\
'del_t':args.del_threshold,'ins_t':args.ins_threshold,'impute_indel_phase':args.impute_indel_phase, 'supplementary':args.supplementary, 'include_bed':args.include_bed\
, 'exclude_bed':args.exclude_bed,'win_size':args.win_size,'small_win_size':args.small_win_size}
ind_time=time.time()
indel_vcf=indelCaller.test_model(in_dict, pool)
Expand Down Expand Up @@ -141,15 +144,15 @@ def run(args):
t=time.time()


preset_dict={'ont':{'sequencing':'ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False},
preset_dict={'ont':{'sequencing':'ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'ul_ont': {'sequencing':'ul_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False},
'ul_ont': {'sequencing':'ul_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'ul_ont_extreme':{'sequencing':'ul_ont_extreme', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False},
'ul_ont_extreme':{'sequencing':'ul_ont_extreme', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False, 'impute_indel_phase':False},

'ccs':{'sequencing':'pacbio', 'snp_model': 'CCS-HG002', 'indel_model':'CCS-HG002', 'neighbor_threshold':'0.3,0.7', 'ins_threshold':0.4,'del_threshold':0.4, 'enable_whatshap':True},
'ccs':{'sequencing':'pacbio', 'snp_model': 'CCS-HG002', 'indel_model':'CCS-HG002', 'neighbor_threshold':'0.3,0.7', 'ins_threshold':0.4,'del_threshold':0.4, 'enable_whatshap':True, 'impute_indel_phase':True},

'clr':{'sequencing':'pacbio', 'snp_model':'CLR-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.3,0.6', 'ins_threshold':0.6,'del_threshold':0.6, 'win_size':10, 'small_win_size':2, 'enable_whatshap':True}
'clr':{'sequencing':'pacbio', 'snp_model':'CLR-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.3,0.6', 'ins_threshold':0.6,'del_threshold':0.6, 'win_size':10, 'small_win_size':2, 'enable_whatshap':True, 'impute_indel_phase':False}
}

flag_dict={"seq":"sequencing", "p":"preset", "o":"output", "sup":"supplementary","nbr_t":"neighbor_threshold","ins_t":"ins_threshold", "del_t":"del_threshold"}
Expand Down Expand Up @@ -212,6 +215,8 @@ def run(args):
indel_group.add_argument("-win_size", "--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40)
indel_group.add_argument("-small_win_size", "--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4)

indel_group.add_argument('-impute_indel_phase','--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true')


#phasing
phase_group.add_argument('-phase_bam','--phase_bam', help='Phase bam files if snps mode is selected. This will phase bam file without indel calling.', default=False, action='store_true')
Expand Down
4 changes: 3 additions & 1 deletion scripts/NanoCaller_WGS.py → NanoCaller_WGS
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#!/usr/bin/env python

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import time, argparse, os, shutil, sys, pysam, datetime, re
import multiprocessing as mp
from intervaltree import Interval, IntervalTree
from subprocess import PIPE, Popen
from utils import *
from nanocaller_src.utils import *

if __name__ == '__main__':

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,11 @@ def ex_bed(tree, pos):

ref_dict={j:s.upper() if s in 'AGTC' else '' for j,s in zip(range(max(1,start-200),end+400+1),fastafile.fetch(chrom,max(1,start-200)-1,end+400)) }

chrom_length=fastafile.get_reference_length(chrom)

hap_dict={1:[],2:[]}

for pread in samfile.fetch(chrom, max(0,dct['start']-100), dct['end']+100):
for pread in samfile.fetch(chrom, max(0,dct['start']-100000), dct['end']+1000):
if pread.has_tag('HP'):
hap_dict[pread.get_tag('HP')].append(pread.qname)

Expand All @@ -214,16 +216,15 @@ def ex_bed(tree, pos):
position_queue_small=collections.deque(small_size*[{}], small_size)

variants={}
extra_variants={}

max_range={0:max(10,window_size),1:10}

count=0
prev=0
for pcol in samfile.pileup(chrom,max(0,start-1),end,min_base_quality=0,\
flag_filter=flag,truncate=True):
v_pos=pcol.pos+1
if count>200:
print('%s: Skipping region %s:%d-%d due to poor alignment.' %(str(datetime.datetime.now()), chrom, v_pos, end), flush =True)
break

if in_bed(include_intervals, v_pos) and not ex_bed(exclude_intervals, v_pos):
read_names=pcol.get_query_names()
Expand Down Expand Up @@ -257,8 +258,10 @@ def ex_bed(tree, pos):
ins_queue_small_0.append(ins_set_small_0)
ins_queue_small_1.append(ins_set_small_1)


if v_pos>prev and len_seq_0>=mincov and len_seq_1>=mincov:
if v_pos<=prev:
continue

if len_seq_0>=mincov and len_seq_1>=mincov:

del_freq_0=len(set.union(*del_queue_0))/len_seq_0 if len_seq_0>0 else 0
ins_freq_0=len(set.union(*ins_queue_0))/len_seq_0 if len_seq_0>0 else 0
Expand All @@ -282,16 +285,15 @@ def ex_bed(tree, pos):
prev=v_pos+10
variants[max(1,v_pos-10)]=1
count+=1


elif dct['seq']=='pacbio' and len_seq_tot >=2*mincov:
elif dct['impute_indel_phase'] and len_seq_tot >=2*mincov:
seq_v2=[x.upper() for x in pcol.get_query_sequences( mark_matches=False, mark_ends=False, add_indels=True)]
seq=[x[:2] for x in seq_v2]
seq_tot=''.join(seq)


del_freq_tot=(seq_tot.count('-')+seq_tot.count('*'))/len_seq_tot if len_seq_tot>0 else 0
ins_freq_tot=seq_tot.count('+')/len_seq_tot if len_seq_tot>0 else 0

if (del_t<=del_freq_tot or ins_t<=ins_freq_tot):
groups={}
for s,n in zip(seq_v2,read_names):
Expand All @@ -300,67 +302,73 @@ def ex_bed(tree, pos):
groups[s].append(n)

counts=sorted([(x,len(groups[x])) for x in groups],key=lambda x:x[1],reverse=True)

if counts[0][1]<=0.8*len_seq_tot:
read_names_0=groups[counts[0][0]]
read_names_1=groups[counts[1][0]]
read_names_0=set(groups[counts[0][0]])
read_names_1=set(groups[counts[1][0]]) if counts[1][1]>=mincov else set(read_names)-read_names_0
else:
read_names_0=groups[counts[0][0]][:counts[0][1]//2]
read_names_1=groups[counts[0][0]][counts[0][1]//2:]

if len(read_names_0)>=mincov and len(read_names_1)>=mincov:
prev=v_pos+10
variants[max(1,v_pos-10)]=1
extra_variants[max(1,v_pos-10)]=(read_names_0,read_names_1)
count+=1

for pcol in samfile.pileup(chrom,max(0,start-10-window_size),end,min_base_quality=0, flag_filter=flag,truncate=True):

v_pos=pcol.pos+1

if v_pos in variants:
if v_pos in extra_variants:
read_names=pcol.get_query_names()
read_names_0, read_names_1 =extra_variants[v_pos]

elif v_pos in variants:
read_names=pcol.get_query_names()

read_names_0=set(read_names) & hap_reads_0
read_names_1=set(read_names) & hap_reads_1


d={'hap0':{},'hap1':{}}
d_tot={}

ref=''.join([ref_dict[p] for p in range(v_pos-window_before,min(end,v_pos+window_after+1))])


for pread in pcol.pileups:
dt=pread.alignment.query_sequence[max(0,pread.query_position_or_next-window_before):pread.query_position_or_next+window_after]
d_tot[pread.alignment.qname]=dt

if pread.alignment.qname in read_names_0:
d['hap0'][pread.alignment.qname]=dt
else:
continue

elif pread.alignment.qname in read_names_1:
d['hap1'][pread.alignment.qname]=dt


seq_list=d['hap0']
flag0,_, data_0,alt_0,ref_seq_0=msa(seq_list,ref,v_pos,2,dct['maxcov'])
d={'hap0':{},'hap1':{}}
d_tot={}

ref=''.join([ref_dict[p] for p in range(v_pos-window_before,min(chrom_length,v_pos+window_after+1))])


for pread in pcol.pileups:
dt=pread.alignment.query_sequence[max(0,pread.query_position_or_next-window_before):pread.query_position_or_next+window_after]
d_tot[pread.alignment.qname]=dt

if pread.alignment.qname in read_names_0:
d['hap0'][pread.alignment.qname]=dt

elif pread.alignment.qname in read_names_1:
d['hap1'][pread.alignment.qname]=dt


seq_list=d['hap0']
flag0,_, data_0,alt_0,ref_seq_0=msa(seq_list,ref,v_pos,2,dct['maxcov'])

seq_list=d['hap1']
flag1,_, data_1,alt_1,ref_seq_1=msa(seq_list,ref,v_pos,2,dct['maxcov'])

seq_list = d_tot
flag_total,indel_flag_total,data_total,alt_total,ref_seq_total=msa(seq_list,ref,v_pos,dct['mincov'],dct['maxcov'])

if flag0 and flag1 and flag_total:
output_pos.append(v_pos)
output_data_0.append(data_0)
output_data_1.append(data_1)
output_data_total.append(data_total)

tp=max_range[variants[v_pos]]

alleles.append([allele_prediction(alt_0, ref_seq_0, max_range[variants[v_pos]]),\
allele_prediction(alt_1, ref_seq_1, max_range[variants[v_pos]]), \
allele_prediction(alt_total, ref_seq_total, max_range[variants[v_pos]])])

seq_list=d['hap1']
flag1,_, data_1,alt_1,ref_seq_1=msa(seq_list,ref,v_pos,2,dct['maxcov'])

seq_list = d_tot
flag_total,indel_flag_total,data_total,alt_total,ref_seq_total=msa(seq_list,ref,v_pos,dct['mincov'],dct['maxcov'])

if flag0 and flag1 and flag_total:
output_pos.append(v_pos)
output_data_0.append(data_0)
output_data_1.append(data_1)
output_data_total.append(data_total)

tp=max_range[variants[v_pos]]

alleles.append([allele_prediction(alt_0, ref_seq_0, max_range[variants[v_pos]]),\
allele_prediction(alt_1, ref_seq_1, max_range[variants[v_pos]]), \
allele_prediction(alt_total, ref_seq_total, max_range[variants[v_pos]])])



if len(output_pos)==0:
return (output_pos,output_data_0,output_data_1,output_data_total,alleles)
Expand Down
9 changes: 4 additions & 5 deletions scripts/indelCaller.py → nanocaller_src/indelCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
import numpy as np
import multiprocessing as mp
import tensorflow as tf
from model_architect_indel import *
from .model_architect_indel import *
from Bio import pairwise2
from generate_indel_pileups import get_indel_testing_candidates
from utils import *
from .generate_indel_pileups import get_indel_testing_candidates
from .utils import *

if type(tf.contrib) != type(tf): tf.contrib._warning = None
config = tf.compat.v1.ConfigProto()
Expand Down Expand Up @@ -136,8 +136,7 @@ def test_model(params,pool):
q=-100*np.log10(1e-6+batch_prob_all[j,0])
allele0_data, allele1_data,allele_total_data= batch_alleles_seq[j]

if batch_pred_all[j]==1:
if allele_total_data[0]:
if batch_pred_all[j]==1 and allele_total_data[0]:
gq=-100*np.log10(1+1e-6-batch_prob_all[j,1])
s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele_total_data[0], allele_total_data[1], q, gq)
f.write(s)
Expand Down
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions scripts/snpCaller.py → nanocaller_src/snpCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
import numpy as np
import multiprocessing as mp
import tensorflow as tf
from model_architect import *
from generate_SNP_pileups import get_snp_testing_candidates
from .model_architect import *
from .generate_SNP_pileups import get_snp_testing_candidates
from intervaltree import Interval, IntervalTree
from utils import *
from .utils import *

if type(tf.contrib) != type(tf): tf.contrib._warning = None
config = tf.ConfigProto()
Expand Down
File renamed without changes.

0 comments on commit d5f4b11

Please sign in to comment.