-
Notifications
You must be signed in to change notification settings - Fork 1
/
modelbuilder
executable file
·87 lines (74 loc) · 2.6 KB
/
modelbuilder
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
import argparse
import glob
import math
import os
import openturns as ot # conda install -y openturns
from grimoire.genome import Reader
import isoform
def create_length_model(seqs, lmin, lmax):
# get the lengths
lens = [len(seq) for seq in seqs]
# train Frechet
sample = ot.Sample([[x] for x in lens if x < lmax])
f = ot.FrechetFactory().buildAsFrechet(sample)
a = f.getAlpha()
b = f.getBeta()
g = f.getGamma()
# create histogram from frechet
pdf = []
for x in range(lmax):
if x < g: pdf.append(0)
else:
z = (x-g)/b
pdf.append((a/b) * z**(-1-a) * math.exp(-z**-a))
# create leading zeros and rescale
for i in range(lmin): pdf[i] = 0
total = sum(pdf)
for i in range(len(pdf)): pdf[i] /= total
return pdf
#######
# CLI #
#######
parser = argparse.ArgumentParser(description='model builder for apc set')
parser.add_argument('apc', help='path to apc directory')
parser.add_argument('out', help='output directory')
parser.add_argument('--don', type=int, default=5, help="[%(default)i]")
parser.add_argument('--acc', type=int, default=6, help="[%(default)i]")
parser.add_argument('--emm', type=int, default=3, help="[%(default)i]")
parser.add_argument('--emin', type=int, default=25, help="[%(default)i]")
parser.add_argument('--emax', type=int, default=1000, help="[%(default)i]")
parser.add_argument('--imm', type=int, default=3, help="[%(default)i]")
parser.add_argument('--imin', type=int, default=35, help="[%(default)i]")
parser.add_argument('--imax', type=int, default=1000, help="[%(default)i]")
arg = parser.parse_args()
## collect subsequences
accs = []
dons = []
exons = []
introns = []
for ff in glob.glob(f'{arg.apc}/*.fa'):
gf = ff[:-2] + 'gff3'
genome = Reader(gff=gf, fasta=ff)
tx = next(genome).ftable.build_genes()[0].transcripts()[0]
for f in tx.exons: exons.append(f.seq_str())
for f in tx.introns:
iseq = f.seq_str()
dons.append(iseq[0:arg.don])
accs.append(iseq[-arg.acc:])
introns.append(iseq)
## build models
acc = isoform.create_pwm(accs)
don = isoform.create_pwm(dons)
emm = isoform.create_markov(exons, arg.emm, 0, 0)
imm = isoform.create_markov(introns, arg.imm, arg.don, arg.acc)
elen = create_length_model(exons, arg.emin, arg.emax)
ilen = create_length_model(introns, arg.imin, arg.imax)
## write models (hard-coded)
if not os.path.exists(arg.out): os.makedirs(arg.out)
isoform.write_pwm(f'{arg.out}/acc.pwm', acc)
isoform.write_pwm(f'{arg.out}/don.pwm', don)
isoform.write_markov(f'{arg.out}/exon.mm', emm)
isoform.write_markov(f'{arg.out}/intron.mm', imm)
isoform.write_len(f'{arg.out}/exon.len', elen)
isoform.write_len(f'{arg.out}/intron.len', ilen)