-
Notifications
You must be signed in to change notification settings - Fork 6
/
find_intron_distance_to_ATG.php
executable file
·269 lines (241 loc) · 11.7 KB
/
find_intron_distance_to_ATG.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#!/usr/bin/php
<?php
/*******************************************************************************
*
* FindIntronDistanceToATG analyzes the seq_gene.md file downloaded from:
* ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md.gz
* and generates a list of transcripts that have a very short distance between
* the ATG and the next intron. All transcripts where the distance is larger
* than 5 codons (15 bases), are discarded. This value is configurable.
* Transcripts that have no UTR annotated, and have a safe distance between the
* ATG and the first 3' intron, are reported elsewhere in a separate error file.
*
* Created : 2013-08-22
* Modified : 2013-10-01
* Version : 0.2
*
* Copyright : 2013-2015 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <[email protected]>
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
$_SETT =
array(
'version' => '0.2',
'maximum_distance' => 15, // In bases, the maximum distance to still be reported.
'output_suffix' => '.distance_intron_to_ATG.txt',
'output_suffix_error' => '.distance_intron_to_ATG_errors.txt',
'warnings' => array(
'first_exon' => 'WARNING: 5\' distance is estimate; transcript may be longer',
'no_UTR' => 'WARNING: No 5\' UTR annotated',
),
);
echo 'FindIntronDistanceToATG v.' . $_SETT['version'] . "\n\n";
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles);
if (count($aFiles) != 1) {
die('Usage: ' . $sScriptName . ' path_to_seq_gene.md' . "\n\n");
}
// Check if all files can be read.
foreach ($aFiles as $sFile) {
if (!is_readable($sFile)) {
die('Unable to open ' . $sFile . '.' . "\n");
}
}
// Checking if we are allowed to create the output file.
$sFileOut = $aFiles[0] . $_SETT['output_suffix'];
if (file_exists($sFileOut)) {
if (!is_writable($sFileOut)) {
die('Can not overwrite ' . $sFileOut . ', aborting.' . "\n");
}
} elseif (!is_writable(dirname($sFileOut))) {
die('Can not create ' . $sFileOut . ', aborting.' . "\n");
}
$sFileOutErr = $aFiles[0] . $_SETT['output_suffix_error'];
if (file_exists($sFileOutErr)) {
if (!is_writable($sFileOutErr)) {
die('Can not overwrite ' . $sFileOutErr . ', aborting.' . "\n");
}
} elseif (!is_writable(dirname($sFileOutErr))) {
die('Can not create ' . $sFileOutErr . ', aborting.' . "\n");
}
// Start by filtering using grep and cut, data directly into a variable (it's an 9.5MB string).
print('Parsing gene/exon locations file... ');
$aData = explode("\n", `grep -E "UTR|CDS" $aFiles[0] | grep GRCm38 | grep -v '|' | cut -f 3-5,12,14`);
$aTranscripts = array(); // array(transcript => array('UTRs' => array(array(5', 3'), ...), 'CDS' => array(5', 3'), 'distances' => array(5' distance, 3' distance), 'warning' => true|false);
foreach ($aData as $sLine) {
if (!trim($sLine)) {
continue;
}
list($nPosStart, $nPosEnd, $sStrand, $sType, $sTranscript) = explode("\t", $sLine);
if (!isset($aTranscripts[$sTranscript])) {
// Ignore non-coding transcripts, they don't have an ATG.
if (!preg_match('/^[NX]M_/', $sTranscript)) {
continue;
}
$aTranscripts[$sTranscript] =
array(
'UTRs' => array(),
'CDS' => array(),
'distances' => array(),
'warning' => 'first_exon', // Warning is on by default (because the - strand needs that).
);
}
if ($sStrand == '+' && !$aTranscripts[$sTranscript]['distances']) {
// Plus strand, and no distances calculated yet.
// 5' UTRs are stored, and after finding the first CDS we're done.
if ($sType == 'UTR') {
$aTranscripts[$sTranscript]['UTRs'][] = array($nPosStart, $nPosEnd);
} elseif ($sType == 'CDS') {
// If there is no distance between this (first) CDS and the previous UTR, we're in the same exon.
// The length of the previous UTR is then the upstream distance.
$nUTRs = count($aTranscripts[$sTranscript]['UTRs']);
if (!$nUTRs) {
// WTF! No UTRs before the CDS???
$aTranscripts[$sTranscript]['distances'][] = 0;
$aTranscripts[$sTranscript]['warning'] = 'no_UTR';
} elseif ($aTranscripts[$sTranscript]['UTRs'][$nUTRs-1][1] == ($nPosStart - 1)) {
// UTR joins directly to this CDS.
$aTranscripts[$sTranscript]['distances'][] = $aTranscripts[$sTranscript]['UTRs'][$nUTRs-1][1] - $aTranscripts[$sTranscript]['UTRs'][$nUTRs-1][0] + 1;
} else {
// UTR was in a different exon; exon starts with ATG.
$aTranscripts[$sTranscript]['distances'][] = 0;
$aTranscripts[$sTranscript]['warning'] = false;
}
$aTranscripts[$sTranscript]['distances'][] = $nPosEnd - $nPosStart + 1 - 3; // '0' means exon ends directly following the ATG.
// The number of UTRs define whether we throw a warning about the ATG in the first exon.
if ($nUTRs > 1) {
$aTranscripts[$sTranscript]['warning'] = false;
}
}
} elseif ($sStrand == '-') {
// Minus strand.
// We will ignore the first UTRs; every CDS gets stored, overwriting the previous one,
// when first encountering an UTR after the CDS we can calculate the distances.
if ($sType == 'UTR') {
// Only if we already have seen a CDS will we continue.
if (!$aTranscripts[$sTranscript]['CDS']) {
continue;
}
// UTR following an CDS. If this is the first UTR, its length will
// define the 5' distance. The length of the last CDS defines the 3'
// distance. If more UTRs follow, the "first exon" warning is
// dropped, unless this UTR does not connect perfectly to the CDS.
if (!$aTranscripts[$sTranscript]['distances']) {
// First UTR after the CDS.
if ($aTranscripts[$sTranscript]['CDS'][1] == ($nPosStart - 1)) {
// CDS joins directly to this UTR.
$aTranscripts[$sTranscript]['distances'][] = $nPosEnd - $nPosStart + 1; // '0' means exon ends directly following the ATG.
} else {
// CDS was in a different exon; exon starts with ATG.
$aTranscripts[$sTranscript]['distances'][] = 0;
$aTranscripts[$sTranscript]['warning'] = false;
}
$aTranscripts[$sTranscript]['distances'][1] = $aTranscripts[$sTranscript]['CDS'][1] - $aTranscripts[$sTranscript]['CDS'][0] + 1 - 3;
} else {
// We've seen the CDS, we've seen an UTR before. This is a
// non-coding exon, so we can remove the warning.
$aTranscripts[$sTranscript]['warning'] = false;
}
} elseif ($sType == 'CDS') {
// We store CDS, overwriting the previous one if present.
$aTranscripts[$sTranscript]['CDS'] = array($nPosStart, $nPosEnd);
}
}
}
print('Done, ' . count($aData) . ' lines parsed.' . "\n");
unset($aData);
// Now, we need to filter the data. Instead of removing transcripts from the big
// array based on their distance, I will just make a new list, easy to sort.
// Then I can remove $aTranscripts as a whole and then print the results.
$aData = array(); // array(transcript => array(5', 3', warning, transcript), ...);
foreach ($aTranscripts as $sTranscript => $aTranscript) {
$aDistance = $aTranscript['distances'];
if (!count($aDistance)) {
// Some genes do not have UTRs annotated. But for the genes in reverse I
// rely on the presence of the UTRs to be able to count the 5' distance.
// So I need to repeat the calculation...
// Only if we already have seen a CDS will we continue.
if (!$aTranscript['CDS']) {
// No CDSs either... :S
die('Impossible transcript: ' . $sTranscript . "\n");
}
// No distance between start of transcript and CDS.
$aDistance[] = 0;
$aDistance[] = $aTranscript['CDS'][1] - $aTranscript['CDS'][0] + 1 - 3;
$aTranscript['warning'] = 'no_UTR';
}
if (min($aDistance) <= $_SETT['maximum_distance']) {
$aData[$sTranscript] = array_merge($aDistance, array($aTranscript['warning']));
// 2013-09-25; OK, NOW I'm too lazy. I need to change the sorting algorithm
// AGAIN because sometimes it's returning 0, which makes the order of the
// elements undefined when the distances are exactly equal. However, that
// makes comparing the files completely unreliable, if run again on a
// different source file. So I always have to define the order. I need the
// transcript ID for it, but I can't use it because uasort() does not
// receive the key. So I just include the transcript ID as a value, which
// means I'm using more memory than necessary, but whatever...
$aData[$sTranscript][] = $sTranscript;
}
}
unset($aTranscripts);
// Sorting is not too easy, and I need a custom function for it.
print('Sorting... ');
uasort($aData, function ($a, $b) {
// Properly sort the results...
// Determine if a AND b sort on 5' or 3'. if a on 5' and b on 3', sort using that.
// otherwise, if sorting on the same thing, check the values.
if ($a[0] <= $a[1] && $b[0] > $b[1]) {
// 5' versus 3' preference.
return -1;
} elseif ($a[0] > $a[1] && $b[0] <= $b[1]) {
// 3' versus 5' preference.
return 1;
} elseif ($a[0] <= $a[1]) {
// 5' is shorter in A than 3', or equal.
return ($a[0] > $b[0]? -1 : ($a[0] < $b[0]? 1 : ($a[1] > $b[1]? -1 : ($a[1] < $b[1]? 1 : strcmp($a[3], $b[3])))));
} else {
// 3' shorter.
return ($a[1] < $b[1]? -1 : ($a[1] > $b[1]? 1 : ($a[0] < $b[0]? -1 : ($a[0] > $b[0]? 1 : strcmp($a[3], $b[3])))));
}
});
print('Done.' . "\n");
print('Writing output... ');
$fOut = @fopen($sFileOut, 'w');
if (!$fOut) {
die('Unable to open file for writing: ' . $sFileOut . '.' . "\n\n");
}
$fOutErr = @fopen($sFileOutErr, 'w');
if (!$fOutErr) {
die('Unable to open file for writing: ' . $sFileOutErr . '.' . "\n\n");
}
$i = 0;
fputs($fOut, '#transcript' . "\t" . '5\' distance' . "\t" . '3\' distance' . "\n");
fputs($fOutErr, '#transcript' . "\t" . '5\' distance' . "\t" . '3\' distance' . "\n");
foreach ($aData as $sTranscript => $aTranscript) {
// Unset unnecessary transcript ID.
unset($aTranscript[3]);
// Remove 'first_exon' warning when the 5' distance is big enough, anyways.
if ($aTranscript[2] == 'first_exon' && $aTranscript[0] > $_SETT['maximum_distance']) {
$aTranscript[2] = false;
}
// Transcripts without UTR and plenty of space 3' are moved to a different file, because we really can't say if they match our criteria or not.
if ($aTranscript[2] == 'no_UTR' && $aTranscript[1] > $_SETT['maximum_distance']) {
fputs($fOutErr, $sTranscript . "\t" . implode("\t", $aTranscript) . "\n");
continue;
}
if (isset($_SETT['warnings'][$aTranscript[2]])) {
// Replace code by human readable text.
$aTranscript[2] = $_SETT['warnings'][$aTranscript[2]];
}
fputs($fOut, $sTranscript . "\t" . implode("\t", $aTranscript) . "\n");
$i ++;
}
print('Done, ' . $i . ' transcripts found within threshold of ' . $_SETT['maximum_distance'] . '.' . "\n");
?>