-
Notifications
You must be signed in to change notification settings - Fork 6
/
mm10_transcript_positions_create.php
executable file
·278 lines (251 loc) · 11.5 KB
/
mm10_transcript_positions_create.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/php
<?php
/*******************************************************************************
*
* This script generates the mm10_transcript_positions.txt file based on the
* alignment of the transcriptome FASTA sequences to the GENOME. The results of
* this alignment are stored in a SAM file, which should be passed to this script
* as the first argument.
*
* Created : 2013-08-22
* Modified : 2016-09-12
* Version : 0.3
*
* Copyright : 2013-2016 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <[email protected]>
*
* Changelog : 0.3 2016-09-12
* Added link to inc-lib-json.php to be compatible with PHP
* versions < 5.2.0.
* 0.2 2016-09-09
* Updated preg_match_all() call to be compatible with PHP
* versions < 5.4.0.
* 0.1 2013-09-05
* First version.
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
// Script-wide settings: the version shown in the banner, the (relative) names of the two
// output files, and the width in characters used when drawing the progress bar.
$_SETT = array(
    'version' => '0.3',
    'output' => 'mm10_transcript_positions.txt',
    'unsupported_transcripts_output' => 'transcriptome_alignment_unsupported_transcripts.txt',
    'terminal_width' => 100,
);

// Banner.
echo 'CreateTranscriptPositions v.', $_SETT['version'], "\n",
     'PLEASE DO NOT USE THIS SCRIPT ON A NETWORK DRIVE; IT CAN BE INCREDIBLY SLOW THERE.', "\n\n";

// The first command line entry is the script itself; whatever remains are the input files.
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles);
// Note: despite its name, $sCWD holds the directory part of the script's path.
$sCWD = dirname($sScriptName);

// PHP versions without a native json_encode() get it from the bundled library,
// provided that library sits next to this script and can be read.
$sJSONLibrary = $sCWD . '/inc-lib-json.php';
if (!function_exists('json_encode') && is_readable($sJSONLibrary)) {
    require $sJSONLibrary;
}

// Exactly one argument is expected: the SAM file.
if (count($aFiles) !== 1) {
    die('Usage: ' . $sScriptName . ' SAM_FILE' . "\n\n");
}
// Abort right away when an input file is missing or can not be read.
foreach ($aFiles as $sInputFile) {
    if (is_readable($sInputFile)) {
        continue;
    }
    die('Unable to open ' . $sInputFile . '.' . "\n");
}

// Make sure both output files can be written before doing any work: an existing file must
// be writable itself, a file that does not exist yet requires a writable directory.
$sFileOut = $_SETT['output'];
$sFileOutUnsupportedTranscripts = $_SETT['unsupported_transcripts_output'];
$aFilesOut = array($sFileOut, $sFileOutUnsupportedTranscripts);
foreach ($aFilesOut as $sTarget) {
    $bExists = file_exists($sTarget);
    if ($bExists && !is_writable($sTarget)) {
        die('Can not overwrite ' . $sTarget . ', aborting.' . "\n");
    }
    if (!$bExists && !is_writable(dirname($sTarget))) {
        die('Can not create ' . $sTarget . ', aborting.' . "\n");
    }
}
// Open the SAM file for reading; its size is remembered for the progress bar.
$sFileIn = $aFiles[0];
$fIn = fopen($sFileIn, 'r');
if ($fIn === false) {
    die('Unable to open file for reading: ' . $sFileIn . '.' . "\n\n");
}
$nFileSize = filesize($sFileIn);
$nBytesRead = 0;

// Open the main output file and write its three header lines.
$fOut = @fopen($sFileOut, 'w');
if ($fOut === false) {
    die('Unable to open file for writing: ' . $sFileOut . '.' . "\n\n");
}
fwrite($fOut, implode("\n", array(
    '# Generated ' . date('r') . ' by ' . $sScriptName,
    '# Transcriptome alignment to the genome taken from ' . $sFileIn . '.',
    implode("\t", array('# Transcript', 'Chr', 'Strand', 'Exon_positions')),
)) . "\n");

// Same for the file listing the transcripts that could not be used.
$fOutUnsupportedTranscripts = @fopen($sFileOutUnsupportedTranscripts, 'w');
if ($fOutUnsupportedTranscripts === false) {
    die('Unable to open file for writing: ' . $sFileOutUnsupportedTranscripts . '.' . "\n\n");
}
fwrite($fOutUnsupportedTranscripts, implode("\n", array(
    '# Generated ' . date('r') . ' by ' . $sScriptName,
    '# Transcriptome alignment to the genome taken from ' . $sFileIn . '.',
    implode("\t", array('# Transcript', 'Reason_for_failure')),
)) . "\n");

// State shared with the parsing loop.
// $aData: transcripts as keys, with an array (chromosome, strand, positions_encoded) as value.
$aData = array();
// $aUnsupportedTranscripts: transcripts as keys, and the reason for rejection as value
// (no alignment, no real chromosome, twice in file (=chimeric), bad mapping (Del, Ins)).
$aUnsupportedTranscripts = array();
$nLine = 0;        // Number of lines read so far.
$sTranscript = ''; // Transcript of the most recently parsed alignment line.
// Main loop: read the SAM file line by line. Header lines are skipped; every alignment line
// is either stored in $aData (transcript => chromosome, strand, JSON-encoded exon positions)
// or reported in $aUnsupportedTranscripts (transcript => reason), or both (see the note at
// the D/I case below).
// The result of fgets() is compared against false explicitly, so the loop can not stop early
// on a line that PHP considers falsy (such as a final "0" without a newline).
while (($sLine = fgets($fIn)) !== false) {
    $nLine ++;
    $nBytesRead += strlen($sLine);
    $sLine = rtrim($sLine);

    // Skip empty lines and SAM header lines (starting with '@').
    // Square brackets are used for the string offset; the curly brace form ($sLine{0}) is
    // deprecated as of PHP 7.4 and a parse error as of PHP 8.0.
    if (!$sLine || $sLine[0] == '@') {
        continue;
    }

    $sPreviousTranscript = $sTranscript;
    if (substr_count($sLine, "\t") < 5) {
        // This doesn't look like the requested format...
        die('Unable to parse file: ' . $sFileIn . ', line ' . $nLine . ':' . "\n" . $sLine . "\n\n");
    }

    // Columns: reference, bit flag, chromosome, position, mapping quality (unused), CIGAR.
    // All other columns are ignored.
    list($sReference, $nBitFlag, $sChromosome, $nPosition, , $sCIGAR) = explode("\t", $sLine);
    $nBitFlag = (int) $nBitFlag;

    // The transcript ID is the fourth '|'-separated field of the reference name.
    // A reference with fewer fields yields an empty transcript instead of an undefined offset.
    $aReference = explode('|', $sReference);
    $sTranscript = (isset($aReference[3])? $aReference[3] : '');

    /*
    flag:
    0 = forward
    1 = template having multiple segments in sequencing
    2 = each segment properly aligned according to the aligner
    4 = segment unmapped
    8 = next segment unmapped
    16 = reverse
    */
    $sStrand = ($nBitFlag & 16? '-' : '+');

    // If we see this reference twice in a row, we need to kill both instances.
    // An empty transcript is never treated as chimeric, so that unparsable references reach
    // the checks below instead of being silently filed as 'chimeric'.
    if ($sTranscript !== '' && $sTranscript === $sPreviousTranscript) {
        $aUnsupportedTranscripts[$sTranscript] = 'chimeric';
        unset($aData[$sPreviousTranscript]);
        continue;
    }

    if ($nBitFlag & 4) {
        // Could not be mapped, skip.
        $aUnsupportedTranscripts[$sTranscript] = 'no_alignment';
        continue;
    }

    if ($sTranscript && preg_match('/^chr([XYM]|\d{1,2})$/', $sChromosome) && $nPosition && preg_match('/^(\d+[MIDNSHP])+$/', $sCIGAR)) {
        // All seem OK. Store basic info first (chromosome without the 'chr' prefix).
        $sChromosome = substr($sChromosome, 3);
        $aData[$sTranscript] = array($sChromosome, $sStrand);

        // Convert the CIGAR string into small bits, for each section.
        preg_match_all('/(\d+)([MIDNSHP])/', $sCIGAR, $aMatches);

        // Now, loop the CIGAR sections to collect the exon's positions.
        $aExonPositions = array();
        $nAlignmentStart = (int) $nPosition;
        $nCurrentPosition = $nAlignmentStart;
        $nPrependBases = 0;
        $nAppendBases = 0;
        foreach ($aMatches[0] as $nSection => $sSection) {
            $n = (int) $aMatches[1][$nSection]; // Length of this section.
            $s = $aMatches[2][$nSection];       // CIGAR operation of this section.
            switch ($s) {
                case 'S':
                case 'H':
                    // Soft or Hard clipping; these bases don't align; the first or the last bases of the transcript.
                    // To compensate, the first/last exon is extended with the size of the clipping (done after this loop).
                    // According to the format, S and H can only be used at the start or the end of the CIGAR string.
                    if ($nCurrentPosition === $nAlignmentStart) {
                        // Nothing has been aligned yet, so we're at the start of the read.
                        $nPrependBases += $n;
                    } else {
                        $nAppendBases += $n;
                    }
                    break;
                case 'M':
                    // Match; these bases align. This is an exon, and should be counted that way.
                    $nStartPosition = $nCurrentPosition;
                    $nCurrentPosition += $n;
                    $nEndPosition = $nCurrentPosition - 1;
                    $aExonPositions[] = array($nStartPosition, $nEndPosition);
                    break;
                case 'N':
                    // No match; these bases don't align. This is an intron, and should be counted that way.
                    $nCurrentPosition += $n;
                    break;
                case 'D':
                case 'I':
                    // Not allowed; this indicates a change in sequence that we can not handle. This transcript is too different
                    // from the genomic sequence, and can not be used for transcriptome alignment. Report it!
                    if (!isset($aUnsupportedTranscripts[$sTranscript])) {
                        // Not reported before.
                        $aUnsupportedTranscripts[$sTranscript] = 'bad_alignment:' . $s;
                    } else {
                        // Reported before. Just append code.
                        $aUnsupportedTranscripts[$sTranscript] .= $s;
                    }
                    // In PHP, switch counts as a loop structure for continue, so "continue 2" moves
                    // on to the next CIGAR section; it does not skip the rest of this SAM line.
                    // NOTE(review): as a consequence, the transcript is reported as unsupported but
                    // its exon positions are still stored in $aData below, and D sections do not
                    // advance the current position. Confirm whether the transcript should be
                    // dropped from $aData instead.
                    continue 2;
                default:
                    // Unsupported modifier! This includes P, which is accepted by the pattern above but not handled here.
                    die('Can\'t parse line ' . $nLine . ', CIGAR string contains unknown modifier ' . $s . '.' . "\n\n");
            }
        }

        ////////////////////////////////////////////////////////////////////////////////
        /*
        NR_029642.1 0 chr7 3219189 40 82M
        NM_001104543.1 16 chr17 18243570 40 932M8736N124M2671N216M714N792M336N295M19025N156M14S
        CIGAR: CIGAR string. The CIGAR operations are given in the following table (set ‘*’ if unavailable):
        M alignment match (can be a sequence match or mismatch)
        I insertion to the reference
        D deletion from the reference
        N skipped region from the reference
        S soft clipping (clipped sequences present in SEQ)
        H hard clipping (clipped sequences NOT present in SEQ)
        P padding (silent deletion from padded reference)
        = sequence match
        X sequence mismatch
        • H can only be present as the first and/or last operation.
        • S may only have H operations between them and the ends of the CIGAR string.
        • For mRNA-to-genome alignment, an N operation represents an intron. For other types of
        alignments, the interpretation of N is not defined.
        • Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ.
        */

        // Now handle the clipping.
        if ($nPrependBases) {
            // Clipping at the start. Extend first exon.
            $aExonPositions[0][0] -= $nPrependBases;
        }
        if ($nAppendBases) {
            // Clipping at the end. Extend last exon.
            $aExonPositions[count($aExonPositions)-1][1] += $nAppendBases;
        }

        $aData[$sTranscript][] = json_encode($aExonPositions);

    } elseif (preg_match('/^chr(([XYM]|\d{1,2})_.+_random|Un_.+)$/', $sChromosome)) {
        // Unrecognized ("fake") chromosome...
        $aUnsupportedTranscripts[$sTranscript] = 'weird_alignment:' . $sChromosome;

    } else {
        die("\n" .
            'Can\'t parse line ' . $nLine . ':' . "\n" . $sLine . "\n\n");
    }

    // Redraw the progress bar every 1000 lines. Lines that ended in a "continue" above never get here.
    if (!($nLine % 1000)) {
        // Guard against a file size of 0 or false (e.g. when reading from a non-regular file)
        // and never let the bar run past 100%.
        $nPercentageRead = ($nFileSize? min(1, round($nBytesRead/$nFileSize, 2)) : 0);
        $nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen((string) $nLine);
        $lDone = (int) round($nPercentageRead*$nAvailableWidth);
        // The backspaces (chr(8)) move the cursor back so that the bar overwrites the previous one.
        print(str_repeat(chr(8), $_SETT['terminal_width']) .
            '[' . str_repeat('=', $lDone) . str_repeat(' ', $nAvailableWidth - $lDone) . '] ' . $nLine . ' ' . str_pad((string) round($nPercentageRead*100), 3, ' ', STR_PAD_LEFT) . '%');
    }
}
// Reading is finished: draw the completely filled progress bar over the previous one.
$nAvailableWidth = $_SETT['terminal_width'] - 8 - strlen((string) $nLine);
echo str_repeat("\x08", $_SETT['terminal_width']),
     '[', str_repeat('=', $nAvailableWidth), '] ', $nLine, ' 100%';
fclose($fIn);
echo "\n", 'Done reading ', $nLine, ' lines, writing output... ';

// The unsupported transcripts go first, sorted by transcript, one "transcript <tab> reason" per line.
ksort($aUnsupportedTranscripts, SORT_STRING);
foreach ($aUnsupportedTranscripts as $sUnsupportedID => $sWhy) {
    fwrite($fOutUnsupportedTranscripts, $sUnsupportedID . "\t" . $sWhy . "\n");
}
fclose($fOutUnsupportedTranscripts);

// Then the actual results, sorted by transcript: the transcript followed by its stored fields, tab-separated.
ksort($aData, SORT_STRING);
foreach ($aData as $sID => $aFields) {
    $aColumns = array_merge(array($sID), $aFields);
    fwrite($fOut, implode("\t", $aColumns) . "\n");
}
fclose($fOut);

// Summary.
echo 'Done.', "\n",
     'Identified ', count($aUnsupportedTranscripts), ' unsupported transcripts, wrote data for ', count($aData), ' transcripts to ', $sFileOut, '.', "\n";
?>