-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeafg.c
454 lines (387 loc) · 15.8 KB
/
makeafg.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
/*
makeafg 256seq.fastq 256seq.fastq-NEWCONTIG
*/
// Take a contig as a simple text string, and the file of reads that it was extracted from, and generate a .afg
// matching up all the reads which match 100% (or with a small allowed number of errors) so that it can be viewed
// in 'tablet' etc.
// NOT YET WRITTEN. CURRENLT THIS IS THE CODE FROM glocate.c
#define _FILE_OFFSET_BITS 64
typedef unsigned long long EDGE;
typedef unsigned long long INDEX;
#define ENDS_WORD (1ULL<<63ULL)
#define EDGE_MASK (ENDS_WORD-1ULL)
#define MAX_LINE 1024
#define MIN_OVERLAP 13
//static INDEX MAX_SIZE = ((INDEX)0L);
#define ROOT_CELL ((INDEX)1L)
// Node 0 is unused, 0 is needed as a terminator.
// originally I used 'next_free_edge' and it was exclusive. Have now changed that
// to 'last_used_edge' which is inclusive. getting a free edge is now a function
// call that we can make atomic with an OMP PRAGMA.
typedef struct node {
// ACGT maps to 0 1 2 3, anything else such as 'N' is 4
EDGE edge[5];
} CELL;
CELL *trie_cell;
static INDEX last_used_edge; // inclusive
#define _A_ 0
#define _C_ 1
#define _G_ 2
#define _T_ 3
#define _N_ 4
char *trt = "ACGTN"; // translate table back from the above
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>
#define _XOPEN_SOURCE 500
#include <unistd.h>
ssize_t pread(int fd, void *buf, size_t count, off_t offset);
static int freq[256];
static long long *read_sequence_no_to_file_offset, contig_size;
static FILE *read_file, *gene_file, *afg_contig_file, *afg_tle_file, *afg_reads_file;
static int trie_fd = -1, index_fd = -1;
static off_t file_length;
static char gene_file_name[MAX_LINE];
static char trie_file_name[MAX_LINE];
static char index_file_name[MAX_LINE];
static char *stringat(long long textp)
{
char line[MAX_LINE], qual[MAX_LINE];
char *s;
(void)fseek(read_file, textp, SEEK_SET);
s = fgets(line, MAX_LINE, read_file);
fgets(line, MAX_LINE, read_file);
s = strchr(line, '\n'); if (s) *s = '\0'; // trim trailing newline
fgets(qual, MAX_LINE, read_file);
s = fgets(qual, MAX_LINE, read_file);
s = strchr(qual, '\n'); if (s) *s = '\0'; // trim trailing newline
strcat(line, ";");
strcat(line, qual);
return strdup(line);
}
void RED(FILE *f, long long read_id, char *seq, char *qlt);
void TLE(long long read_id, char *seq, int overlap_len, long long offset);
void walk_trie(INDEX trie_index, int trie_fd, int index_fd, int offset)
{
char *s;
int e, sp;
INDEX edge;
CELL tmp;
CELL *this;
if (trie_cell && (trie_index <= last_used_edge)) {
this = &trie_cell[trie_index];
} else {
ssize_t rc = pread(trie_fd, &tmp, sizeof(CELL), trie_index*sizeof(CELL));
if (rc != sizeof(CELL)) {
fprintf(stderr, "overlap: failed to fetch %ld bytes from offset 0x%llx on file %d, rc = %ld\n", sizeof(CELL), (long long)trie_index*sizeof(CELL), trie_fd, rc); exit(1);
exit(1);
}
this = &tmp;
}
for (e = 0; e < 5; e++) {
edge = this->edge[e]&EDGE_MASK;
if (this->edge[e]&ENDS_WORD) {
long long location;
char *q;
if (read_sequence_no_to_file_offset == NULL) {
ssize_t rc;
rc = pread(index_fd, &location, sizeof(long long), edge*sizeof(long long));
if (rc != sizeof(long long)) {
fprintf(stderr, "overlap: failed to fetch %ld bytes from offset 0x%llx on file %d, rc = %ld\n", sizeof(long long), (long long)edge*sizeof(long long), index_fd, rc); exit(1);
exit(1);
}
} else {
location = read_sequence_no_to_file_offset[edge];
}
s = stringat(location);q = strchr(s, ';'); *q++ = '\0';
for (sp = 0; sp < offset; sp++) fputc(' ', stdout);
fprintf(stdout, "%s (read #%lld)", s, edge);
fprintf(stdout, " %c", s[strlen(s)-offset]);
freq[(int)(s[strlen(s)-offset])] += 1;
RED(afg_reads_file, edge, s, q);
TLE(edge, s, offset, contig_size);
free(s); s = NULL;
fputc('\n', stdout);
} else {
if (edge != 0LL) walk_trie(edge, trie_fd, index_fd, offset);
}
}
}
static CELL *fetch_trie_cell(INDEX idx) {
static CELL tmp; // NOT THREAD SAFE. Only one call at a time.
ssize_t rc;
rc = pread(trie_fd, &tmp, sizeof(CELL), idx*sizeof(CELL));
if (rc != sizeof(CELL)) {
fprintf(stderr, "glocate: failed to fetch %d bytes from offset 0x%llx on file %d, rc = %d\n",
(int)sizeof(CELL), idx*sizeof(CELL), trie_fd, (int)rc); exit(1);
exit(1);
}
return &tmp;
}
long long lookup_read(INDEX trie_index, char *s)
{
int c;
INDEX edge;
CELL *this;
if (trie_cell && (trie_index <= last_used_edge)) {
this = &trie_cell[trie_index];
} else {
this = fetch_trie_cell(trie_index);
}
c = *s;
if (c == '\0') {
//fprintf(stderr, "This is a prefix of a string or strings at trie_cell[%lld]\n", trie_index);
return trie_index;
}
//fprintf(stderr, "Match: %c @%lld\n", c, trie_index);
if (c == 'A') c = _A_;
else if (c == 'C') c = _C_;
else if (c == 'G') c = _G_;
else if (c == 'T') c = _T_;
else if (c == 'N') c = _N_;
else {
fprintf(stderr, "glocate: bad character '%c' at %s\n", c, s);
c = _N_; // some other char
}
edge = this->edge[c]&EDGE_MASK;
if (edge == 0LL) return 0LL;
if (this->edge[c]&ENDS_WORD) {
if (*(s+1) != '\0') {
fprintf(stderr, "warning: target string is longer than the reads in this database - excess is: %s\n", s+1);
}
// assert(*s == '\0');
return edge;
} else if (!(this->edge[c]&ENDS_WORD)) {
return lookup_read(edge, s+1);
}
return 0LL; // NOT FOUND.
}
// Eliminate duplicates and sort into numerical order. Doesn't scale well due to external sort program,
// but as long as we're just exploring individual contigs and not assembling the whole genome, this should
// work fine. And if we were assembling the whole genome, we'd generate the reads.afg by translating our
// fastq file with no need to eliminate duplicates
// sort -u < 256seq.fastq-GTGTCCATTATGTTGAACAAGAACAGTTCCATCTCCTTTT-reads.afg |sort -n -k2 -t:|tr ' ' '\n'> ZZZ;mv ZZZ 256seq.fastq-GTGTCCATTATGTTGAACAAGAACAGTTCCATCTCCTTTT-reads.afg
void RED(FILE *f, long long read_id, char *seq, char *qlt)
{ // just the raw reads
fprintf(f, "{RED ");
fprintf(f, "iid:%lld ", read_id);
fprintf(f, "eid:%lld ", read_id);
fprintf(f, "seq: %s . ", seq);
fprintf(f, "qlt: %s . ", qlt);
fprintf(f, "}\n");
}
void CTG_begin(void)
{ // Contig_t: Sequence_t
static int next = 0;
next++;
fprintf(afg_contig_file, "{CTG\n");
fprintf(afg_contig_file, "iid:%d\n", next);
fprintf(afg_contig_file, "eid:%d-0\n", next);
fprintf(afg_contig_file, "seq:\n");
// output the contig here
}
void CTG_end(char *firstq,long long contig_length) {
long long i;
fprintf(afg_contig_file, "\n.\n");
fprintf(afg_contig_file, "qlt:\n");
// output the quality info here. We'll fake it.
for (i = 0LL-strlen(firstq); i < contig_length; i++) {
fputc(64, afg_contig_file);
if (((i % 80LL) == 79LL) && (i+1LL != contig_length)) fputc('\n', afg_contig_file);
}
fprintf(afg_contig_file, "\n.\n");
// now insert all the tiling info that was writen to the .tle file ...
fprintf(afg_contig_file, "}\n");
}
// sort -n -t: -k2 < 256seq.fastq-GTGTCCATTATGTTGAACAAGAACAGTTCCATCTCCTTTT-tle.afg > ZZZ;mv ZZZ 256seq.fastq-GTGTCCATTATGTTGAACAAGAACAGTTCCATCTCCTTTT-tle.afg
// and pass through only the highest overlap for each read
void TLE(long long read_id, char *seq, int overlap_len, long long offset) {
fprintf(afg_tle_file, "{TLE ");
fprintf(afg_tle_file, "src:%lld ", read_id);
fprintf(afg_tle_file, "off:%lld ", offset+overlap_len);
fprintf(afg_tle_file, "clr:0,%03d ", (int)(strlen(seq)-overlap_len));
fprintf(afg_tle_file, "}\n");
}
int main(int argc, char **argv)
{
char *target, *target_tail;
long long trie_index;
int loops, indent; //, len
char most_frequent;
if (argc != 3) {
fprintf(stderr, "syntax: glocate file.fastq ACTUAL_READ\n");
exit(EXIT_FAILURE);
}
read_file = fopen(argv[1], "r");
if (read_file == NULL) {
fprintf(stderr, "glocate: cannot access fastq file %s - %s\n", argv[1], strerror(errno));
exit(EXIT_FAILURE);
}
sprintf(gene_file_name, "%s-%s", argv[1], argv[2]);
gene_file = fopen(gene_file_name, "w");
if (gene_file == NULL) {
fprintf(stderr, "glocate: cannot write contig file %s - %s\n", gene_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
sprintf(gene_file_name, "%s-%s-reads.afg", argv[1], argv[2]);
afg_reads_file = fopen(gene_file_name, "w");
if (afg_reads_file == NULL) {
fprintf(stderr, "glocate: cannot write afg reads file %s - %s\n", gene_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
sprintf(gene_file_name, "%s-%s-contig.afg", argv[1], argv[2]);
afg_contig_file = fopen(gene_file_name, "w");
if (afg_contig_file == NULL) {
fprintf(stderr, "glocate: cannot write afg contig file %s - %s\n", gene_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
sprintf(gene_file_name, "%s-%s-tle.afg", argv[1], argv[2]);
afg_tle_file = fopen(gene_file_name, "w");
if (afg_tle_file == NULL) {
fprintf(stderr, "glocate: cannot write afg contig file %s - %s\n", gene_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
// the three afg files are later concatenated into a single afg file suitable for use in 'tablet'
sprintf(trie_file_name, "%s-edges", argv[1]);
trie_fd = open(trie_file_name, O_RDONLY);
if (trie_fd < 0) {
fprintf(stderr, "glocate: cannot access trie file %s - %s\n", trie_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
sprintf(index_file_name, "%s-index", argv[1]);
index_fd = open(index_file_name, O_RDONLY);
if (index_fd < 0) {
fprintf(stderr, "glocate: cannot access trie file %s - %s\n", index_file_name, strerror(errno));
exit(EXIT_FAILURE);
}
file_length = lseek(trie_fd, (off_t)0LL, SEEK_END);
//fprintf(stderr, "trie: %lld entries\n", (long long)file_length/sizeof(CELL)-1LL);
trie_cell = mmap(NULL, (size_t)file_length, PROT_READ, MAP_PRIVATE/*SHARED*/, trie_fd, (off_t)0LL);
if ((trie_cell == NULL) || (trie_cell == (void *)-1)) {
//fprintf(stderr, "glocate: failed to map %s - %s - so using direct access instead\n", trie_file_name, strerror(errno));
trie_cell = NULL;
last_used_edge = -1; // force all accesses via disk.
// later can load as much as possible into ram, thuis caching the majority of accesses.
}
file_length = lseek(index_fd, (off_t)0LL, SEEK_END);
last_used_edge = (long long)file_length/sizeof(long long)-1LL;
//fprintf(stderr, "index: %lld entries (last_used_edge = %lld)\n", last_used_edge-1, last_used_edge);
read_sequence_no_to_file_offset = mmap(NULL, (size_t)file_length, PROT_READ, MAP_SHARED, index_fd, (off_t)0LL);
if ((read_sequence_no_to_file_offset == NULL) || (read_sequence_no_to_file_offset == (void *)-1)) {
fprintf(stderr, "glocate: failed to map %s - %s - so using direct access instead\n", trie_file_name, strerror(errno));
read_sequence_no_to_file_offset = NULL;
} else {
//fprintf(stderr, "Successfully mmap'd sequence_number_to_file_offset[%d]\n", (int)((size_t)file_length/sizeof(INDEX))-1);
}
target = strdup(argv[2]);
fprintf(stdout, "%s\n", target);
fprintf(gene_file, "%s", target);
contig_size = 0LL;
CTG_begin();
fprintf(afg_contig_file, "%s", target);
loops = 0;
for (;;) {
target_tail = target;
freq['C'] = freq['G'] = freq['A'] = freq['T'] = freq['N'] = 0;
indent = 0;
trie_index = lookup_read(ROOT_CELL, target_tail);
if (trie_index != 0ULL) {
// output root read in various forms
char *s, *q;
//int sp;
INDEX edge = trie_index;
long long location;
if (read_sequence_no_to_file_offset == NULL) {
ssize_t rc;
rc = pread(index_fd, &location, sizeof(long long), edge*sizeof(long long));
if (rc != sizeof(long long)) {
fprintf(stderr, "overlap: failed to fetch %ld bytes from offset 0x%llx on file %d, rc = %ld\n", sizeof(long long), (long long)edge*sizeof(long long), index_fd, rc); exit(1);
exit(1);
}
} else {
location = read_sequence_no_to_file_offset[edge];
}
s = stringat(location);q = strchr(s, ';'); *q++ = '\0';
fprintf(stdout, "%s (read #%lld)", s, edge);
RED(afg_reads_file, edge, s, q);
TLE(edge, s, 0, contig_size);
free(s); s = NULL;
fputc('\n', stdout);
}
for (;;) { // we only look one character ahead
target_tail += 1;
loops += 1; indent += 1;
// next_char = 0;
trie_index = lookup_read(ROOT_CELL, target_tail);
if (trie_index == 0ULL) continue;
walk_trie(trie_index, trie_fd, index_fd, indent);
if (strlen(target_tail) < 16) break;
}
// this is crude. What we need here is to identify buckets of frequencies, taking into
// account the random sampling rate. We need to be able to identify when there is a fork
// in the road, so that the user can make a manual decision which branch to follow.
most_frequent = 'N';
if ((freq['C'] >= freq['G']) && (freq['C'] >= freq['A']) && (freq['C'] >= freq['T'])) most_frequent = 'C';
if ((freq['G'] >= freq['C']) && (freq['G'] >= freq['A']) && (freq['G'] >= freq['T'])) most_frequent = 'G';
if ((freq['A'] >= freq['C']) && (freq['A'] >= freq['G']) && (freq['A'] >= freq['T'])) most_frequent = 'A';
if ((freq['T'] >= freq['C']) && (freq['T'] >= freq['G']) && (freq['T'] >= freq['A'])) most_frequent = 'T';
{
int slop_factor = freq[(int)most_frequent]/8;
if (most_frequent == 'C') {
if ((freq['G']+slop_factor >= freq['C']) || (freq['A']+slop_factor >= freq['C']) || (freq['T']+slop_factor >= freq['C'])) { // (typo fixed! - rerun last results!)
most_frequent = 'N';
}
} else if (most_frequent == 'G') {
if ((freq['C']+slop_factor >= freq['G']) || (freq['A']+slop_factor >= freq['G']) || (freq['T']+slop_factor >= freq['G'])) {
most_frequent = 'N';
}
} else if (most_frequent == 'A') {
if ((freq['C']+slop_factor >= freq['A']) || (freq['G']+slop_factor >= freq['A']) || (freq['T']+slop_factor >= freq['A'])) {
most_frequent = 'N';
}
} else if (most_frequent == 'T') {
if ((freq['C']+slop_factor >= freq['T']) || (freq['G']+slop_factor >= freq['T']) || (freq['A']+slop_factor >= freq['T'])) {
most_frequent = 'N';
}
}
if (most_frequent == 'C') {
if (freq['G']+freq['A']+freq['T'] > freq[(int)most_frequent]-slop_factor) {
most_frequent = 'N';
}
} else if (most_frequent == 'G') {
if (freq['C']+freq['A']+freq['T'] > freq[(int)most_frequent]-slop_factor) {
most_frequent = 'N';
}
} else if (most_frequent == 'A') {
if (freq['C']+freq['G']+freq['T'] > freq[(int)most_frequent]-slop_factor) {
most_frequent = 'N';
}
} else if (most_frequent == 'T') {
if (freq['C']+freq['G']+freq['A'] > freq[(int)most_frequent]-slop_factor) {
most_frequent = 'N';
}
}
}
memmove(target, target+1, strlen(target+1));
target[strlen(target)-1] = most_frequent;
if (most_frequent == 'N') break;
fputc(most_frequent, gene_file); fflush(gene_file);
fputc(most_frequent, afg_contig_file); if ((contig_size % 80LL) == 79LL) fputc('\n', afg_contig_file); fflush(afg_contig_file); contig_size += 1LL;
{ // Warn if there is any other significant branching choice, even if it is not as common as the primary one...
fprintf(stdout, "#%c c: %d g: %d a: %d t: %d @%d\n%s\n",
most_frequent, freq['C'], freq['G'], freq['A'], freq['T'], contig_size, target);
}
}
CTG_end(target, contig_size);
fprintf(stdout, "#N c: %d g: %d a: %d t: %d\n", freq['C'], freq['G'], freq['A'], freq['T']);
fprintf(stderr, "Exited with target = \"%s\", c: %d g: %d a: %d t: %d\n", target, freq['C'], freq['G'], freq['A'], freq['T']);
exit(EXIT_SUCCESS);
return EXIT_FAILURE;
}