-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeSeedsGraph.cpp
133 lines (107 loc) · 3.17 KB
/
makeSeedsGraph.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*
Load a set of seed files, one for each read, as vertices of the graph.
Two seeds a and b are connected by a directed edge if they are adjacent
seeds (in this order) on some read.
Each read is therefore represented by a path in the resulting graph.
After all reads are processed, remove nodes (seeds) that only appear
in one read.
Output the graph in dot format.
By: Ke@PSU
Last edited: 03/10/2023
*/
#include "util.h"
#include "SeedsGraph.hpp"
#include <sys/stat.h>
#include <iostream>
#include <fstream>
using namespace std;
typedef SeedsGraph<kmer> Graph;
typedef Graph::Node Node;
string kmerToString(const kmer& x, unsigned int k, char* buf){
decode(x, k, buf);
return string(buf);
}
inline Node* storeSeedWithPosInGraph(
kmer seed, const size_t read_idx, const size_t cur_pos,
const size_t cur_span,
size_t* prev_pos, Node* prev, Graph& g){
//avoid self loops -- already done at seed generation
//if(prev && prev->seed == seed) return prev;
Node* cur = g.addNode(seed);
if(prev){
prev->addNext(read_idx, *prev_pos, cur_span, cur);
cur->addPrev(read_idx, cur_pos, cur_span, prev);
}
*prev_pos = cur_pos;
return cur;
}
void loadSubseqSeeds(const char* filename, const size_t read_idx, Graph& g){
FILE* fin = fopen(filename, "rb");
Seed s;
size_t prev_pos;
Node* prev=nullptr;
Node *head=nullptr, *tail=nullptr;
if(fread(&s, sizeof(s), 1, fin) == 1){
//first node
prev = g.addNode(s.v);
prev_pos = s.pos;
head = tail = prev;
//following nodes
while(fread(&s, sizeof(s), 1, fin) == 1){
prev = storeSeedWithPosInGraph(s.v, read_idx, s.pos, s.span,
&prev_pos, prev, g);
tail = prev;
}
g.addReadPath(read_idx, head, tail);
}
if(ferror(fin)){
fprintf(stderr, "Error reading %s/n", filename);
}
fclose(fin);
}
int main(int argc, const char * argv[])
{
if(argc != 4){
printf("usage: makeSeedsGraph.out seedsDir k numFiles\n");
return 1;
}
unsigned int n = atoi(argv[3]);
unsigned int k = atoi(argv[2]);
char filename[500];
unsigned int dir_len = strlen(argv[1]);
memcpy(filename, argv[1], dir_len);
if(filename[dir_len-1] != '/'){
filename[dir_len] = '/';
++dir_len;
}
Graph g(n);
size_t j;
//load all seeds
struct stat test_file;
for(j=1; j<=n; j+=1){
sprintf(filename+dir_len, "%zu.subseqseed", j);
if(stat(filename, &test_file) != 0){//seed file does not exist
fprintf(stderr, "Stopped, cannot find file %zu.subseqseed\n", j);
break;
}
loadSubseqSeeds(filename, j, g);
}
//only keep reads that appear on multiple distinct reads
g.removeUniqSeeds();
//output to dot file
sprintf(filename+dir_len, "overlap-n%d-graph.dot", n);
char buf[k+1];
buf[k] = '\0';
g.saveGraphToDot(filename, kmerToString, k, buf);
//save graph to binary file
sprintf(filename+dir_len, "overlap-n%d.graph", n);
g.saveGraph(filename);
//test save and load graph produce an identical copy
/*
Graph g2;
g2.loadGraph(filename);
sprintf(filename+dir_len, "overlap-n%d-graph.dot.cp", n);
g2.saveGraphToDot(filename, kmerToString, k, buf);
*/
return 0;
}