-
Notifications
You must be signed in to change notification settings - Fork 0
/
asm_gpu.cu
116 lines (98 loc) · 3.85 KB
/
asm_gpu.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#define block_size 16
#define alphabet_size 26
#define DEBUG 0
__device__ char POSSIBLE_CHAR[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
__global__ void compute_X(int *X, int *T, int n) {
int row = threadIdx.x;
for (int j = 0; j <= n; j++) {
if (j == 0)
X[row*(n+1)+j] = 0;
else if (T[j-1] == int(POSSIBLE_CHAR[row]))
X[row*(n+1)+j] = j;
else
X[row*(n+1)+j] = X[row*(n+1)+(j-1)];
}
}
__global__ void compute_Dist(int *Dist, int *X, int *T, int *P, int n, int m) {
int col = threadIdx.x;
for (int i = 0; i <= m; i++) {
if (i == 0)
Dist[i*(n+1)+col] = 0;
else if (col == 0)
Dist[i*(n+1)+col] = i;
else if (T[col-1] == P[i-1])
Dist[i*(n+1)+col] = Dist[(i-1)*(n+1)+(col-1)];
else if (X[(P[i-1]-int('A'))*(n+1) + col] == 0)
Dist[i*(n+1)+col] = 1 + min(Dist[(i-1)*(n+1)+col], min(Dist[(i-1)*(n+1)+(col-1)], i+col-1));
else // D[i-1, X[l,j]-1] + (j - 1 - X[l,j])
Dist[i*(n+1)+col] = 1 + min(min(Dist[(i-1)*(n+1)+col], Dist[(i-1)*(n+1)+(col-1)]), Dist[(i-1)*(n+1) + X[(P[i-1]-int('A'))*(n+1) + col] - 1] + (col-1-X[(P[i-1]-int('A'))*(n+1) + col]));
__syncthreads();
}
}
int main(int argc, char **argv) {
assert(argc == 4);
char *in_T = argv[1], *in_P = argv[2], *out = argv[3];
FILE *input_T, *input_P, *output;
input_T = fopen(in_T, "rb");
input_P = fopen(in_P, "rb");
int n, m;
int *T, *P, *device_T, *device_P;
fread(&n, 1, sizeof(int), input_T);
fread(&m, 1, sizeof(int), input_P);
T = (int*)malloc(sizeof(int)*n);
P = (int*)malloc(sizeof(int)*m);
// cudaMallocHost(&T, sizeof(int)*n);
// cudaMallocHost(&P, sizeof(int)*m);
fread(T, n, sizeof(int), input_T);
fread(P, m, sizeof(int), input_P);
# if DEBUG
printf("n = %d, m = %d\n", n, m);
for (int i = 0; i < n; i++)
printf("%d ", T[i]);
printf("\n");
for (int i = 0; i < m; i++)
printf("%d ", P[i]);
printf("\n");
# endif
fclose(input_T);
fclose(input_P);
int *host_Dist, *device_Dist, *host_X, *device_X;
// cudaMallocHost(&host_X, sizeof(int)*(n+1)*alphabet_size);
// cudaMallocHost(&host_Dist, sizeof(int)*(n+1)*(m+1));
host_X = (int*)malloc(sizeof(int)*(n+1)*alphabet_size);
host_Dist = (int*)malloc(sizeof(int)*(n+1)*(m+1));
cudaMalloc((void**)&device_Dist, sizeof(int)*(n+1)*(m+1));
cudaMalloc((void**)&device_X, sizeof(int)*(n+1)*alphabet_size);
cudaMalloc((void**)&device_T, sizeof(int)*n);
cudaMalloc((void**)&device_P, sizeof(int)*m);
cudaMemcpy(device_T, T, sizeof(int)*n, cudaMemcpyHostToDevice);
cudaMemcpy(device_P, P, sizeof(int)*m, cudaMemcpyHostToDevice);
// cudaMemcpy(device_Dist, host_Dist, sizeof(int)*n*m, cudaMemcpyHostToDevice);
// dim3 compute_X_nblocks((n+1)/block_size+1, m/block_size+1);
// dim3 compute_X_nthreads(block_size, block_size);
compute_X <<< 1, alphabet_size >>> (device_X, device_T, n);
compute_Dist <<< 1, n+1 >>> (device_Dist, device_X, device_T, device_P, n, m);
cudaMemcpy(host_Dist, device_Dist, sizeof(int)*(n+1)*(m+1), cudaMemcpyDeviceToHost);
cudaMemcpy(host_X, device_X, sizeof(int)*(n+1)*alphabet_size, cudaMemcpyDeviceToHost);
# if DEBUG
for (int i = 0; i < alphabet_size; i++) {
for (int j = 0; j < n+1; j++)
printf("%d ", host_X[i*(n+1)+j]);
printf("\n");
}
printf("\n");
for (int i = 0; i < m+1; i++) {
for (int j = 0; j < n+1; j++)
printf("%d ", host_Dist[i*(n+1)+j]);
printf("\n");
}
# endif
output = fopen(out, "wb");
fwrite(host_Dist, (n+1)*(m+1), sizeof(int), output);
fclose(output);
return 0;
}