# wlcss_pycuda.py
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context on import
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# WarpingLCSS (WLCSS) on the GPU: one thread block per (template, stream)
# pair, one thread per parameter set (reward, penalty, accepted distance).
mod = SourceModule("""
    __global__ void wlcss_cuda_kernel(int32_t *d_mss, int32_t *d_mss_offsets,
                                      int32_t *d_ts, int32_t *d_ss,
                                      int32_t *d_tlen, int32_t *d_toffsets,
                                      int32_t *d_slen, int32_t *d_soffsets,
                                      int32_t *d_params){
        int params_idx = threadIdx.x;   // parameter set
        int template_idx = blockIdx.x;  // template
        int stream_idx = blockIdx.y;    // stream
        int t_len = d_tlen[template_idx];
        int s_len = d_slen[stream_idx];
        int t_offset = d_toffsets[template_idx];
        int s_offset = d_soffsets[stream_idx];
        // Start of this (params, template, stream) score array within d_mss.
        int d_mss_offset = d_mss_offsets[params_idx*gridDim.x*gridDim.y
                                         + template_idx*gridDim.y + stream_idx];
        // Rolling buffer over the DP matrix: at iteration (i, j), tmp_window[i]
        // holds the diagonal cell (i-1, j-1), tmp_window[i+1] the cell above
        // (i, j-1), and tmp_window[t_len+1] the cell to the left (i-1, j).
        // Device-side new/delete draws from the device heap, so very long
        // templates may need a larger heap (cudaDeviceSetLimit).
        int32_t *tmp_window = new int32_t[(t_len + 2)]();
        int32_t *t = &d_ts[t_offset];
        int32_t *s = &d_ss[s_offset];
        int32_t *mss = &d_mss[d_mss_offset];
        int32_t reward = d_params[params_idx*3];
        int32_t penalty = d_params[params_idx*3+1];
        int32_t accepteddist = d_params[params_idx*3+2];
        int32_t tmp = 0;
        for(int32_t j=0;j<s_len;j++){
            for(int32_t i=0;i<t_len;i++){
                int32_t distance = abs(s[j]-t[i]);
                if (distance <= accepteddist){
                    // Match: extend the alignment diagonally with a reward.
                    tmp = tmp_window[i]+reward;
                } else{
                    // Mismatch: best of diagonal, above and left moves, each
                    // paying a penalty weighted by the distance.
                    tmp = max(tmp_window[i]-penalty*distance,
                              max(tmp_window[i+1]-penalty*distance,
                                  tmp_window[t_len+1]-penalty*distance));
                }
                tmp_window[i] = tmp_window[t_len+1];
                tmp_window[t_len+1] = tmp;
            }
            tmp_window[t_len] = tmp_window[t_len+1];
            // Score of the full template against the stream up to sample j.
            mss[j] = tmp_window[t_len+1];
            tmp_window[t_len+1] = 0;
        }
        delete [] tmp_window;
    }
""")
def compute_wlcss(templates, streams, params):
    """Return, per parameter set, a (num_streams, num_templates) matrix of WLCSS scores."""
    wlcss_pycuda = mod.get_function("wlcss_cuda_kernel")
    h_t = templates
    h_s = streams
    h_params = np.array(params).astype(np.int32)
    num_templates = len(h_t)  # Num blocks on X
    num_streams = len(h_s)  # Num blocks on Y
    num_params_sets = len(h_params)  # Num threads per block
    # Lengths and start offsets of the flattened templates and streams.
    h_tlen = np.array([len(t) for t in h_t]).astype(np.int32)
    h_toffsets = np.cumsum(h_tlen).astype(np.int32)
    h_toffsets = np.insert(h_toffsets[0:-1], 0, 0)
    h_slen = np.array([len(s) for s in h_s]).astype(np.int32)
    h_soffsets = np.cumsum(h_slen).astype(np.int32)
    h_soffsets = np.insert(h_soffsets[0:-1], 0, 0)
    h_ts = np.array([item for sublist in h_t for item in sublist]).astype(np.int32)  # Templates as one flat array
    h_ss = np.array([item for sublist in h_s for item in sublist]).astype(np.int32)  # Streams as one flat array
    # One score per stream sample for every (params, template, stream) triple.
    h_mss = np.zeros(len(h_ss) * num_params_sets * num_templates).astype(np.int32)
    d_mss = drv.mem_alloc(h_mss.nbytes)
    drv.memcpy_htod(d_mss, h_mss)
    # Start offsets of the per-triple score arrays, params-major, then
    # template, then stream -- the same order the kernel uses. The extra
    # trailing entry marks the end of the last array.
    h_mss_offsets = np.cumsum(np.tile(h_slen, num_params_sets * num_templates)).astype(np.int32)
    h_mss_offsets = np.insert(h_mss_offsets, 0, 0)
    wlcss_pycuda(d_mss, drv.In(h_mss_offsets),
                 drv.In(h_ts), drv.In(h_ss),
                 drv.In(h_tlen), drv.In(h_toffsets),
                 drv.In(h_slen), drv.In(h_soffsets),
                 drv.In(h_params),
                 block=(num_params_sets, 1, 1), grid=(num_templates, num_streams))
    h_mss = np.empty_like(h_mss)  # already int32, no further cast needed
    drv.memcpy_dtoh(h_mss, d_mss)
    # Keep only the last score of each stream, i.e. the score of the full
    # template against the full stream.
    tmp_mss = np.array([h_mss[offset - 1] for offset in h_mss_offsets[1:]])
    # Reassemble one (num_streams, num_templates) matrix per parameter set;
    # the Fortran-order reshape undoes the template-major flattening.
    mss = [np.reshape(np.ravel(x), (num_streams, num_templates), order='F') for x in
           np.reshape(tmp_mss, (num_params_sets, num_streams, num_templates))]
    return mss
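
# A minimal usage sketch (illustrative values, not tuned for any dataset):
# each parameter row is [reward, penalty, accepted distance], matching the
# kernel's d_params layout.
if __name__ == "__main__":
    example_templates = [[1, 2, 3, 4], [2, 3, 4]]
    example_streams = [[1, 2, 2, 3, 4, 4, 5], [5, 4, 3, 2, 1]]
    example_params = [[8, 1, 0], [8, 2, 1]]
    scores = compute_wlcss(example_templates, example_streams, example_params)
    for p_idx, score_matrix in enumerate(scores):
        # score_matrix[s, t] is the score of stream s against template t.
        print("params set", p_idx)
        print(score_matrix)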