forked from itzmeanjan/ff-gpu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbench_ntt.cpp
153 lines (119 loc) · 4.46 KB
/
bench_ntt.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#include "bench_ntt.hpp"

#include <cstdlib>
#include <memory>
#include <new>
// Times one run of `forward_transform` on a `dim`-element vector populated by
// `prepare_random_vector`.
//
// Host allocation and input preparation happen before the clock starts. The
// measured interval spans the inner scope in which both buffers are created,
// the transform is invoked and both buffers are destroyed again.
// NOTE(review): buf_1d_u64_t is presumably sycl::buffer<uint64_t, 1>; if so,
// leaving the inner scope waits for outstanding work on the buffers and
// writes their contents back to host memory, so that cost is part of the
// reported time — confirm against bench_ntt.hpp.
//
// @param q        queue passed on to `forward_transform`
// @param dim      number of uint64_t elements in the source/destination vectors
// @param wg_size  forwarded unchanged to `forward_transform`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a host allocation fails
int64_t
benchmark_forward_transform(sycl::queue& q,
                            const uint64_t dim,
                            const uint64_t wg_size)
{
  // Owning handles: host memory is released on every exit path, including
  // when buffer construction or the transform throws.
  auto release = [](uint64_t* ptr) { std::free(ptr); };
  using host_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  host_mem_t vec_src{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };
  host_mem_t vec_fwd{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };

  // malloc(0) may legitimately return nullptr, hence the `dim` guard.
  if (dim != 0 && (!vec_src || !vec_fwd)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_src.get(), dim);

  tp start = std::chrono::steady_clock::now();
  {
    buf_1d_u64_t buf_vec_src{ vec_src.get(), sycl::range<1>{ dim } };
    buf_1d_u64_t buf_vec_fwd{ vec_fwd.get(), sycl::range<1>{ dim } };

    forward_transform(q, buf_vec_src, buf_vec_fwd, dim, wg_size);
  }
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}
// Times one run of `inverse_transform` on a `dim`-element vector populated by
// `prepare_random_vector`.
//
// Host allocation and input preparation happen before the clock starts. The
// measured interval spans the inner scope in which both buffers are created,
// the transform is invoked and both buffers are destroyed again.
// NOTE(review): buf_1d_u64_t is presumably sycl::buffer<uint64_t, 1>; if so,
// leaving the inner scope waits for outstanding work on the buffers and
// writes their contents back to host memory, so that cost is part of the
// reported time — confirm against bench_ntt.hpp.
//
// @param q        queue passed on to `inverse_transform`
// @param dim      number of uint64_t elements in the source/destination vectors
// @param wg_size  forwarded unchanged to `inverse_transform`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a host allocation fails
int64_t
benchmark_inverse_transform(sycl::queue& q,
                            const uint64_t dim,
                            const uint64_t wg_size)
{
  // Owning handles: host memory is released on every exit path, including
  // when buffer construction or the transform throws.
  auto release = [](uint64_t* ptr) { std::free(ptr); };
  using host_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  host_mem_t vec_src{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };
  host_mem_t vec_inv{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };

  // malloc(0) may legitimately return nullptr, hence the `dim` guard.
  if (dim != 0 && (!vec_src || !vec_inv)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_src.get(), dim);

  tp start = std::chrono::steady_clock::now();
  {
    buf_1d_u64_t buf_vec_src{ vec_src.get(), sycl::range<1>{ dim } };
    buf_1d_u64_t buf_vec_inv{ vec_inv.get(), sycl::range<1>{ dim } };

    inverse_transform(q, buf_vec_src, buf_vec_inv, dim, wg_size);
  }
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}
// Times one run of `cooley_tukey_fft` on a `dim`-element vector populated by
// `prepare_random_vector`.
//
// Host allocation and input preparation happen before the clock starts. The
// measured interval spans the inner scope in which both buffers are created,
// the transform is invoked and both buffers are destroyed again.
// NOTE(review): buf_1d_u64_t is presumably sycl::buffer<uint64_t, 1>; if so,
// leaving the inner scope waits for outstanding work on the buffers and
// writes their contents back to host memory, so that cost is part of the
// reported time — confirm against bench_ntt.hpp.
//
// @param q        queue passed on to `cooley_tukey_fft`
// @param dim      number of uint64_t elements in the source/destination vectors
// @param wg_size  forwarded unchanged to `cooley_tukey_fft`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a host allocation fails
int64_t
benchmark_cooley_tukey_fft(sycl::queue& q,
                           const uint64_t dim,
                           const uint64_t wg_size)
{
  // Owning handles: host memory is released on every exit path, including
  // when buffer construction or the transform throws.
  auto release = [](uint64_t* ptr) { std::free(ptr); };
  using host_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  host_mem_t vec_src{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };
  host_mem_t vec_fwd{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };

  // malloc(0) may legitimately return nullptr, hence the `dim` guard.
  if (dim != 0 && (!vec_src || !vec_fwd)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_src.get(), dim);

  tp start = std::chrono::steady_clock::now();
  {
    buf_1d_u64_t buf_vec_src{ vec_src.get(), sycl::range<1>{ dim } };
    buf_1d_u64_t buf_vec_fwd{ vec_fwd.get(), sycl::range<1>{ dim } };

    cooley_tukey_fft(q, buf_vec_src, buf_vec_fwd, dim, wg_size);
  }
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}
// Times one run of `cooley_tukey_ifft` on a `dim`-element vector populated by
// `prepare_random_vector`.
//
// Host allocation and input preparation happen before the clock starts. The
// measured interval spans the inner scope in which both buffers are created,
// the transform is invoked and both buffers are destroyed again.
// NOTE(review): buf_1d_u64_t is presumably sycl::buffer<uint64_t, 1>; if so,
// leaving the inner scope waits for outstanding work on the buffers and
// writes their contents back to host memory, so that cost is part of the
// reported time — confirm against bench_ntt.hpp.
//
// @param q        queue passed on to `cooley_tukey_ifft`
// @param dim      number of uint64_t elements in the source/destination vectors
// @param wg_size  forwarded unchanged to `cooley_tukey_ifft`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a host allocation fails
int64_t
benchmark_cooley_tukey_ifft(sycl::queue& q,
                            const uint64_t dim,
                            const uint64_t wg_size)
{
  // Owning handles: host memory is released on every exit path, including
  // when buffer construction or the transform throws.
  auto release = [](uint64_t* ptr) { std::free(ptr); };
  using host_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  host_mem_t vec_src{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };
  host_mem_t vec_inv{ static_cast<uint64_t*>(std::malloc(n_bytes)), release };

  // malloc(0) may legitimately return nullptr, hence the `dim` guard.
  if (dim != 0 && (!vec_src || !vec_inv)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_src.get(), dim);

  tp start = std::chrono::steady_clock::now();
  {
    buf_1d_u64_t buf_vec_src{ vec_src.get(), sycl::range<1>{ dim } };
    buf_1d_u64_t buf_vec_inv{ vec_inv.get(), sycl::range<1>{ dim } };

    cooley_tukey_ifft(q, buf_vec_src, buf_vec_inv, dim, wg_size);
  }
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}
// Times one call of `six_step_fft` operating in place on a `dim`-element
// device allocation.
//
// A host USM vector is populated by `prepare_random_vector` and copied to a
// device USM allocation; the copy is waited on before the clock starts, so
// allocation, input preparation and the host-to-device transfer are excluded
// from the reported time.
// NOTE(review): the measurement (and releasing the device memory right after
// it) is only sound if `six_step_fft` returns after all submitted work has
// completed — confirm against its implementation.
//
// @param q        queue used for USM allocation, the copy and the transform
// @param dim      number of uint64_t elements in the vector
// @param wg_size  forwarded unchanged to `six_step_fft`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a USM allocation fails
int64_t
benchmark_six_step_fft(sycl::queue& q,
                       const uint64_t dim,
                       const uint64_t wg_size)
{
  // Owning handles: USM allocations are released on every exit path,
  // including when the copy or the transform throws.
  auto release = [&q](uint64_t* ptr) { sycl::free(ptr, q); };
  using usm_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  usm_mem_t vec_h{ static_cast<uint64_t*>(sycl::malloc_host(n_bytes, q)),
                   release };
  usm_mem_t vec_d{ static_cast<uint64_t*>(sycl::malloc_device(n_bytes, q)),
                   release };

  // A zero-byte request may yield nullptr without being an error.
  if (dim != 0 && (!vec_h || !vec_d)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_h.get(), dim);
  q.memcpy(vec_d.get(), vec_h.get(), n_bytes).wait();

  tp start = std::chrono::steady_clock::now();
  six_step_fft(q, vec_d.get(), dim, wg_size);
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}
// Times one call of `six_step_ifft` operating in place on a `dim`-element
// device allocation.
//
// A host USM vector is populated by `prepare_random_vector` and copied to a
// device USM allocation; the copy is waited on before the clock starts, so
// allocation, input preparation and the host-to-device transfer are excluded
// from the reported time.
// NOTE(review): the measurement (and releasing the device memory right after
// it) is only sound if `six_step_ifft` returns after all submitted work has
// completed — confirm against its implementation.
//
// @param q        queue used for USM allocation, the copy and the transform
// @param dim      number of uint64_t elements in the vector
// @param wg_size  forwarded unchanged to `six_step_ifft`
// @return elapsed time in microseconds
// @throws std::bad_alloc when a USM allocation fails
int64_t
benchmark_six_step_ifft(sycl::queue& q,
                        const uint64_t dim,
                        const uint64_t wg_size)
{
  // Owning handles: USM allocations are released on every exit path,
  // including when the copy or the transform throws.
  auto release = [&q](uint64_t* ptr) { sycl::free(ptr, q); };
  using usm_mem_t = std::unique_ptr<uint64_t, decltype(release)>;

  const std::size_t n_bytes = sizeof(uint64_t) * dim;
  usm_mem_t vec_h{ static_cast<uint64_t*>(sycl::malloc_host(n_bytes, q)),
                   release };
  usm_mem_t vec_d{ static_cast<uint64_t*>(sycl::malloc_device(n_bytes, q)),
                   release };

  // A zero-byte request may yield nullptr without being an error.
  if (dim != 0 && (!vec_h || !vec_d)) {
    throw std::bad_alloc{};
  }

  prepare_random_vector(vec_h.get(), dim);
  q.memcpy(vec_d.get(), vec_h.get(), n_bytes).wait();

  tp start = std::chrono::steady_clock::now();
  six_step_ifft(q, vec_d.get(), dim, wg_size);
  tp end = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
    .count();
}