-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.cu
executable file
·89 lines (82 loc) · 3.89 KB
/
test.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#include <cuda_runtime.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include "base/gpu/wave_fkernel.3d8o.cu"
// Number of cells along each axis of the cubic test volume; every buffer in
// this file holds SIZE*SIZE*SIZE floats.
#define SIZE 36
// Number of cells skipped on each side of every axis when main() computes the
// offset of the first cell handed to the kernel.
#define RADIUS 4
// Host-side copy of the 13 stencil coefficients. prop() indexes it with
// C0, CX1..CX4, CY1..CY4 and CZ1..CZ4, and main() uploads it to the device
// symbol `coeffs`.
// NOTE(review): get_coeffs is not defined in this file (presumably it comes
// from the included kernel source). A C++ array cannot be initialized from a
// function's return value, so this line compiles only if get_coeffs is a macro
// that expands to a brace-enclosed initializer list - confirm.
float coeffs_cpu[13] = get_coeffs(SIZE, SIZE, SIZE);
/*
 * CPU reference implementation of one stencil update, used to validate the
 * GPU kernel.
 *
 * For every cell that is at least 4 cells away from each face of the
 * SIZE x SIZE x SIZE volume it computes
 *
 *     p0[ii] = vel[ii] * stencil(p1, ii) + p1[ii] + p1[ii] - p0[ii]
 *
 * where stencil() is the weighted sum of p1[ii] and its four neighbours on
 * either side along each of the three axes, weighted by coeffs_cpu.
 *
 * p0  : read at each updated cell and overwritten in place.
 * p1  : only read; neighbours up to 4 cells away along every axis are accessed.
 * vel : only read; per-cell multiplier applied to the stencil sum.
 *
 * All three arrays are indexed as i1 + SIZE*i2 + SIZE*SIZE*i3 (i1 fastest).
 * Cells closer than 4 to any face are left untouched.
 */
void prop(float *p0, float *p1, float *vel) {
  // The widest stencil tap reaches 4 cells, so a 4-cell border is skipped.
  const int halo = 4;
  const int _nx = SIZE;          // stride between neighbouring i2 rows
  const int _n12 = SIZE * SIZE;  // stride between neighbouring i3 planes

  for (int i3 = halo; i3 < SIZE - halo; i3++) {
    for (int i2 = halo; i2 < SIZE - halo; i2++) {
      // Linear index of the first updated cell (i1 == halo) of this row.
      int ii = i2 * _nx + halo + _n12 * i3;
      for (int i1 = halo; i1 < SIZE - halo; i1++, ii++) {
        // The term order is kept exactly as in the original expression so the
        // floating-point result is unchanged.
        p0[ii] =
            vel[ii] *
                (coeffs_cpu[C0] * p1[ii] +
                 coeffs_cpu[CX1] * (p1[ii - 1] + p1[ii + 1]) +
                 coeffs_cpu[CX2] * (p1[ii - 2] + p1[ii + 2]) +
                 coeffs_cpu[CX3] * (p1[ii - 3] + p1[ii + 3]) +
                 coeffs_cpu[CX4] * (p1[ii - 4] + p1[ii + 4]) +
                 coeffs_cpu[CY1] * (p1[ii - _nx] + p1[ii + _nx]) +
                 coeffs_cpu[CY2] * (p1[ii - 2 * _nx] + p1[ii + 2 * _nx]) +
                 coeffs_cpu[CY3] * (p1[ii - 3 * _nx] + p1[ii + 3 * _nx]) +
                 coeffs_cpu[CY4] * (p1[ii - 4 * _nx] + p1[ii + 4 * _nx]) +
                 coeffs_cpu[CZ1] * (p1[ii - _n12] + p1[ii + _n12]) +
                 coeffs_cpu[CZ2] * (p1[ii - 2 * _n12] + p1[ii + 2 * _n12]) +
                 coeffs_cpu[CZ3] * (p1[ii - 3 * _n12] + p1[ii + 3 * _n12]) +
                 coeffs_cpu[CZ4] * (p1[ii - 4 * _n12] + p1[ii + 4 * _n12])) +
            p1[ii] + p1[ii] - p0[ii];
      }
    }
  }
}
/* Reports a failed CUDA runtime call on stderr.
 * Returns true when err is cudaSuccess, false otherwise. */
static bool cuda_call_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Test driver: runs wave_kernel on the GPU and prop() on the CPU with the same
 * inputs, then compares the two results cell by cell.
 *
 * Returns EXIT_SUCCESS when every compared cell agrees within tolerance and
 * EXIT_FAILURE on any allocation failure, CUDA error, or mismatch.
 */
int main()
{
    const int count = SIZE * SIZE * SIZE;
    const size_t bytes = (size_t)count * sizeof(float);
    // Number of cells per axis that prop() updates (it skips RADIUS cells on
    // each side).
    const int interior = SIZE - 2 * RADIUS;
    // Linear index of the first interior cell (RADIUS, RADIUS, RADIUS).
    const int offset = SIZE * SIZE * RADIUS + SIZE * RADIUS + RADIUS;

    // init array
    float *ref_array_cpu = (float *)malloc(bytes);
    float *cpu_result = (float *)malloc(bytes);
    float *gpu_result = (float *)malloc(bytes);
    float *vel_cpu = (float *)malloc(bytes);
    float *test_array_gpu = NULL;
    float *gpu_p0 = NULL;
    float *vel_gpu = NULL;
    int status = EXIT_FAILURE;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        if (!ref_array_cpu || !cpu_result || !gpu_result || !vel_cpu) {
            fprintf(stderr, "host allocation failed\n");
            break;
        }
        if (!cuda_call_ok(cudaMalloc(&test_array_gpu, bytes), "cudaMalloc(test_array_gpu)") ||
            !cuda_call_ok(cudaMalloc(&gpu_p0, bytes), "cudaMalloc(gpu_p0)") ||
            !cuda_call_ok(cudaMalloc(&vel_gpu, bytes), "cudaMalloc(vel_gpu)"))
            break;

        for (int i = 0; i < count; ++i) {
            ref_array_cpu[i] = (float)(i % 4);
            cpu_result[i] = 7.0009f;
            vel_cpu[i] = (float)(i % 20);
        }

        // cpu_result doubles as the initial p0 for both the CPU and GPU runs.
        if (!cuda_call_ok(cudaMemcpy(test_array_gpu, ref_array_cpu, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(test_array_gpu)") ||
            !cuda_call_ok(cudaMemcpy(vel_gpu, vel_cpu, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(vel_gpu)") ||
            !cuda_call_ok(cudaMemcpy(gpu_p0, cpu_result, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(gpu_p0)") ||
            !cuda_call_ok(cudaMemcpyToSymbol(coeffs, coeffs_cpu, sizeof(float) * 13, 0),
                          "cudaMemcpyToSymbol(coeffs)"))
            break;

        // run GPU kernel
        dim3 block(16, 16);
        dim3 grid(2, 2);
        // NOTE(review): the trailing arguments (0, 28, 1, 1) are passed through
        // unchanged; 28 presumably equals SIZE - 2 * RADIUS - confirm against
        // wave_kernel's signature.
        wave_kernel<<<grid, block>>>(gpu_p0 + offset, test_array_gpu + offset, gpu_p0 + offset,
                                     vel_gpu + offset, 0, 28, 1, 1);
        if (!cuda_call_ok(cudaGetLastError(), "wave_kernel launch"))
            break;
        // The blocking copy also surfaces any error raised while the kernel ran.
        if (!cuda_call_ok(cudaMemcpy(gpu_result, gpu_p0, bytes, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(gpu_result)"))
            break;

        // run CPU impl
        printf("current: %f\n", ref_array_cpu[offset]);
        printf("x neg: %f %f %f %f\n", ref_array_cpu[offset - 1], ref_array_cpu[offset - 2],
               ref_array_cpu[offset - 3], ref_array_cpu[offset - 4]);
        printf("x pos: %f %f %f %f\n", ref_array_cpu[offset + 1], ref_array_cpu[offset + 2],
               ref_array_cpu[offset + 3], ref_array_cpu[offset + 4]);
        printf("y neg: %f %f %f %f\n", ref_array_cpu[offset - 1 * SIZE], ref_array_cpu[offset - 2 * SIZE],
               ref_array_cpu[offset - 3 * SIZE], ref_array_cpu[offset - 4 * SIZE]);
        printf("y pos: %f %f %f %f\n", ref_array_cpu[offset + 1 * SIZE], ref_array_cpu[offset + 2 * SIZE],
               ref_array_cpu[offset + 3 * SIZE], ref_array_cpu[offset + 4 * SIZE]);
        prop(cpu_result, ref_array_cpu, vel_cpu);

        // compare
        // Walk the interior as a 3D sub-cube of the SIZE^3 array. A flat run of
        // interior^3 elements starting at offset would include halo cells and
        // stop well before the last interior plane.
        int mismatches = 0;
        for (int i3 = 0; i3 < interior; ++i3) {
            for (int i2 = 0; i2 < interior; ++i2) {
                for (int i1 = 0; i1 < interior; ++i1) {
                    const int idx = offset + (i3 * SIZE + i2) * SIZE + i1;
                    const float diff = fabsf(cpu_result[idx] - gpu_result[idx]);
                    // Mixed absolute/relative tolerance: host and device may
                    // round differently, so an absolute-only bound is too tight
                    // for large values.
                    const float tol = 1e-6f + 1e-5f * fabsf(cpu_result[idx]);
                    // Written as !(diff <= tol) so that a NaN on either side
                    // is reported as a mismatch.
                    if (!(diff <= tol)) {
                        printf("cpu at %d: %f\n", idx, cpu_result[idx]);
                        printf("gpu at %d: %f\n", idx, gpu_result[idx]);
                        ++mismatches;
                    }
                }
            }
        }
        if (mismatches != 0)
            fprintf(stderr, "%d mismatching cells\n", mismatches);
        status = (mismatches == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
    } while (0);

    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(vel_gpu);
    cudaFree(gpu_p0);
    cudaFree(test_array_gpu);
    free(vel_cpu);
    free(gpu_result);
    free(cpu_result);
    free(ref_array_cpu);
    return status;
}