calculate_pi_gpu_opt_clean_float64.py
import time
import sys
import numpy as np
import numba.cuda as cuda
# Here we estimate pi on the GPU (in float64 precision) by computing the area
# under the curve 4 / (1 + x^2) on [0, 1].
# The result is then checked against numpy's np.pi.
# Original CPU code from -> source: https://github.com/UNM-CARC/QuickBytes/blob/master/workshop_slides/CS_Math_471.pdf
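# The kernel below applies the midpoint rule; sketched out, the math it implements is:
#     pi = integral_0^1 4 / (1 + x^2) dx
#        ~ (1/N) * sum_{i=0}^{N-1} 4 / (1 + x_i^2),   with x_i = (i + 0.5) / N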
#@cuda.reduce
#def sum_reduce(a, b):
#    return a + b
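# If the reduction above were enabled, the final sum could be done on the device
# instead of on the host further down (a sketch, not what this script currently does):
#   pi_sum = sum_reduce(sum_device)     # numba's cuda.reduce sums a device array
#   pi = step_size[0] * pi_sum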
@cuda.jit
def Pi(step_vec, sum_vec, step_size):
    tx = cuda.threadIdx.x           # Unique thread ID within one block
    ty = cuda.blockIdx.x            # Unique block ID within the grid
    block_size = cuda.blockDim.x    # Number of threads per block
    grid_size = cuda.gridDim.x      # Number of blocks in the grid
    # Absolute starting index of this thread and the grid-wide stride:
    startX = cuda.grid(1)           # Starting point on the grid
    stride = cuda.gridsize(1)       # Total number of threads in the grid
    # Or we could have written the following:
    #   startX = tx + ty * block_size
    #   stride = block_size * grid_size
    for i in range(startX, step_vec.shape[0], stride):
        step_vec[i] = (step_vec[i] + 0.5) * step_size[0]
        sum_vec[i] += 4.0 / (1.0 + step_vec[i] * step_vec[i])
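# With the 128 x 128 launch chosen below, stride = 16384, so for example thread 0 of
# block 0 handles elements 0, 16384, 32768, ... until the index runs past N.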
if len(sys.argv) != 2:
    print("Usage: ", sys.argv[0], "<number of steps>")
    sys.exit(1)
N = int(sys.argv[1], 10)
step_vec = np.arange(N, dtype=np.float64)      # indices 0..N-1; turned into midpoints on the GPU
sum_vec = np.zeros(N, dtype=np.float64)        # per-element terms of the sum
step_size = np.ones(1, dtype=np.float64) / N   # 1/N, passed to the kernel as a length-1 array
# We copy the arrays to the device
step_device = cuda.to_device(step_vec)
sum_device = cuda.to_device(sum_vec)
step_size_device = cuda.to_device(step_size)
# We launch 128 blocks, each containing 128 threads
blocks_per_grid = 128
# With a grid size of 32 numba warns:
#   "Grid size 32 will likely result in GPU under-utilization due to low occupancy."
thread_per_block = 128
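# An alternative (not used here) would be to size the grid from N, e.g.
#   blocks_per_grid = (N + thread_per_block - 1) // thread_per_block
# The grid-stride loop in the kernel is what makes the fixed 128 x 128 launch work for any N.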
# Call the kernel to calculate pi
start = time.time()  # Start
Pi[blocks_per_grid, thread_per_block](step_device, sum_device, step_size_device)
# Copy the result back to the host (this synchronizes with the kernel) and finish the sum there
sum_vec = sum_device.copy_to_host()
pi = step_size[0] * sum_vec.sum()
end = time.time()  # End
# Print the estimate, its difference from the numpy value, and how long it took.
print("Pi = %.20f, (Diff=%.20f) (calculated in %f secs with %d steps)" % (pi, pi - np.pi, end - start, step_vec.shape[0]))
sys.exit(0)
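# Example invocation (assumes a CUDA-capable GPU and numba with CUDA support installed):
#   python calculate_pi_gpu_opt_clean_float64.py 10000000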