-
Notifications
You must be signed in to change notification settings - Fork 4
/
mp3.cu
149 lines (118 loc) · 5.54 KB
/
mp3.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#include <wb.h>
#define wbCheck(value) { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
printf("Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
return -1; \
} \
}
#define matrixMemSize(rows, columns) \
(rows * columns) * sizeof(float)
#ifndef THREADS
# define THREADS 32
#endif
#define MAX_BLOCKS(size) \
(size - 1)/THREADS + 1
template <unsigned int TILE_WIDTH>
__global__ void matrixMultiplyTiled(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns) {
__shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];
unsigned int tx = threadIdx.x;
unsigned int ty = threadIdx.y;
unsigned int col = blockIdx.x * TILE_WIDTH + tx;
unsigned int row = blockIdx.y * TILE_WIDTH + ty;
float acc = 0;
for (int t = 0; t < (numAColumns-1)/TILE_WIDTH + 1; ++t) {
unsigned int ATilePitch = t * TILE_WIDTH + tx;
unsigned int BTilePitch = t * TILE_WIDTH + ty;
if (row < numARows && ATilePitch < numAColumns)
ds_A[ty][tx] = A[row * numAColumns + ATilePitch];
else
ds_A[ty][tx] = 0;
if (col < numBColumns && BTilePitch < numBRows)
ds_B[ty][tx] = B[BTilePitch * numBColumns + col];
else
ds_B[ty][tx] = 0;
__syncthreads();
#pragma unroll
for (int k = 0; k < TILE_WIDTH; ++k) acc += ds_A[ty][k] * ds_B[k][tx];
__syncthreads();
}
if (row < numCRows && col < numCColumns) C[row * numCColumns + col] = acc;
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * deviceA;
float * deviceB;
float * deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
int deviceID = 0; // Let's get device of the default device
cudaDeviceProp deviceProperties;
wbCheck( cudaGetDevice(&deviceID) );
wbCheck( cudaGetDeviceProperties(&deviceProperties, deviceID) );
wbLog(TRACE,
"GPU Device ", deviceID, ": \"", deviceProperties.name,
"\" with compute capability ",
deviceProperties.major, ".", deviceProperties.minor);
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
numCRows = numARows;
numCColumns = numBColumns;
wbCheck( cudaMallocHost(&hostC, matrixMemSize(numCRows, numCColumns)) );
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbLog(TRACE, "The dimensions of C are ", numCRows, " x ", numCColumns);
wbTime_start(GPU, "Allocating GPU memory.");
wbCheck( cudaMalloc(&deviceA, matrixMemSize(numARows, numAColumns)) );
wbCheck( cudaMalloc(&deviceB, matrixMemSize(numBRows, numBColumns)) );
wbCheck( cudaMalloc(&deviceC, matrixMemSize(numCRows, numCColumns)) );
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
wbCheck( cudaMemcpy(deviceA, hostA, matrixMemSize(numARows, numAColumns), cudaMemcpyHostToDevice) );
wbCheck( cudaMemcpy(deviceB, hostB, matrixMemSize(numBRows, numBColumns), cudaMemcpyHostToDevice) );
wbTime_stop(GPU, "Copying input memory to the GPU.");
dim3 grid(MAX_BLOCKS(numCColumns), MAX_BLOCKS(numCRows));
dim3 threads(THREADS, THREADS);
wbTime_start(Compute, "Performing CUDA computation");
if (deviceProperties.major < 2)
matrixMultiplyTiled<16><<<grid, threads>>>(deviceA, deviceB, deviceC,
numARows, numAColumns,
numBRows, numBColumns,
numCRows, numCColumns);
else
matrixMultiplyTiled<32><<<grid, threads>>>(deviceA, deviceB, deviceC,
numARows, numAColumns,
numBRows, numBColumns,
numCRows, numCColumns);
wbCheck( cudaThreadSynchronize() );
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
wbCheck( cudaMemcpy(hostC, deviceC, matrixMemSize(numCRows, numCColumns), cudaMemcpyDeviceToHost) );
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
wbCheck( cudaFree(deviceA) );
wbCheck( cudaFree(deviceB) );
wbCheck( cudaFree(deviceC) );
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
cudaFreeHost(hostA);
cudaFreeHost(hostB);
cudaFreeHost(hostC);
wbCheck( cudaDeviceReset() );
return 0;
}