-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmmpy_kernel_tile.cu
74 lines (59 loc) · 1.89 KB
/
mmpy_kernel_tile.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Matrix multiply device code
#include <assert.h>
#include <math.h>
#include "utils.h"
#include "types.h"
using namespace std;
// Flat 2-D indexing helpers: X(i, j) refers to the element at offset i*N + j of
// the buffer X. Each macro expects the matching pointer (A, B or C) and an
// integer named N to be in scope at the point of use.
#define A(i, j) A[(i)*N + (j)]
#define B(i, j) B[(i)*N + (j)]
#define C(i, j) C[(i)*N + (j)]
// Edge length of the square tiles that matMul stages in shared memory.
#define TS 32
// Number of output elements accumulated by each thread.
#define WPT 8 // work per thread
// TS / WPT: stride between the tile rows handled by a single thread.
#define RTS (TS/WPT)
// Example invocations:
//   srun -u -v --gres=gpu:1 ./mmpy -n 512 -x 1 -y 512 -r 3
//   ./mmpy -n 512 -r 3
// Build with: make
// Tiled matrix-multiply kernel using shared-memory tiles and WPT outputs per thread.
//
// With the flat indexing X(i, j) = X[i*N + j], every output element is
//     C(i, j) = sum over k of B(i, k) * A(k, j)
// (equivalently C = A * B when the three buffers are read as column-major).
//
// Each block produces one TS x TS tile of C. A thread owns WPT elements of that
// tile: column blockIdx.x*TS + tx, rows blockIdx.y*TS + ty + w*RTS, w = 0..WPT-1.
//
// Parameters:
//   N     edge length used to index all three buffers
//   C     output buffer, written once per element after all tiles are consumed
//   A, B  input buffers, only read (through __ldg)
//
// Launch assumptions (not checked here):
//   - the indexing assumes blockDim == (TS, RTS), i.e. tx in [0, TS) and
//     ty in [0, RTS) -- TODO confirm against the host launch code
//   - the grid must provide one block per TS x TS tile of C; for N that is not
//     a multiple of TS this means a ceil-div grid
//
// N need not be a multiple of TS: out-of-range tile elements are loaded as
// zero and out-of-range outputs are not stored.
__global__ void matMul(const int N, _DOUBLE_ *C, const _DOUBLE_ *A, const _DOUBLE_ *B) {
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Column of C owned by this thread, and first row of this block's C tile.
    const int col     = blockIdx.x * TS + tx;
    const int rowBase = blockIdx.y * TS;

    // Round up so that a partial trailing tile along k is still processed.
    const int numTiles = (N + TS - 1) / TS;

    // Value used to pad tile elements that fall outside the N x N range.
    const _DOUBLE_ zero = 0;

    // Bs carries one extra padding column per row.
    __shared__ _DOUBLE_ As[TS][TS], Bs[TS][TS+1];

    // Per-thread accumulators, one per owned output element.
    _DOUBLE_ Cs[WPT];
    #pragma unroll
    for (int w = 0; w < WPT; w++)
        Cs[w] = 0.0f;

    for (int t = 0; t < numTiles; t++) {
        const int kBase = t * TS;

        // Stage one tile of A and one tile of B; each thread loads WPT
        // elements of each. The guards only affect edge tiles.
        #pragma unroll
        for (int w = 0; w < WPT; w++) {
            const int r = ty + w * RTS;

            // As[r][tx] <- A(kBase + r, col)
            const int aRow = kBase + r;
            As[r][tx] = (aRow < N && col < N) ? __ldg(&A(aRow, col)) : zero;

            // Bs[r][tx] <- B(rowBase + r, kBase + tx)
            const int bRow = rowBase + r;
            const int bCol = kBase + tx;
            Bs[r][tx] = (bRow < N && bCol < N) ? __ldg(&B(bRow, bCol)) : zero;
        }
        // Every thread must see the complete tiles before reading them.
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < TS; k++) {
            #pragma unroll
            for (int w = 0; w < WPT; w++) {
                Cs[w] += As[k][tx] * Bs[ty + w * RTS][k];
            }
        }
        // Do not let any thread overwrite the tiles while others still read them.
        __syncthreads();
    }

    // Write the finished sums once, after the last tile (not once per tile).
    if (col < N) {
        #pragma unroll
        for (int w = 0; w < WPT; w++) {
            const int row = rowBase + ty + w * RTS;
            if (row < N)
                C(row, col) = Cs[w];
        }
    }
}