-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmmpy_kernel.cu
98 lines (82 loc) · 2.69 KB
/
mmpy_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Matrix multiply device code
#include <assert.h>
#include <math.h>
#include "utils.h"
#include "types.h"
using namespace std;
// srun -u -v --gres=gpu:1 ./mmpy -n 512 -x 1 -y 512 -r 3
// ./mmpy -n 512 -r 3
// make
#define A(i, j) A[(i)*N + (j)]
#define B(i, j) B[(i)*N + (j)]
#define C(i, j) C[(i)*N + (j)]
// Make sure to change TS and WPT in setGrid.cu accordingly
#define TS 128
#define TSK 16
#define WPT 8 // work per thread (work size)
#define RTS (TS/WPT)
#define LPTX (TS/RTS)
#define LPTY (TSK/RTS)
__global__ void matMul(const int N, _DOUBLE_ *C, const _DOUBLE_ *A, const _DOUBLE_ *B) {
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int bx = blockIdx.x;
const int by = blockIdx.y;
const int numTiles = N / TSK;
__shared__ _DOUBLE_ As[TSK][TS], Bs[TS][TSK+1];
_DOUBLE_ Areg, Breg[WPT], Creg[WPT][WPT];
#pragma unroll
for (int w1=0; w1<WPT; w1++){
#pragma unroll
for (int w2=0; w2<WPT; w2++){
Creg[w1][w2] = 0.0f;
}
}
for (int t=0; t<numTiles; t++){
const int AtileRow = bx * TS;
const int AtileCol = t * TSK;
const int BtileRow = t * TSK;
const int BtileCol = by * TS;
#pragma unroll
for (int w1=0; w1<LPTX; w1++){
#pragma unroll
for (int w2=0; w2<LPTY; w2++){
const int AworkRow = tx + w1 * RTS;
const int AworkCol = ty + w2 * RTS;
const int BworkRow = tx + w2 * RTS;
const int BworkCol = ty + w1 * RTS;
As[AworkCol][AworkRow] = __ldg(&A(AtileCol+AworkCol, AtileRow+AworkRow));
Bs[BworkCol][BworkRow] = __ldg(&B(BtileCol+BworkCol, BtileRow+BworkRow));
// As[AworkCol][AworkRow] = A(0, 0);
// Bs[BworkCol][BworkRow] = B(0, 0);
}
}
__syncthreads();
for (int k=0; k<TSK; k++){
#pragma unroll
for (int w=0; w<WPT; w++){
Breg[w] = Bs[ty+w*RTS][k];
}
#pragma unroll
for (int w1=0; w1<WPT; w1++){
Areg = As[k][tx+w1*RTS];
#pragma unroll
for (int w2=0; w2<WPT; w2++){
Creg[w1][w2] += Areg * Breg[w2];
}
}
}
__syncthreads();
}
#pragma unroll
for (int w1=0; w1<WPT; w1++){
int CtileRow = bx * TS;
int CtileCol = by * TS;
int CworkRow = tx + w1 * RTS;
#pragma unroll
for (int w2=0; w2<WPT; w2++){
int CworkCol = ty + w2 * RTS;
C(CtileCol+CworkCol, CtileRow+CworkRow) = Creg[w1][w2];
}
}
}