-
Notifications
You must be signed in to change notification settings - Fork 2
/
flag.cpp
110 lines (92 loc) · 2.67 KB
/
flag.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include "flag.h"
/**
 * Builds the flag arrays of this object from the per-element flags of
 * `tensor`.
 *
 * The constructor allocates (with malloc) and fills:
 *   - cflag      : tensor.flag packed into words of type T, bit j of word i
 *                  holding element i * bitsPerWord + j;
 *   - bit_flag   : all-ones words in which bit j of word i is cleared when
 *                  the element FOLLOWING element i * bitsPerWord + j has
 *                  flag == 1; the top bit of the last word is always cleared;
 *   - startflag  : 1 for every word of bit_flag that has a cleared bit;
 *   - first      : per-word counter derived from the previous word's flags
 *                  (see the NOTE(review) below);
 *   - block_flag : 1 for every group of BLOCK_SIZE words containing a word
 *                  whose startflag is 1.
 *
 * The word count is rounded up to the next multiple of 32 strictly greater
 * than the number of words needed for tensor.nnz elements, so the last word
 * never overlaps real elements.
 *
 * @param tensor      source tensor; only tensor.nnz and tensor.flag[0..nnz)
 *                    are read.
 * @param BLOCK_SIZE  number of words summarised by one block_flag entry;
 *                    must be positive (it is used as a divisor).
 */
template<typename T, typename type_thread>
flag<T, type_thread>::flag(semiTensor<T> tensor, int BLOCK_SIZE) {
    const int threadLen = static_cast<int>(sizeof(T) * 8);  // bits per packed word
    const int nnz = tensor.nnz;

    int flagLen = (nnz - 1) / threadLen + 1;
    flagLen = 32 - flagLen % 32 + flagLen;  // pad to the next multiple of 32
    const int Gridsize = (flagLen - 1) / BLOCK_SIZE + 1;

    cflag = static_cast<T *>(malloc(sizeof(T) * flagLen));
    bit_flag = static_cast<T *>(malloc(sizeof(T) * flagLen));
    first = static_cast<int *>(malloc(sizeof(int) * flagLen));
    startflag = static_cast<unsigned short *>(malloc(sizeof(unsigned short) * flagLen));
    block_flag = static_cast<unsigned short *>(malloc(sizeof(unsigned short) * Gridsize));

    memset(bit_flag, -1, sizeof(T) * flagLen);  // every bit set
    memset(cflag, 0, sizeof(T) * flagLen);
    memset(startflag, 0, sizeof(unsigned short) * flagLen);
    memset(block_flag, 0, sizeof(unsigned short) * Gridsize);

    // Pack the per-element flags into words. The shift is done at the width
    // of T so that bits above 31 are not lost when T is wider than int.
    for (int i = 0; i < flagLen; i++) {
        T bits = 0;
        for (int j = 0; j < threadLen && (i * threadLen + j) < nnz; j++) {
            const unsigned int elem = tensor.flag[i * threadLen + j];
            bits += static_cast<T>(static_cast<T>(elem) << j);
        }
        cflag[i] = bits;
    }

    // Clear bit j of word i when the next element carries flag == 1.
    // Look-ahead positions at or beyond nnz are never read: the previous
    // code indexed tensor.flag past nnz for the padding words.
    // NOTE(review): this assumes tensor.flag holds exactly nnz entries
    // (the packing loop above uses the same bound) -- confirm.
    for (int i = 0; i < flagLen; i++) {
        T ibits = 0;
        for (int j = 0; j < threadLen; j++) {
            const int next = i * threadLen + j + 1;
            if (next >= nnz) {
                break;  // nothing but padding from here on
            }
            if (tensor.flag[next] == 1) {
                ibits |= static_cast<T>(static_cast<T>(1) << j);
            }
        }
        if (i == flagLen - 1) {
            // The last word always has its top bit cleared. This used to be
            // written `if (j = threadLen - 1)`, an assignment that clobbered
            // the loop counter.
            ibits |= static_cast<T>(static_cast<T>(1) << (threadLen - 1));
        }
        bit_flag[i] -= ibits;  // bit_flag[i] is all ones, so this clears the bits
    }

    // A word is a "start" word when at least one of its bits was cleared.
    for (int i = 0; i < flagLen; i++) {
        if (bit_flag[i] != std::numeric_limits<T>::max()) {
            startflag[i] = 1;
        }
    }

    // first result entry on each thread
    // The previous code indexed the packed cflag array with an element index,
    // reading far outside its flagLen words; the per-element flags are read
    // from tensor.flag instead.
    // NOTE(review): the original author marked `sum - 1` as "may be a bug".
    // This counts only the flags of the previous word (limited by how many
    // elements the current word holds), not a running total -- confirm the
    // intended meaning before relying on first[].
    first[0] = 0;
    for (int i = 1; i < flagLen; i++) {
        int sum = 0;
        for (int j = 0; j < threadLen && (i * threadLen + j) < nnz; j++) {
            if (tensor.flag[(i - 1) * threadLen + j] == 1) {
                ++sum;
            }
        }
        first[i] = sum - 1;
    }

    // Summarise startflag per block of BLOCK_SIZE words. startflag is indexed
    // directly; the earlier `T *val = startflag + ...` only compiled when T
    // happened to be unsigned short.
    for (int b = 0; b < Gridsize; b++) {
        const int begin = b * BLOCK_SIZE;
        for (int j = 0; j < BLOCK_SIZE && begin + j < flagLen; j++) {
            if (startflag[begin + j] == 1) {
                block_flag[b] = 1;
                break;
            }
        }
    }
}