-
Notifications
You must be signed in to change notification settings - Fork 0
/
bundleElt4.h
126 lines (106 loc) · 3.96 KB
/
bundleElt4.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#ifndef _BUNDLE_ELT_H_
#define _BUNDLE_ELT_H_
#include "GPUincludes.h"
/*! \file bundleElt.h
* \brief Header file for data type to hold one element of a bundle.
*
* A bundle is the group of packets that are decoded at the same time.
* Each kernel must handle every packet in the bundle.
*
* bundleElt is typically implemented as a vector of a basic type
* (float for example).
* One or more samples may be stored in each element of this vector
* via packing. This capability is only useful if the basic type is
* (unsigned) int and fewer than 32 bits are needed for the range of values.
*
* The number of packets in a bundle is controlled by two values:
* SLOTS_PER_ELT -- length of the vector
* SAMPLES_PER_SLOT -- samples stored in each vector element.
* and related:
* SAMPLE_WIDTH -- number of bits required for a sample.
*
* You must provide an implementation of the bundleElt structure
* that can satisfy the values you choose for these #define's.
* For example, if you set SLOTS_PER_ELT to 8
* bundleElt must contain at least 8 addressable fields.
* Likewise, (SAMPLE_WIDTH * SAMPLES_PER_SLOT) <= number of bits in a SLOT.
*
* Several common operators are overloaded for bundleElt in this include file.
* If you modify bundleElt, you will need to update each of these definitions
* at the same time.
*/
// Bundle geometry for this header: 4 slots, 1 sample per slot, 32 bits per sample.
// SLOTS_PER_ELT - length of the vector held in one bundleElt.
#define SLOTS_PER_ELT 4
// SAMPLES_PER_SLOT - samples stored (packed) in each vector element.
#define SAMPLES_PER_SLOT 1
// SAMPLE_WIDTH - number of bits required for one sample.
#define SAMPLE_WIDTH 32
// General form of the mask, currently disabled:
// #define SAMPLE_MASK ((1 << SAMPLE_WIDTH) -1)
/* Just for SAMPLE_WIDTH == 32: every bit of the int constant is set. */
/* NOTE(review): presumably the general form is disabled because a shift by
 * the full width of a 32-bit int is not valid -- confirm before re-enabling. */
#define SAMPLE_MASK (~0)
// USED_BITS - bits of a slot that are occupied by samples.
#define USED_BITS (SAMPLES_PER_SLOT * SAMPLE_WIDTH)
// PKTS_PER_BUNDLE - the number of packets handled collectively
// in the basic data structure used here, bundleElt.
#define PKTS_PER_BUNDLE (SLOTS_PER_ELT * SAMPLES_PER_SLOT)
/*! \brief Storage for one bundle element: SLOTS_PER_ELT floats in an array.
 *
 * The struct is declared with 16-byte alignment. The slots are reached by
 * index (s[0] .. s[SLOTS_PER_ELT-1]) rather than by named fields.
 * NOTE(review): with SLOTS_PER_ELT == 4 this is presumably meant to have the
 * same size and alignment as float4 (see the disabled typedef below) --
 * confirm before changing SLOTS_PER_ELT or the alignment.
 */
struct __builtin_align__(16) localBE
{
float s[SLOTS_PER_ELT];
};
// bundleElt is the name used by the rest of the code for the element type.
typedef localBE bundleElt;
// Alternative representation, currently disabled:
// typedef float4 bundleElt;
/*! \brief Build a bundleElt from four explicit slot values.
 *
 * \param x0  value stored in slot 0
 * \param x1  value stored in slot 1
 * \param x2  value stored in slot 2
 * \param x3  value stored in slot 3
 * \return    bundleElt whose slots 0..3 hold x0..x3 in that order
 */
static __inline__ __host__ __device__ bundleElt make_bundleElt(float x0, float x1, float x2, float x3) {
    // Aggregate-initialise the slot array in one statement.
    bundleElt packed = { { x0, x1, x2, x3 } };
    return packed;
}
/*! \brief Build a bundleElt with the same value in every slot.
 *
 * Fills all SLOTS_PER_ELT slots in a loop instead of replicating the value
 * a fixed four times, so the result always covers the whole slot array.
 *
 * \param x  value copied into each slot
 * \return   bundleElt with s[i] == x for every slot i
 */
static __inline__ __host__ __device__ bundleElt make_bundleElt(float x) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = x;
    }
    return be;
}
/*! \brief Slot-wise in-place addition: a.s[i] += b.s[i] for every slot.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so the operator
 * touches exactly the slots that bundleElt stores.
 *
 * \param a  left operand, updated in place
 * \param b  right operand (passed by value, not modified for the caller)
 */
inline __host__ __device__ void operator+=(bundleElt &a, bundleElt b) {
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        a.s[i] += b.s[i];
    }
}
/*! \brief Slot-wise in-place multiplication: a.s[i] *= b.s[i] for every slot.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so the operator
 * touches exactly the slots that bundleElt stores.
 *
 * \param a  left operand, updated in place
 * \param b  right operand (passed by value, not modified for the caller)
 */
inline __host__ __device__ void operator*=(bundleElt &a, bundleElt b) {
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        a.s[i] *= b.s[i];
    }
}
/*! \brief Slot-wise sum of two bundle elements.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so every stored
 * slot is computed and no slot outside the array is touched.
 *
 * \param a  left operand
 * \param b  right operand
 * \return   bundleElt with s[i] == a.s[i] + b.s[i] for every slot i
 */
inline __host__ __device__ bundleElt operator+(bundleElt a, bundleElt b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = a.s[i] + b.s[i];
    }
    return be;
}
/*! \brief Slot-wise difference of two bundle elements.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so every stored
 * slot is computed and no slot outside the array is touched.
 *
 * \param a  minuend
 * \param b  subtrahend
 * \return   bundleElt with s[i] == a.s[i] - b.s[i] for every slot i
 */
inline __host__ __device__ bundleElt operator-(bundleElt a, bundleElt b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = a.s[i] - b.s[i];
    }
    return be;
}
/*! \brief Slot-wise product of two bundle elements.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so every stored
 * slot is computed and no slot outside the array is touched.
 *
 * \param a  left operand
 * \param b  right operand
 * \return   bundleElt with s[i] == a.s[i] * b.s[i] for every slot i
 */
inline __host__ __device__ bundleElt operator*(bundleElt a, bundleElt b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = a.s[i] * b.s[i];
    }
    return be;
}
/*! \brief Slot-wise quotient of two bundle elements.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so every stored
 * slot is computed and no slot outside the array is touched. No check is
 * made for zero divisors; each slot is a plain float division.
 *
 * \param a  dividend
 * \param b  divisor
 * \return   bundleElt with s[i] == a.s[i] / b.s[i] for every slot i
 */
inline __host__ __device__ bundleElt operator/(bundleElt a, bundleElt b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = a.s[i] / b.s[i];
    }
    return be;
}
/*! \brief Divide every slot of a bundle element by the same scalar.
 *
 * Iterates over SLOTS_PER_ELT instead of naming slots 0..3, so every stored
 * slot is computed and no slot outside the array is touched. No check is
 * made for a zero divisor; each slot is a plain float division.
 *
 * \param a  dividend
 * \param b  scalar divisor applied to each slot
 * \return   bundleElt with s[i] == a.s[i] / b for every slot i
 */
inline __host__ __device__ bundleElt operator/(bundleElt a, float b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = a.s[i] / b;
    }
    return be;
}
/*! \brief Apply the scalar clamp to every slot of a bundle element.
 *
 * Each slot is passed through clamp(float, float, float); that scalar
 * overload is not defined in this header (presumably it is provided via
 * GPUincludes.h -- confirm). Iterates over SLOTS_PER_ELT instead of naming
 * slots 0..3, so every stored slot is processed.
 *
 * \param v  input bundle element
 * \param a  forwarded unchanged as the second argument of the scalar clamp
 *           (presumably the lower bound -- confirm against that overload)
 * \param b  forwarded unchanged as the third argument of the scalar clamp
 *           (presumably the upper bound -- confirm against that overload)
 * \return   bundleElt with s[i] == clamp(v.s[i], a, b) for every slot i
 */
inline __device__ __host__ bundleElt clamp(bundleElt v, float a, float b) {
    bundleElt be;
    // Trip count is a compile-time constant (SLOTS_PER_ELT).
    for (int i = 0; i < SLOTS_PER_ELT; ++i) {
        be.s[i] = clamp(v.s[i], a, b);
    }
    return be;
}
/*! \brief Print the four slots of a bundle element to a stream.
 *
 * Writes slots 0..3 as "[v0, v1, v2, v3] " with two decimals each, followed
 * by a trailing space and no newline. The return value of fprintf is
 * ignored.
 *
 * NOTE(review): the function is marked __host__ __device__, but fprintf and
 * FILE are host C stdio facilities -- confirm this is only ever called from
 * host code.
 *
 * \param fd  output stream passed straight to fprintf
 * \param a   bundle element to print
 */
inline __host__ __device__ void fprintBE(FILE *fd, bundleElt a) {
fprintf(fd, "[%.2f, %.2f, %.2f, %.2f] ",
a.s[0],a.s[1],a.s[2],a.s[3]);
}
// ONEVAL(be) - expands to slot 0 of the given bundleElt expression.
// The expansion is an lvalue when `be` is one, so it can be read or assigned.
#define ONEVAL(be) (be).s[0]
#endif