-
Notifications
You must be signed in to change notification settings - Fork 2
/
gpu.h
440 lines (371 loc) · 12.8 KB
/
gpu.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
#ifndef _GPU_H
#define _GPU_H
#include <isl/ast.h>
#include <isl/id.h>
#include <isl/id_to_ast_expr.h>
#include <pet.h>
#include "ppcg.h"
#include "ppcg_options.h"
/* An access to an outer array element or an iterator.
* Accesses to iterators have an access relation that maps to an unnamed space.
* An access may be both read and write.
* If the access relation is empty, then the output dimension may
* not be equal to the dimension of the corresponding array.
*/
struct gpu_stmt_access {
/* Access reads elements */
int read;
/* Access writes elements */
int write;
/* All writes are definite writes. */
int exact_write;
/* Is a single, fixed element being accessed? */
isl_bool fixed_element;
/* The number of index expressions specified in the access. */
int n_index;
/* May access relation */
isl_map *access;
/* May access relation with as domain a mapping from iteration domain
* to a reference identifier.
*/
isl_map *tagged_access;
/* The reference id of the corresponding pet_expr. */
isl_id *ref_id;
struct gpu_stmt_access *next;
};
/* A representation of a user statement.
* "stmt" points to the corresponding pet statement.
* "id" is the identifier of the instance set of the statement.
* "accesses" is a linked list of accesses performed by the statement.
* If the statement has been killed, i.e., if it will not be scheduled,
* then this linked list may be empty even if the actual statement does
* perform accesses.
*/
struct gpu_stmt {
isl_id *id;
struct pet_stmt *stmt;
struct gpu_stmt_access *accesses;
};
/* Represents an outer array possibly accessed by a gpu_prog.
*/
struct gpu_array_info {
/* The array data space. */
isl_space *space;
/* Element type. */
char *type;
/* Element size. */
int size;
/* Name of the array. */
char *name;
/* Declared extent of original array. */
isl_set *declared_extent;
/* AST expression for declared size of original array. */
isl_ast_expr *declared_size;
/* Extent of the array that needs to be copied. */
isl_set *extent;
/* Number of indices. */
unsigned n_index;
/* For each index, a bound on "extent" in that direction. */
isl_multi_pw_aff *bound;
/* The corresponding access AST expression, if the array needs
* to be allocated on the device.
*/
isl_ast_expr *bound_expr;
/* All references to this array; point to elements of a linked list. */
int n_ref;
struct gpu_stmt_access **refs;
/* Is this array accessed at all by the program? */
int accessed;
/* Is this a scalar that is read-only within the entire program? */
int read_only_scalar;
/* Are the elements of the array structures? */
int has_compound_element;
/* Are the elements only accessed through constant index expressions? */
int only_fixed_element;
/* Is the array local to the scop? */
int local;
/* Is the array local and should it be declared on the host? */
int declare_local;
/* Is the corresponding global device memory accessed in any way? */
int global;
/* Should the array be linearized? */
int linearize;
/* Order dependences on this array.
* Only used if live_range_reordering option is set.
* It is set to NULL otherwise.
*/
isl_union_map *dep_order;
};
/* Represents an outer array accessed by a ppcg_kernel, localized
* to the context of this kernel.
*
* "array" points to the corresponding array in the gpu_prog.
* The "n_group" "groups" are the reference groups associated to the array.
* If "force_private" is set, then the array (in practice a scalar)
* must be mapped to a register.
* "global" is set if the global device memory corresponding
* to this array is accessed by the kernel.
* "bound" is equal to array->bound specialized to the current kernel.
* "bound_expr" is the corresponding access AST expression.
*/
struct gpu_local_array_info {
struct gpu_array_info *array;
int n_group;
struct gpu_array_ref_group **groups;
int force_private;
int global;
unsigned n_index;
isl_multi_pw_aff *bound;
isl_ast_expr *bound_expr;
};
__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);
/* A sequence of "n" names of types.
*/
struct gpu_types {
int n;
char **name;
};
/* "read" and "write" contain the original access relations, possibly
* involving member accesses.
*
* The elements of "array", as well as the ranges of "copy_in" and "copy_out"
* only refer to the outer arrays of any possible member accesses.
*/
struct gpu_prog {
isl_ctx *ctx;
struct ppcg_scop *scop;
/* A schedule_node that is computed by get_schedule() */
isl_schedule_node *original_schedule_node;
/* Set of parameter values */
isl_set *context;
/* All potential read accesses in the entire program */
isl_union_map *read;
/* All potential write accesses in the entire program */
isl_union_map *may_write;
/* All definite write accesses in the entire program */
isl_union_map *must_write;
/* All tagged definite kills in the entire program */
isl_union_map *tagged_must_kill;
/* The set of inner array elements that may be preserved. */
isl_union_set *may_persist;
/* A mapping from all innermost arrays to their outer arrays. */
isl_union_map *to_outer;
/* A mapping from the outer arrays to all corresponding inner arrays. */
isl_union_map *to_inner;
/* A mapping from all intermediate arrays to their outer arrays,
* including an identity mapping from the anonymous 1D space to itself.
*/
isl_union_map *any_to_outer;
/* Order dependences on non-scalars. */
isl_union_map *array_order;
/* Array of statements */
int n_stmts;
struct gpu_stmt *stmts;
int n_array;
struct gpu_array_info *array;
};
struct gpu_gen {
isl_ctx *ctx;
struct ppcg_options *options;
/* Callback for printing of AST in appropriate format. */
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
struct gpu_types *types, void *user);
void *print_user;
struct gpu_prog *prog;
/* The generated AST. */
isl_ast_node *tree;
/* The sequence of types for which a definition has been printed. */
struct gpu_types types;
/* User specified tile, grid and block sizes for each kernel */
isl_union_map *sizes;
/* Effectively used tile, grid and block sizes for each kernel */
isl_union_map *used_sizes;
/* Identifier of the next kernel. */
int kernel_id;
};
enum ppcg_group_access_type {
ppcg_access_global,
ppcg_access_shared,
ppcg_access_private
};
enum ppcg_kernel_stmt_type {
ppcg_kernel_copy,
ppcg_kernel_domain,
ppcg_kernel_sync
};
/* Representation of special statements, in particular copy statements
* and __syncthreads statements, inside a kernel.
*
* type represents the kind of statement
*
*
* for ppcg_kernel_copy statements we have
*
* read is set if the statement should copy data from global memory
* to shared memory or registers.
*
* index expresses an access to the array element that needs to be copied
* local_index expresses the corresponding element in the tile
*
* array refers to the original array being copied
* local_array is a pointer to the appropriate element in the "array"
* array of the ppcg_kernel to which this copy access belongs
*
*
* for ppcg_kernel_domain statements we have
*
* stmt is the corresponding input statement
*
* n_access is the number of accesses in stmt
* access is an array of local information about the accesses
*/
struct ppcg_kernel_stmt {
enum ppcg_kernel_stmt_type type;
union {
struct {
int read;
isl_ast_expr *index;
isl_ast_expr *local_index;
struct gpu_array_info *array;
struct gpu_local_array_info *local_array;
} c;
struct {
struct gpu_stmt *stmt;
isl_id_to_ast_expr *ref2expr;
} d;
} u;
};
/* Representation of a local variable in a kernel.
*/
struct ppcg_kernel_var {
struct gpu_array_info *array;
enum ppcg_group_access_type type;
char *name;
isl_vec *size;
};
/* Representation of a kernel.
*
* prog describes the original code from which the kernel is extracted.
*
* id is the sequence number of the kernel.
*
* block_ids contains the list of block identifiers for this kernel.
* thread_ids contains the list of thread identifiers for this kernel.
*
* the first n_grid elements of grid_dim represent the specified size
* of the grid.
* the first n_block elements of block_dim represent the specified or
* effective size of the block.
* Note that in the input file, the sizes of the grid and the blocks
* are specified in the order x, y, z, but internally, the sizes
* are stored in reverse order, so that the last element always
* refers to the x dimension.
*
* grid_size reflects the effective grid size.
* grid_size_expr contains a corresponding access AST expression, built within
* the context where the launch appears.
*
* context contains the values of the parameters and outer schedule dimensions
* for which any statement instance in this kernel needs to be executed.
*
* n_sync is the number of synchronization operations that have
* been introduced in the schedule tree corresponding to this kernel (so far).
*
* core contains the spaces of the statement domains that form
* the core computation of the kernel. It is used to navigate
* the tree during the construction of the device part of the schedule
* tree in gpu_create_kernel.
*
* expanded_domain contains the original statement instances,
* i.e., those that appear in the domains of access relations,
* that are involved in the kernel.
* contraction maps those original statement instances to
* the statement instances that are active at the point
* in the schedule tree where the kernel is created.
*
* arrays is the set of possibly accessed outer array elements.
*
* space is the schedule space of the AST context. That is, it represents
* the loops of the generated host code containing the kernel launch.
*
* n_array is the total number of arrays in the input program and also
* the number of element in the array array.
* array contains information about each array that is local
* to the current kernel. If an array is not used in a kernel,
* then the corresponding entry does not contain any information.
*
* any_force_private is set if any array in the kernel is marked force_private
*
* block_filter contains constraints on the domain elements in the kernel
* that encode the mapping to block identifiers, where the block identifiers
* are represented by "n_grid" parameters with as names the elements
* of "block_ids".
*
* thread_filter contains constraints on the domain elements in the kernel
* that encode the mapping to thread identifiers, where the thread identifiers
* are represented by "n_block" parameters with as names the elements
* of "thread_ids".
*
* copy_schedule corresponds to the schedule dimensions of
* the (tiled) schedule for this kernel that have been taken into account
* for computing private/shared memory tiles.
* The domain corresponds to the original statement instances, i.e.,
* those that appear in the leaves of the schedule tree.
* copy_schedule_dim is the dimension of this schedule.
*
* sync_writes contains write references that require synchronization.
* Each reference is represented by a universe set in a space [S[i,j] -> R[]]
* with S[i,j] the statement instance space and R[] the array reference.
*/
struct ppcg_kernel {
isl_ctx *ctx;
struct ppcg_options *options;
struct gpu_prog *prog;
int id;
isl_id_list *block_ids;
isl_id_list *thread_ids;
int n_grid;
int n_block;
int grid_dim[2];
int block_dim[3];
isl_multi_pw_aff *grid_size;
isl_ast_expr *grid_size_expr;
isl_set *context;
int n_sync;
isl_union_set *core;
isl_union_set *arrays;
isl_union_pw_multi_aff *contraction;
isl_union_set *expanded_domain;
isl_space *space;
int n_array;
struct gpu_local_array_info *array;
int n_var;
struct ppcg_kernel_var *var;
int any_force_private;
isl_union_set *block_filter;
isl_union_set *thread_filter;
isl_union_pw_multi_aff *copy_schedule;
int copy_schedule_dim;
isl_union_set *sync_writes;
isl_ast_node *tree;
};
int gpu_array_is_scalar(struct gpu_array_info *array);
int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
int gpu_array_requires_device_allocation(struct gpu_array_info *array);
__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
void *gpu_prog_free(struct gpu_prog *prog);
int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);
int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
struct ppcg_options *options,
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
struct gpu_types *types, void *user), void *user);
__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
__isl_take isl_schedule_node *node, int scale,
__isl_keep isl_multi_val *sizes);
const char *get_outer_array_name(__isl_keep isl_map *access);
#endif