-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathqcl.hpp
1620 lines (1369 loc) · 50 KB
/
qcl.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* This file is part of QCL, a small OpenCL interface which makes it quick and
* easy to use OpenCL.
*
* Copyright (c) 2016,2017, Aksel Alpay
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef QCL_HPP
#define QCL_HPP
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include <CL/cl2.hpp>
// Disable OpenCL<->OpenGL interoperability by default
#ifndef WITH_GL_INTEROP
#define WITHOUT_GL_INTEROP
#endif
#ifndef WITHOUT_GL_INTEROP
#ifdef WIN32
// What is the correct header?
#elif __APPLE__
// What is the correct header?
#else
#include <GL/glew.h>
#include <GL/glx.h>
#endif
#endif
#include <sstream>
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include <stdexcept>
#include <map>
#include <cassert>
#include <boost/algorithm/string.hpp>
namespace qcl {
namespace detail {
/// Some functions of the OpenCL C++ Wrappers
/// will include a trailing \0 in the strings,
/// which can lead to problems. This function removes
/// '\0' from the string
static void remove_zeros(std::string& s)
{
  // Keep erasing the first embedded NUL until the search comes back empty.
  for(std::size_t nul_at = s.find('\0');
      nul_at != std::string::npos;
      nul_at = s.find('\0'))
  {
    s.erase(nul_at, 1);
  }
}
/// The processor for the QCL meta language. This
/// is particularly useful in conjunction with modules.
class meta_source_processor
{
public:
  /// Expands the QCL meta commands contained in \c source.
  /// A meta command is the text enclosed between two '$' characters.
  /// The only command handled is "pp": it is replaced by a line
  /// starting with '#' followed by the remaining command elements.
  /// \return The source with all meta commands expanded
  /// \throws std::invalid_argument if a '$' has no terminating '$' or
  /// if the command is not "pp"
  std::string operator()(const std::string& source) const
  {
    std::string processed;
    std::size_t cursor = 0;

    while(cursor < source.size())
    {
      const char current = source[cursor];
      if(current != '$')
      {
        // Ordinary character: copy through unchanged.
        processed += current;
        ++cursor;
        continue;
      }

      // Locate the '$' that closes this meta command.
      const std::size_t closing = source.find('$', cursor + 1);
      if(closing == std::string::npos)
        throw std::invalid_argument{"Error processing QCL source: "
                                    "Expected terminating $ for $ starting at ...\""
                                    +source.substr(cursor)+"\""};

      std::string command_string = source.substr(cursor + 1, closing - cursor - 1);

      std::vector<std::string> tokens;
      boost::algorithm::split(tokens, command_string, boost::algorithm::is_any_of(" \t\n"));

      if(!tokens.empty())
      {
        if(tokens[0] != "pp")
        {
          throw std::invalid_argument{"Error processing QCL source: Encountered "
                                      "invalid QCL meta command: "+tokens[0]+
                                      " (command string: "+command_string+")"};
        }

        // Emit the preprocessor directive on a line of its own.
        processed += "\n#";
        for(auto token = tokens.begin() + 1; token != tokens.end(); ++token)
        {
          processed += " ";
          processed += *token;
        }
        processed += "\n";
      }

      // Resume scanning right after the closing '$'.
      cursor = closing + 1;
    }

    return processed;
  }
};
}
/// Class representing OpenCL errors that are encountered by QCL.
class qcl_error : public std::runtime_error
{
public:
  /// \param what A description of the error
  /// \param opencl_error The encountered error code
  explicit qcl_error(const std::string& what, cl_int opencl_error)
    : std::runtime_error(what),
      _cl_err(opencl_error)
  {}

  qcl_error(const qcl_error& other) = default;
  qcl_error& operator=(const qcl_error& other) = default;

  virtual ~qcl_error() = default;

  /// \return The error code of the OpenCL error
  cl_int get_opencl_error_code() const noexcept { return _cl_err; }

private:
  /// The OpenCL error code handed to the constructor
  cl_int _cl_err;
};
/// Simple error checking function. In case of an error, throws
/// \c qcl_error
/// \throws qcl_error (derived from \c std::runtime_error) if \c err is not \c CL_SUCCESS
/// \param err The error code that shall be checked
/// \param msg The error message
static
void check_cl_error(cl_int err, const std::string& msg)
{
  // Nothing to report on success.
  if(err == CL_SUCCESS)
    return;

  // Prefix the caller's message with the numeric OpenCL error code.
  std::ostringstream description;
  description << "OpenCL error " << err << ": " << msg;
  throw qcl_error{description.str(), err};
}
/// Shared-ownership handle to an OpenCL kernel object
using kernel_ptr = std::shared_ptr<cl::Kernel>;
/// Shared-ownership handle to an OpenCL buffer object
using buffer_ptr = std::shared_ptr<cl::Buffer>;
/// Identifies a command queue of a \c device_context; used there as an
/// index into the list of command queues
using command_queue_id = std::size_t;
/// Describes a block of \c num_elements objects of type \c T. It only
/// stores the element count; it does not hold any memory itself.
template<class T>
class local_memory
{
public:
  /// \param num_elements The number of elements of type \c T
  local_memory(std::size_t num_elements)
    : _num_elements(num_elements)
  {}

  /// \return The number of elements
  std::size_t get_num_elements() const { return _num_elements; }

  /// \return The size in bytes, i.e. the element count times \c sizeof(T)
  std::size_t get_size() const { return sizeof(T) * _num_elements; }

private:
  std::size_t _num_elements;
};
/// Non-owning pair of a pointer and a size in bytes.
template<class T>
class raw_memory
{
public:
  /// \param data Pointer to the data (stored as-is, not copied)
  /// \param num_bytes The size of the data in bytes
  raw_memory(T* data, std::size_t num_bytes)
    : _data(data),
      _num_bytes(num_bytes)
  {}

  /// \return The pointer passed to the constructor
  T* get_data() const { return _data; }

  /// \return The size in bytes passed to the constructor
  std::size_t get_size() const { return _num_bytes; }

private:
  T* _data;
  std::size_t _num_bytes;
};
template<class T>
class device_array;
namespace detail {
/// Set of overloads to allow passing
/// QCL memory wrapper objects directly as kernel arguments
template<class T>
cl_int set_kernel_arg(std::size_t pos,
                      const kernel_ptr& kernel,
                      const T& data)
{
  // Generic case: pass the value to the kernel unchanged.
  const cl_int status = kernel->setArg(pos, data);
  return status;
}
template<class T>
cl_int set_kernel_arg(std::size_t pos,
                      const kernel_ptr& kernel,
                      const local_memory<T>& local_mem)
{
  // Only the size is passed for local memory; the data pointer is null.
  const std::size_t num_bytes = local_mem.get_size();
  return kernel->setArg(pos, num_bytes, nullptr);
}
template<class T>
cl_int set_kernel_arg(std::size_t pos,
                      const kernel_ptr &kernel,
                      const raw_memory<T> &mem)
{
  // cl::Kernel::setArg expects (index, size in bytes, pointer). The
  // previous code passed the pointer and the size in swapped order,
  // which matches no setArg overload.
  return kernel->setArg(pos, mem.get_size(), mem.get_data());
}
// Overload for device_array arguments. Only declared here, because
// device_array is merely forward-declared at this point; the definition
// is expected further down in the file -- TODO confirm.
template<class T>
cl_int set_kernel_arg(std::size_t pos,
const kernel_ptr& kernel,
const device_array<T>& array);
} // detail
/// This class simplifies passing arguments to kernels
/// by counting the argument index automatically.
class kernel_argument_list
{
public:
  /// Construct object
  /// \param kernel The kernel for which the arguments shall
  /// be set. Must not be null.
  explicit kernel_argument_list(const kernel_ptr& kernel)
    : _kernel{kernel}, _num_arguments{0}
  {
    assert(kernel != nullptr);
  }

  /// Pass argument to the kernel at the next free argument index.
  /// \return The OpenCL error code
  /// \param data The kernel argument
  template<class T>
  cl_int push(const T& data)
  {
    const cl_int status = detail::set_kernel_arg(_num_arguments, _kernel, data);
    // The index advances regardless of the returned error code.
    _num_arguments += 1;
    return status;
  }

  /// Pass argument to the kernel at the next free argument index.
  /// \return The OpenCL error code
  /// \param data The kernel argument
  /// \param size The size in bytes of the argument
  cl_int push(const void* data, std::size_t size)
  {
    const cl_int status = _kernel->setArg(_num_arguments, size, data);
    _num_arguments += 1;
    return status;
  }

  /// \return The number of arguments that have been set
  unsigned get_num_pushed_arguments() const noexcept { return _num_arguments; }

  /// Resets the argument counter, hence allowing to set
  /// kernel arguments again.
  void reset() { _num_arguments = 0; }

private:
  /// The kernel receiving the arguments
  kernel_ptr _kernel;
  /// Index that the next pushed argument will be assigned to
  unsigned _num_arguments;
};
/// Represents the OpenCL context of a device. This class contains everything
/// that is needed to execute OpenCL commands on a device. It stores one cl::Context,
/// at least one cl::CommandQueue and a number of kernels that have been compiled for the device.
class device_context
{
public:
/// Create NULL object
// Note: unlike the other constructors, this one does not call init_device().
device_context()
{}
/// \param platform The OpenCL platform to which the device belongs
/// \param device The OpenCL device to which the instance shall be bound
device_context(const cl::Platform& platform, const cl::Device& device)
  : _device(device)
{
  // Zero-terminated property list naming the platform of the new context.
  cl_context_properties cprops[3] = {
    CL_CONTEXT_PLATFORM,
    reinterpret_cast<cl_context_properties>(platform()),
    0
  };

  cl_int err;
  // No notification callback and no user data are registered.
  _context = cl::Context(device, cprops, nullptr, nullptr, &err);
  check_cl_error(err, "Could not spawn CL context!");

  init_device();
}
/// Create object from existing cl context and a given device
device_context(const cl::Context& context, const cl::Device& device)
  : _context(context),
    _device(device)
{
  // The supplied context is reused; no new cl::Context is created here.
  this->init_device();
}
// Copying is disabled: both copy operations are deleted.
device_context(const device_context& other) = delete;
device_context& operator=(const device_context& other) = delete;
/// \return The device that is accessed by this context
const cl::Device& get_device() const
{
  // Read-only access to the stored device.
  return this->_device;
}
/// \return The underlying OpenCL context
const cl::Context& get_context() const
{
  // Read-only access to the stored context.
  return this->_context;
}
/// \return The underlying OpenCL command queue
/// \param queue The id of the command queue. The id must be greater or
/// equal than 0 and smaller than \c get_num_command_queues().
const cl::CommandQueue& get_command_queue(command_queue_id queue) const
{
  // The id is only range-checked in builds with assertions enabled.
  assert(_queues.size() > queue);
  return this->_queues[queue];
}
/// \return The underlying OpenCL command queue
/// \param queue The id of the command queue. The id must be greater or
/// equal than 0 and smaller than \c get_num_command_queues().
cl::CommandQueue& get_command_queue(command_queue_id queue)
{
  // The id is only range-checked in builds with assertions enabled.
  assert(_queues.size() > queue);
  return this->_queues[queue];
}
/// \return The underlying OpenCL command queue
const cl::CommandQueue& get_command_queue() const
{
  // The queue with id 0 is the first element of the queue list.
  return _queues.front();
}
/// \return The underlying OpenCL command queue
cl::CommandQueue& get_command_queue()
{
  // The queue with id 0 is the first element of the queue list.
  return _queues.front();
}
/// \return The device name
std::string get_device_name() const
{
std::string dev_name;
check_cl_error(_device.getInfo(CL_DEVICE_NAME, &dev_name),
"Could not obtain device information!");
// Apparently, the OpenCL wrappers leave '\0' in the string (Bug?). This
// can lead to problems when concatenating the device_name with other strings
// and using c_str()
detail::remove_zeros(dev_name);
return dev_name;
}
/// \return The device vendor
std::string get_device_vendor() const
{
std::string vendor;
check_cl_error(_device.getInfo(CL_DEVICE_VENDOR, &vendor),
"Could not obtain device information!");
detail::remove_zeros(vendor);
return vendor;
}
/// \return The OpenCL version supported by the device
std::string get_device_cl_version() const
{
std::string cl_version;
check_cl_error(_device.getInfo(CL_DEVICE_VERSION, &cl_version),
"Could not obtain device information!");
detail::remove_zeros(cl_version);
return cl_version;
}
/// \return The driver version
std::string get_driver_version() const
{
std::string driver_version;
check_cl_error(_device.getInfo(CL_DRIVER_VERSION, &driver_version),
"Could not obtain device information!");
detail::remove_zeros(driver_version);
return driver_version;
}
/// \return The type of the device
cl_device_type get_device_type() const
{
  // Returns the stored value; no device query is issued here.
  return this->_device_type;
}
/// \return Whether the device is classified as a CPU
bool is_cpu_device() const
{
return get_device_type() == CL_DEVICE_TYPE_CPU;
}
/// \return Whether the device is classified as a GPU
bool is_gpu_device() const
{
return get_device_type() == CL_DEVICE_TYPE_GPU;
}
/// Compiles a file containing CL source code, and creates kernel objects
/// for the kernels contained in the file.
/// \param cl_source_file The CL source file
/// \param kernel_names The names of the kernels in the file as strings
/// \param scope The scope under which the kernels will be registered. If
/// scope is not an empty string, kernels will be registered under the name
/// \c scope::kernel_name
void register_source_file(const std::string& cl_source_file,
                          const std::vector<std::string>& kernel_names,
                          const std::string& scope = "")
{
  // The file name is also passed as the program name of the source.
  register_source_code(this->read_source_file(cl_source_file),
                       kernel_names,
                       cl_source_file,
                       scope);
}
/// Compiles CL source code in the form of a \c std::string, and creates kernel objects
/// for the kernels defined in the string.
/// \param cl_source The CL source code
/// \param kernel_names The names of the kernels defined in the string
void register_source_code(const std::string& cl_source,
const std::vector<std::string>& kernel_names)
{
// generate program name
std::string program_name;
for(const std::string& str : kernel_names)
program_name += str;
register_source_code(cl_source, kernel_names, program_name);
}
/// Compiles CL source code in the form of a \c std::string, and creates kernel objects
/// for the kernels defined in the string.
/// \param source_code The CL source code
/// \param kernel_names The names of the kernels defined in the string
/// \param program_name A unique identifier of the source code. This
/// identifier will be used to cache the compiled program.
/// \param scope The scope under which the kernels will be registered. If
/// scope is not an empty string, kernels will be registered under the name
/// \c scope::kernel_name
void register_source_code(const std::string& source_code,
const std::vector<std::string>& kernel_names,
const std::string& program_name,
const std::string& scope = "")
{
std::string scope_prefix = "";
if(scope.size() > 0)
scope_prefix = scope + "::";
std::vector<std::string> new_kernels;
for(const auto& kernel_name : kernel_names)
{
if(_kernels.find(scope_prefix+kernel_name) == _kernels.end())
new_kernels.push_back(kernel_name);
}
if(new_kernels.size() > 0)
{
auto cached_program = _program_cache.find(program_name);
if(cached_program == _program_cache.end())
{
detail::meta_source_processor source_processor;
cl::Program prog;
compile_source(source_processor(source_code), prog);
_program_cache[program_name] = prog;
}
load_kernels(_program_cache[program_name], new_kernels, scope);
}
}
/// Compiles a CL source module and creates kernel objects
/// \tparam Source_module_type The source module
/// \param kernel_names The names of the kernels defined in the source module
template<class Source_module_type>
void register_source_module(const std::vector<std::string>& kernel_names)
{
register_source_code(Source_module_type::_qcl_source(),
kernel_names,
Source_module_type::_qcl_get_module_name(),
Source_module_type::_qcl_get_module_name());
}
/// \return A kernel object
/// \param kernel_name The name of the kernel. Throws \c std::runtime_error if
/// the kernel is not found.
kernel_ptr get_kernel(const std::string& kernel_name)
{
  // Look the name up with find() instead of operator[]. operator[] would
  // insert a null entry for an unknown name, and register_source_code()
  // treats every existing entry as already registered -- so after one
  // failed lookup the kernel could never be loaded under that name.
  const auto entry = _kernels.find(kernel_name);
  if(entry == _kernels.end() || !entry->second)
    throw std::runtime_error("Requested kernel could not be found!");
  return entry->second;
}
/// Create an OpenCL buffer object. For CPU devices, it will
/// be attempted to construct a zero-copy buffer. In this case
/// \c initial_data must remain valid!
/// \return A pointer to the created buffer object
/// \param flags The OpenCL flags for the allocated memory
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
buffer_ptr create_buffer(cl_mem_flags flags,
                         std::size_t size,
                         T* initial_data = nullptr) const
{
  const bool has_host_data = (initial_data != nullptr);

  if(this->is_cpu_device())
  {
    // Try a zero-copy buffer: use the caller's memory if there is any,
    // otherwise request a host-pointer allocation.
    flags |= has_host_data ? CL_MEM_USE_HOST_PTR : CL_MEM_ALLOC_HOST_PTR;
  }
  else if(has_host_data)
  {
    // Other devices get a copy of the initial data.
    flags |= CL_MEM_COPY_HOST_PTR;
  }

  cl_int err;
  // size counts elements; the buffer constructor takes bytes.
  buffer_ptr buff = std::make_shared<cl::Buffer>(
      _context, flags, size * sizeof(T), initial_data, &err);
  check_cl_error(err, "Could not create buffer object!");
  return buff;
}
/// Create an OpenCL buffer object. For CPU devices, it will
/// be attempted to construct a zero-copy buffer. In this case
/// \c initial_data must remain valid!
/// \param out The resulting new buffer object
/// \param flags The OpenCL flags for the allocated memory
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
void create_buffer(cl::Buffer& out,
                   cl_mem_flags flags,
                   std::size_t size,
                   T* initial_data = nullptr) const
{
  const bool has_host_data = (initial_data != nullptr);

  if(this->is_cpu_device())
  {
    // Try a zero-copy buffer: use the caller's memory if there is any,
    // otherwise request a host-pointer allocation.
    flags |= has_host_data ? CL_MEM_USE_HOST_PTR : CL_MEM_ALLOC_HOST_PTR;
  }
  else if(has_host_data)
  {
    // Other devices get a copy of the initial data.
    flags |= CL_MEM_COPY_HOST_PTR;
  }

  cl_int err;
  // size counts elements; the buffer constructor takes bytes.
  const std::size_t num_bytes = size * sizeof(T);
  out = cl::Buffer(_context, flags, num_bytes, initial_data, &err);
  check_cl_error(err, "Could not create buffer object!");
}
/// Create a read-write OpenCL buffer object. For CPU devices, it will
/// be attempted to construct a zero-copy buffer. In this case
/// \c initial_data must remain valid!
/// \param out The created buffer
/// \param size The number of elements of the buffer
/// \param initial_data The initial_data buffer.
template<class T>
void create_buffer(cl::Buffer& out,
                   std::size_t size,
                   T* initial_data = nullptr) const
{
  // Forward to the flags overload with read-write access.
  const cl_mem_flags access = CL_MEM_READ_WRITE;
  this->create_buffer<T>(out, access, size, initial_data);
}
/// Create a read-write OpenCL buffer object. For CPU devices, it will
/// be attempted to construct a zero-copy buffer. In this case
/// \c initial_data must remain valid!
/// \return A pointer to the created buffer object
/// \param size The number of elements of the buffer
/// \param initial_data The initial_data buffer.
template<class T>
buffer_ptr create_buffer(
    std::size_t size,
    T* initial_data = nullptr) const
{
  // Forward to the flags overload with read-write access.
  const cl_mem_flags access = CL_MEM_READ_WRITE;
  return this->create_buffer<T>(access, size, initial_data);
}
/// Creates a buffer that is read-only for the compute device.
/// \return A pointer to the created buffer object
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
buffer_ptr create_input_buffer(
    std::size_t size,
    T* initial_data = nullptr) const
{
  // Input buffers are created with CL_MEM_READ_ONLY.
  const cl_mem_flags access = CL_MEM_READ_ONLY;
  return this->create_buffer<T>(access, size, initial_data);
}
/// Creates a buffer that is write-only for the compute device.
/// \return A pointer to the created buffer object
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
buffer_ptr create_output_buffer(
    std::size_t size,
    T* initial_data = nullptr) const
{
  // Output buffers are created with CL_MEM_WRITE_ONLY.
  const cl_mem_flags access = CL_MEM_WRITE_ONLY;
  return this->create_buffer<T>(access, size, initial_data);
}
/// Creates a buffer that is read-only for the compute device.
/// \param out The created buffer object
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
void create_input_buffer(
    cl::Buffer& out,
    std::size_t size,
    T* initial_data = nullptr) const
{
  // Input buffers are created with CL_MEM_READ_ONLY.
  const cl_mem_flags access = CL_MEM_READ_ONLY;
  this->create_buffer<T>(out, access, size, initial_data);
}
/// Creates a buffer that is write-only for the compute device.
/// \param out The created buffer object
/// \param size The number of elements to allocate
/// \tparam T The data type for which memory should be allocated
/// \param initial_data A pointer to data that will be used to initialize the buffer
/// if \c initial_data is not \c NULL.
template<class T>
void create_output_buffer(
    cl::Buffer& out,
    std::size_t size,
    T* initial_data = nullptr) const
{
  // Output buffers are created with CL_MEM_WRITE_ONLY.
  const cl_mem_flags access = CL_MEM_WRITE_ONLY;
  this->create_buffer<T>(out, access, size, initial_data);
}
/// Blocking copy of \c size elements of type \c T from the host memory
/// \c data to the beginning of the device buffer \c buff.
/// \param queue The id of the command queue to use
template<class T>
void memcpy_h2d(const cl::Buffer& buff,
                const T* data,
                std::size_t size,
                command_queue_id queue = 0) const
{
  // size counts elements; OpenCL expects the transfer size in bytes.
  const std::size_t num_bytes = size * sizeof(T);
  const cl_int err = get_command_queue(queue).enqueueWriteBuffer(
      buff, CL_TRUE, 0, num_bytes, data);
  check_cl_error(err, "Could not enqueue buffer write!");
}
/// Non-blocking copy of \c size elements of type \c T from the host
/// memory \c data to the beginning of the device buffer \c buff.
/// \param event Passed to the enqueue call as the event of this transfer
/// \param dependencies Optional events passed to the enqueue call as wait list
/// \param queue The id of the command queue to use
template<class T>
void memcpy_h2d_async(const cl::Buffer& buff,
                      const T* data, std::size_t size, cl::Event* event,
                      const std::vector<cl::Event>* dependencies=nullptr,
                      command_queue_id queue = 0) const
{
  // size counts elements; OpenCL expects the transfer size in bytes.
  const std::size_t num_bytes = size * sizeof(T);
  const cl_int err = get_command_queue(queue).enqueueWriteBuffer(
      buff, CL_FALSE, 0, num_bytes, data, dependencies, event);
  check_cl_error(err, "Could not enqueue async buffer write!");
}
/// Blocking copy of \c size elements of type \c T from the beginning of
/// the device buffer \c buff to the host memory \c data.
/// \param queue The id of the command queue to use
/// \throws qcl_error if the read could not be enqueued
template<class T>
void memcpy_d2h(T* data,
                const cl::Buffer& buff,
                std::size_t size,
                command_queue_id queue = 0) const
{
  // size counts elements; OpenCL expects the transfer size in bytes.
  const cl_int err = get_command_queue(queue).enqueueReadBuffer(
      buff, CL_TRUE, 0, size * sizeof(T), data);
  // This is a read, so report it as such (the message used to say "write").
  check_cl_error(err, "Could not enqueue buffer read!");
}
/// Non-blocking copy of \c size elements of type \c T from the beginning
/// of the device buffer \c buff to the host memory \c data.
/// \param event Passed to the enqueue call as the event of this transfer
/// \param dependencies Optional events passed to the enqueue call as wait list
/// \param queue The id of the command queue to use
/// \throws qcl_error if the read could not be enqueued
template<class T>
void memcpy_d2h_async(T* data,
                      const cl::Buffer& buff,
                      std::size_t size,
                      cl::Event* event,
                      const std::vector<cl::Event>* dependencies = nullptr,
                      command_queue_id queue = 0) const
{
  // size counts elements; OpenCL expects the transfer size in bytes.
  const cl_int err = get_command_queue(queue).enqueueReadBuffer(
      buff, CL_FALSE, 0, size * sizeof(T), data, dependencies, event);
  // This is a read, so report it as such (the message used to say "write").
  check_cl_error(err, "Could not enqueue async buffer read!");
}
/// Blocking copy from the host memory \c data into the element range
/// [\c begin, \c end) of the device buffer \c buff. \c data is read from
/// its start; it is not offset by \c begin.
/// \param queue The id of the command queue to use
template<class T>
void memcpy_h2d(const cl::Buffer& buff,
                const T* data,
                std::size_t begin, std::size_t end,
                command_queue_id queue = 0) const
{
  assert(end > begin);

  // Element indices are converted to a byte offset and a byte count.
  const std::size_t byte_offset = begin * sizeof(T);
  const std::size_t num_bytes = (end - begin) * sizeof(T);

  const cl_int err = get_command_queue(queue).enqueueWriteBuffer(
      buff, CL_TRUE, byte_offset, num_bytes, data);
  check_cl_error(err, "Could not enqueue buffer write!");
}
/// Non-blocking copy from the host memory \c data into the element range
/// [\c begin, \c end) of the device buffer \c buff. \c data is read from
/// its start; it is not offset by \c begin.
/// \param event Passed to the enqueue call as the event of this transfer
/// \param dependencies Optional events passed to the enqueue call as wait list
/// \param queue The id of the command queue to use
template<class T>
void memcpy_h2d_async(const cl::Buffer& buff,
                      const T* data,
                      std::size_t begin,
                      std::size_t end,
                      cl::Event* event,
                      const std::vector<cl::Event>* dependencies=nullptr,
                      command_queue_id queue = 0) const
{
  assert(end > begin);

  // Element indices are converted to a byte offset and a byte count.
  const std::size_t byte_offset = begin * sizeof(T);
  const std::size_t num_bytes = (end - begin) * sizeof(T);

  const cl_int err = get_command_queue(queue).enqueueWriteBuffer(
      buff, CL_FALSE, byte_offset, num_bytes, data, dependencies, event);
  check_cl_error(err, "Could not enqueue async buffer write!");
}
/// Blocking copy of the element range [\c begin, \c end) of the device
/// buffer \c buff to the host memory \c data. \c data is written from
/// its start; it is not offset by \c begin.
/// \param queue The id of the command queue to use
/// \throws qcl_error if the read could not be enqueued
template<class T>
void memcpy_d2h(T* data,
                const cl::Buffer& buff,
                std::size_t begin,
                std::size_t end,
                command_queue_id queue = 0) const
{
  assert(end > begin);
  const std::size_t size = end - begin;

  // Element indices are converted to a byte offset and a byte count.
  const cl_int err = get_command_queue(queue).enqueueReadBuffer(
      buff, CL_TRUE, begin * sizeof(T), size * sizeof(T), data);
  // This is a read, so report it as such (the message used to say "write").
  check_cl_error(err, "Could not enqueue buffer read!");
}
/// Non-blocking copy of the element range [\c begin, \c end) of the
/// device buffer \c buff to the host memory \c data. \c data is written
/// from its start; it is not offset by \c begin.
/// \param event Passed to the enqueue call as the event of this transfer
/// \param dependencies Optional events passed to the enqueue call as wait list
/// \param queue The id of the command queue to use
/// \throws qcl_error if the read could not be enqueued
template<class T>
void memcpy_d2h_async(T* data,
                      const cl::Buffer& buff,
                      std::size_t begin,
                      std::size_t end,
                      cl::Event* event,
                      const std::vector<cl::Event>* dependencies = nullptr,
                      command_queue_id queue = 0) const
{
  assert(end > begin);
  const std::size_t size = end - begin;

  // Element indices are converted to a byte offset and a byte count.
  const cl_int err = get_command_queue(queue).enqueueReadBuffer(
      buff, CL_FALSE, begin * sizeof(T), size * sizeof(T),
      data, dependencies, event);
  // This is a read, so report it as such (the message used to say "write").
  check_cl_error(err, "Could not enqueue async buffer read!");
}
/// Queries the OpenCL extensions supported by a given device.
/// \param device The OpenCL device
/// \param extensions A string that will be used to store a list of all supported extensions
static void get_supported_extensions(const cl::Device& device, std::string& extensions)
{
  // The queried string is stored as delivered; it is not post-processed.
  const cl_int status = device.getInfo(CL_DEVICE_EXTENSIONS, &extensions);
  check_cl_error(status, "Could not query extensions!");
}
/// \return Whether a given OpenCL extension is supported by a given device
/// \param device The OpenCL device
/// \param extension The name of the extension
static bool is_extension_supported(const cl::Device& device, const std::string& extension)
{
  // An empty name cannot denote an extension.
  if(extension.empty())
    return false;

  std::string extensions;
  get_supported_extensions(device, extensions);
  // The OpenCL wrappers may leave '\0' in queried strings; remove them so
  // that the last entry of the list still compares equal.
  detail::remove_zeros(extensions);

  // Compare against whole whitespace-separated entries. A plain substring
  // search would also report names that are merely contained in the name
  // of another extension.
  std::istringstream entries{extensions};
  std::string supported;
  while(entries >> supported)
  {
    if(supported == extension)
      return true;
  }
  return false;
}
/// Queries the extensions supported by the device.
/// \param extensions A string that will be used to store a list of the supported extensions
void get_supported_extensions(std::string& extensions) const
{
get_supported_extensions(this->_device, extensions);
}
/// \return Whether a given OpenCL extension is supported by the device
/// \param extension The name of the extension
bool is_extension_supported(const std::string& extension) const
{
return is_extension_supported(this->_device, extension);
}
/// Adds a new command queue to the device. Note that one command queue
/// is always created during the construction of the \c device_context
/// object.
/// \return The id of the created queue
/// \param props Optional properties of the queue
command_queue_id add_command_queue(cl_command_queue_properties props = 0)
{
  cl_int err;
  cl::CommandQueue new_queue(_context, _device, props, &err);
  // Check for errors before storing the queue. Previously the queue was
  // appended first, so a failed creation left an unusable entry behind
  // and shifted the ids of all queues added afterwards.
  check_cl_error(err, "Could not create command queue!");

  _queues.push_back(new_queue);
  // The id of a queue is its index in the queue list.
  return _queues.size() - 1;
}
/// Adds a new out-of-order command queue to the device
/// \return The id of the created queue
command_queue_id add_out_of_order_command_queue()
{
  // Same as add_command_queue(), with the out-of-order execution property.
  const cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  return this->add_command_queue(props);
}
/// \return The number of command queues
std::size_t get_num_command_queues() const
{
  // One entry of the queue list per command queue.
  const std::size_t count = _queues.size();
  return count;
}
/// Ensures that at least \c num_queues are available.
/// If less command queues are available, creates new command
/// queues until enough are available.
void require_several_command_queues(std::size_t num_queues)
{
while(get_num_command_queues() < num_queues)
add_command_queue();
}
/// Enqueues \c kernel on a command queue. If a local size is given, the
/// global size of each dimension is rounded up to the next multiple of
/// the local size of that dimension.
/// \return The OpenCL error code of the enqueue call
/// \param kernel The kernel to execute
/// \param minimum_num_work_items The minimum global size
/// \param num_local_items The local size; may be a \c cl::NullRange
/// \param event Passed to the enqueue call as the event of this execution
/// \param offset The global offset
/// \param dependencies Optional events passed to the enqueue call as wait list
/// \param queue The id of the command queue to use
cl_int enqueue_ndrange_kernel(const kernel_ptr& kernel,
                              const cl::NDRange& minimum_num_work_items,
                              const cl::NDRange& num_local_items,
                              cl::Event* event = nullptr,
                              const cl::NDRange& offset = cl::NullRange,
                              const std::vector<cl::Event>* dependencies = nullptr,
                              command_queue_id queue = 0)
{
  assert(queue < get_num_command_queues());

  cl::NDRange global = minimum_num_work_items;

  // Only make the global size a multiple of the local size
  // if we are not using a NullRange, i.e. the dimensions are > 0
  if(num_local_items.dimensions() > 0)
  {
    assert(minimum_num_work_items.dimensions() == num_local_items.dimensions());

    for(std::size_t dim = 0; dim < minimum_num_work_items.dimensions(); ++dim)
    {
      const std::size_t requested = minimum_num_work_items.get()[dim];
      const std::size_t group_size = num_local_items.get()[dim];

      // Round down to a multiple of the group size first, then add one
      // more group if that dropped any work items.
      std::size_t padded = requested - requested % group_size;
      if(padded != requested)
        padded += group_size;

      assert(padded % group_size == 0 && padded >= requested);
      global.get()[dim] = padded;
    }
  }

  return get_command_queue(queue).enqueueNDRangeKernel(*kernel,
                                                       offset,
                                                       global,
                                                       num_local_items,
                                                       dependencies,
                                                       event);
}
/// \return A string containing the build options for kernels compiled
/// for this device context
const std::string& get_build_options() const
{
return _build_options;
}
/// Sets the build options passed to the OpenCL compiler to the options
/// specified by the \c option_string parameter.
void set_build_options(const std::string& option_string)
{
_build_options = option_string;
}
/// Appends the supplied build option \c option to the options
/// passed to the OpenCL compiler
void append_build_option(const std::string& option)
{
  // A single space separates the new option from the existing ones.
  _build_options.append(" ").append(option);
}
/// Enables relaxed math optimizations by passing the -cl-fast-relaxed-math
/// flag to the compiler.
void enable_fast_relaxed_math()