diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index e5495fe83f..d80af5fd36 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -36,7 +36,7 @@ static constexpr SpecID BASIS_Q_1D_ID; // Interpolation kernel - tensor //------------------------------------------------------------------------------ template -static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, +static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclBundle_t &sycl_bundle, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u, CeedScalar *v) { const CeedInt buf_len = impl->buf_len; const CeedInt op_len = impl->op_len; @@ -55,7 +55,7 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t sycl_queue.submit([&](sycl::handler &cgh) { cgh.depends_on(e); - cgh.use_kernel_bundle(sycl_module); + cgh.use_kernel_bundle(sycl_bundle); sycl::local_accessor s_mem(op_len + 2 * buf_len, cgh); @@ -139,7 +139,7 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t // Gradient kernel - tensor //------------------------------------------------------------------------------ template -static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, +static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclBundle_t &sycl_bundle, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u, CeedScalar *v) { const CeedInt buf_len = impl->buf_len; const CeedInt op_len = impl->op_len; @@ -158,7 +158,7 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t & sycl_queue.submit([&](sycl::handler &cgh) { cgh.depends_on(e); - cgh.use_kernel_bundle(sycl_module); + cgh.use_kernel_bundle(sycl_bundle); sycl::local_accessor s_mem(2 * (op_len + buf_len), cgh); @@ -299,16 +299,16 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran switch (eval_mode) { case CEED_EVAL_INTERP: if (is_transpose) { - CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); + CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v)); } else { - CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); + CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v)); } break; case CEED_EVAL_GRAD: if (is_transpose) { - CeedCallBackend(CeedBasisApplyGrad_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); + CeedCallBackend(CeedBasisApplyGrad_Sycl(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v)); } else { - CeedCallBackend(CeedBasisApplyGrad_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); + CeedCallBackend(CeedBasisApplyGrad_Sycl(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v)); } break; case CEED_EVAL_WEIGHT: @@ -610,7 +610,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const input_bundle.set_specialization_constant(Q_1d); input_bundle.set_specialization_constant(P_1d); - CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle))); + CeedCallSycl(ceed, impl->sycl_bundle = new SyclBundle_t(sycl::build(input_bundle))); CeedCallBackend(CeedBasisSetData(basis, impl)); diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp index e7280b721d..bc0af096fd 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp @@ -30,7 +30,6 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { Ceed ceed; Ceed_Sycl *data; const char *read_write_kernel_path, *read_write_kernel_source; - const char *qfunction_name, *qfunction_source; CeedInt num_input_fields, num_output_fields; CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Sycl *impl; @@ -60,21 +59,21 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { std::string_view rw_source_view(read_write_kernel_source); const std::string kernel_name = "CeedKernelSyclRefQFunction_" + std::string(qf_name_view); - // std::vector input_sizes(num_input_fields); - // CeedQFunctionField *input_i = input_fields; + std::vector input_sizes(num_input_fields); + CeedQFunctionField *input_i = input_fields; - // for (auto &size_i : input_sizes) { - // CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i)); - // ++input_i; - // } + for (auto &size_i : input_sizes) { + CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i)); + ++input_i; + } - // std::vector output_sizes(num_output_fields); - // CeedQFunctionField *output_i = output_fields; + std::vector output_sizes(num_output_fields); + CeedQFunctionField *output_i = output_fields; - // for (auto &size_i : output_sizes) { - // CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i)); - // ++output_i; - // } + for (auto &size_i : output_sizes) { + CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i)); + ++output_i; + } // Defintions std::ostringstream code; @@ -88,7 +87,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { // code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) extern \"C\" void " << kernel_name code << "#include \n\n"; code << "extern \"C\" void " << kernel_name - << "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl fields) {\n"; + << "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl *fields) {\n"; // OpenCL doesn't allow for structs with pointers. // We will need to pass all of the arguments individually. @@ -97,7 +96,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { << "const CeedScalar *fields_inputs[" << num_input_fields << "];\n"; for (CeedInt i = 0; i < num_input_fields; ++i) { code << " " - << "fields_inputs[" << i << "] = fields.inputs[" << i << "];\n"; + << "fields_inputs[" << i << "] = fields->inputs[" << i << "];\n"; } // Output parameters @@ -105,7 +104,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { << "const CeedScalar *fields_outputs[" << num_output_fields << "];\n"; for (CeedInt i = 0; i < num_output_fields; ++i) { code << " " - << "fields_outputs[" << i << "] = fields.outputs[" << i << "];\n"; + << "fields_outputs[" << i << "] = fields->outputs[" << i << "];\n"; } code << "\n"; @@ -174,11 +173,10 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { CeedDebug(ceed, code.str().c_str()); // Compile kernel - CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module)); + CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), impl->sycl_module)); CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction)); // Cleanup - CeedCallBackend(CeedFree(&qfunction_source)); CeedCallBackend(CeedFree(&read_write_kernel_path)); CeedCallBackend(CeedFree(&read_write_kernel_source)); return CEED_ERROR_SUCCESS; diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp index ced56cb005..23600d80f6 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp @@ -58,44 +58,24 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &impl->fields.inputs[i])); } for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCallBackend(CeedVectorGetArrayRead(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i])); + CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i])); } // Get context data CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data)); - std::vector e; - - if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()}; - // Launch as a basic parallel_for over Q quadrature points - ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { - cgh.depends_on(e); - - int iarg{}; - cgh.set_arg(iarg, context_data); - ++iarg; - cgh.set_arg(iarg, Q); - ++iarg; - for (auto &input_i : inputs) { - cgh.set_arg(iarg, input_i); - ++iarg; - } - for (auto &output_i : outputs) { - cgh.set_arg(iarg, output_i); - ++iarg; - } // Hard-coding the work-group size for now // We could use the Level Zero API to query and set an appropriate size in future // Equivalent of CUDA Occupancy Calculator - int wg_size = WG_SIZE_QF; - sycl::range<1> rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size; - sycl::nd_range<1> kernel_range(rounded_Q, wg_size); - cgh.parallel_for(kernel_range, *(impl->QFunction)); - }); + int wg_size = WG_SIZE_QF; + sycl::range<1> rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size; + sycl::nd_range<1> kernel_range(rounded_Q, wg_size); // Call launcher function that executes kernel - *(impl->QFunction)(sycl_queue, context_data, Q, fields); + // Pass in nd_range as second argument + // Pass in vector of events as third argument + (*impl->QFunction)(ceed_Sycl->sycl_queue, kernel_range, context_data, Q, &impl->fields); // Restore vectors // U_i = U; diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp index 2b08f1f95a..daf6b4b517 100644 --- a/backends/sycl-ref/ceed-sycl-ref.hpp +++ b/backends/sycl-ref/ceed-sycl-ref.hpp @@ -52,7 +52,7 @@ typedef struct { CeedInt num_qpts; CeedInt buf_len; CeedInt op_len; - SyclModule_t *sycl_module; + SyclBundle_t *sycl_bundle; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; @@ -68,7 +68,7 @@ typedef struct { CeedScalar *d_q_weight; } CeedBasisNonTensor_Sycl; -using SyclQfunctionKernel_t = std::function, void*, CeedInt, Fields_Sycl)>; +using SyclQfunctionKernel_t = std::function, void*, CeedInt, Fields_Sycl*)>; typedef struct { SyclModule_t *sycl_module; diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp index 7b30a4e818..046d198b5b 100644 --- a/backends/sycl/ceed-sycl-compile.hpp +++ b/backends/sycl/ceed-sycl-compile.hpp @@ -12,16 +12,17 @@ #include #include -#include +#include "libprtc/prtc.h" using SyclModule_t = std::shared_ptr; +using SyclBundle_t = sycl::kernel_bundle; -CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, +CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t *sycl_module, const std::map &constants = {}); template -CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, SyclKernel_t **sycl_kernel); +int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, SyclKernel_t **sycl_kernel); template -CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, SyclKernel_t *kernel, const int grid_size, const int block_size_x, const int block_size_y, +int CeedRunKernelDimSharedSycl(Ceed ceed, SyclKernel_t *kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, const int shared_mem_size, void **args); diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 28a7411309..26132fb93a 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "ceed-sycl-common.hpp" @@ -77,12 +78,12 @@ static inline int CeedJitGetFlags_Sycl(std::vector &flags) { // TODO: Check if source, module, etc. already exists //------------------------------------------------------------------------------ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, const std::string &kernel_source, std::string& output_path, - const std::vector &flags = {}) { + const std::vector flags = {}) { // Get cache path from env variable std::string cache_root; // TODO: Add default directory to current working directory - if(std::getenv()"CEED_CACHE_DIR")) { + if(std::getenv("CEED_CACHE_DIR")) { cache_root = std::string(std::getenv("CEED_CACHE_DIR")) + "/.ceed/cache"; } else { cache_root = std::string(std::getenv("PWD")) + "/.ceed/cache"; @@ -101,14 +102,15 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_ size_t kernel_source_hash = string_hash(kernel_source); // Hash compilation flags - std::sort(flags.begin(), flags.end()); - std::string all_flags = prtc::concatenateFlags(flags); + std::vector copy_flags = flags; + std::sort(copy_flags.begin(), copy_flags.end()); + std::string all_flags = prtc::concatenateFlags(copy_flags); size_t build_options_hash = string_hash(all_flags); // Hash compiler version prtc::ShellCommand command("icpx --version"); const auto [success, compiler_version] = command.result(); - if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, compiler_version); + if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, compiler_version.c_str()); size_t compiler_hash = string_hash(compiler_version); // Determine file paths for source and binaries based on hashes @@ -124,9 +126,9 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_ // TODO: Get compiler-path and flags from env or some other means prtc::ShellCompiler compiler("icpx","-o","-c","-fPIC","-shared"); - const auto [success, message] = compiler.compileAndLink(source_file_path,object_file_path,flags); + const auto [build_success, message] = compiler.compileAndLink(source_file_path,object_file_path,flags); // Q: Should we always output the compiler output in verbose/debug mode? - if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, message); + if (!build_success) return CeedError((ceed), CEED_ERROR_BACKEND, message.c_str()); return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/libprtc/shell_compiler.h b/backends/sycl/libprtc/shell_compiler.h index 564160dbd2..f33a3e6969 100755 --- a/backends/sycl/libprtc/shell_compiler.h +++ b/backends/sycl/libprtc/shell_compiler.h @@ -6,6 +6,8 @@ namespace prtc { +std::string concatenateFlags(const std::vector& flags); + class ShellCompiler { public: ShellCompiler(const std::string& executable, const std::string& output_flag,