diff --git a/meson.build b/meson.build
index 4b5c19c0cf..7449fc63b1 100644
--- a/meson.build
+++ b/meson.build
@@ -576,19 +576,23 @@ if get_option('build_backends')
   ## ONNX
   ## ~~~~~~~~~~
   if get_option('onnx_libdir') != '' and get_option('onnx_include') != ''
-    onnx_lib = cc.find_library('onnxruntime',
-                               dirs: get_option('onnx_libdir'),
-                               required: true)
+    deps += cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'),
+                            required: true)
     includes += include_directories(get_option('onnx_include'), is_system: true)
-    cc.has_header('onnxruntime_cxx_api.h',
-                  required: true,
+    cc.has_header('onnxruntime_cxx_api.h', required: true,
                   args: '-I' + get_option('onnx_include'))
-    deps += [onnx_lib]
-
-    files += [
-      'src/neural/onnx/network_onnx.cc',
-    ]
-
+    if not cc.has_header('cpu_provider_factory.h',
+                         args: '-I' + get_option('onnx_include'))
+      cc.has_header('../providers/cpu/cpu_provider_factory.h', required: true,
+                    args: '-I' + get_option('onnx_include'))
+      includes += include_directories(get_option('onnx_include') + '/../providers/cpu',
+                                      is_system: true)
+    endif
+    files += 'src/neural/onnx/network_onnx.cc'
+    if cc.find_library('onnxruntime_providers_rocm',
+                       dirs: get_option('onnx_libdir'), required: false).found()
+      add_project_arguments('-DUSE_ROCM', language : 'cpp')
+    endif
     has_backends = true
   endif
 
diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc
index c458b3a585..d222f745b5 100644
--- a/src/neural/onnx/network_onnx.cc
+++ b/src/neural/onnx/network_onnx.cc
@@ -52,7 +52,7 @@
 namespace lczero {
 namespace {
 
-enum class OnnxProvider { CPU, CUDA, DML };
+enum class OnnxProvider { CPU, CUDA, DML, ROCM };
 
 class OnnxNetwork;
 
@@ -240,12 +240,22 @@ void OnnxComputation<DataType>::ComputeBlocking() {
 
     int batch = batch_size * step;
     auto input_tensor = PrepareInputs(i, batch);
-    if (network_->provider_ == OnnxProvider::DML) network_->lock_.lock();
+    // The DML onnxruntime execution provider is documented as not supporting
+    // multi-threaded calls to Run on the same inference session. We found the
+    // same to be true for the ROCm execution provider (at least for CNNs).
+    // TODO: This may be an onnxruntime/ROCm bug, check onnxruntime 1.16 release.
+    if (network_->provider_ == OnnxProvider::DML ||
+        network_->provider_ == OnnxProvider::ROCM) {
+      network_->lock_.lock();
+    }
     network_->session_[step - 1].Run(
         {}, network_->inputs_cstr_.data(), &input_tensor, 1,
         network_->outputs_cstr_.data(), output_tensors_.data(),
         output_tensors_.size());
-    if (network_->provider_ == OnnxProvider::DML) network_->lock_.unlock();
+    if (network_->provider_ == OnnxProvider::DML ||
+        network_->provider_ == OnnxProvider::ROCM) {
+      network_->lock_.unlock();
+    }
     i += batch;
   }
 }
@@ -253,7 +263,6 @@
 Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads,
                                int batch_size) {
   Ort::SessionOptions options;
-  OrtCUDAProviderOptions cuda_options;
   options.SetIntraOpNumThreads(threads);
   options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
 
@@ -276,10 +285,18 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads,
       throw Exception("ONNX backend internal error.");
 #endif
       break;
-    case OnnxProvider::CUDA:
+    case OnnxProvider::ROCM: {
+      OrtROCMProviderOptions rocm_options;
+      rocm_options.device_id = gpu;
+      options.AppendExecutionProvider_ROCM(rocm_options);
+      break;
+    }
+    case OnnxProvider::CUDA: {
+      OrtCUDAProviderOptions cuda_options;
       cuda_options.device_id = gpu;
       options.AppendExecutionProvider_CUDA(cuda_options);
       break;
+    }
     case OnnxProvider::CPU:
       auto status = OrtSessionOptionsAppendExecutionProvider_CPU(options, 0);
       if (status) {
@@ -426,6 +443,9 @@ std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
   }
 }
 
+#ifdef USE_ROCM
+REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64)
+#endif
 #ifdef USE_DML
 REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63)
 #endif