From 8e1345faef8db194828feacd8f6446b358fc07ae Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Thu, 19 Sep 2024 18:08:42 -0400
Subject: [PATCH] Intentionally leak thread_local CUDA resources to avoid crash
 (part 1) (#16787)

The NVbench application `PARQUET_READER_NVBENCH` in libcudf currently crashes with the segmentation fault. To reproduce:

```
./PARQUET_READER_NVBENCH -d 0 -b 1 --run-once -a io_type=FILEPATH -a compression_type=SNAPPY -a cardinality=0 -a run_length=1
```

The root cause is that some (1) `thread_local`  objects on the main thread in `libcudf` and (2) `static` objects in `kvikio` are destroyed after `cudaDeviceReset()` in NVbench and upon program termination. These objects should simply be leaked, since their destructors making CUDA calls upon program termination constitutes UB in CUDA.

This simple PR is the cuDF side of the fix. The other part is done here https://github.com/rapidsai/kvikio/pull/462.

closes #13229

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16787
---
 cpp/src/utilities/stream_pool.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 9d3a7ce5a4e..9824c472b20 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -132,6 +132,13 @@ struct cuda_event {
   cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); }
   virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); }
 
+  // Moveable but not copyable.
+  cuda_event(const cuda_event&)            = delete;
+  cuda_event& operator=(const cuda_event&) = delete;
+
+  cuda_event(cuda_event&&)            = default;
+  cuda_event& operator=(cuda_event&&) = default;
+
   operator cudaEvent_t() { return e_; }
 
  private:
@@ -147,11 +154,12 @@ struct cuda_event {
  */
 cudaEvent_t event_for_thread()
 {
-  thread_local std::vector<std::unique_ptr<cuda_event>> thread_events(get_num_cuda_devices());
+  // The program may crash if this function is called from the main thread and user application
+  // subsequently calls cudaDeviceReset().
+  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
+  thread_local std::vector<cuda_event*> thread_events(get_num_cuda_devices());
   auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) {
-    thread_events[device_id.value()] = std::make_unique<cuda_event>();
-  }
+  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
   return *thread_events[device_id.value()];
 }