Commit

ci: Fix GPU usage
marcojob committed Nov 13, 2024
1 parent cb1fe4d commit 55663e6
Showing 2 changed files with 107 additions and 7 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
@@ -6,6 +6,8 @@ jobs:
runs-on: self-hosted
container:
image: omavteam/v4l2_camera:latest
options: |
--gpus all
strategy:
matrix:
@@ -31,4 +33,6 @@ jobs:
- name: Run ${{ matrix.ci_script }}
run: |
export ONNX_VERBOSE=1
export TRT_LOGGER=VERBOSE
bash -x ./ci/${{ matrix.ci_script }}.sh
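
A quick way to confirm that --gpus all actually exposes a device inside the CI container is a standalone CUDA runtime probe. This is an illustrative sketch, not part of the commit; it assumes the CUDA toolkit is available in the omavteam/v4l2_camera image.

    // gpu_check.cpp -- sanity check that the container sees a GPU.
    // Build: nvcc gpu_check.cpp -o gpu_check
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        int device_count = 0;
        const cudaError_t err = cudaGetDeviceCount(&device_count);
        if (err != cudaSuccess) {
            // Without --gpus, this typically fails (e.g. cudaErrorNoDevice).
            std::printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
            return 1;
        }
        std::printf("Visible CUDA devices: %d\n", device_count);
        return device_count > 0 ? 0 : 1;
    }
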
110 changes: 103 additions & 7 deletions src/interface.cpp
@@ -35,8 +35,10 @@ void LearningInterface::_load_model() {
if (_model_path.find(".onnx") != std::string::npos) {
// Check if the engine file already exists
std::ifstream engine_check(engine_path, std::ios::binary);


std::cout << "FOUND ONNX" << std::endl;
if (engine_check.good()) {
std::cout << "GOT ENGINE" << std::endl;
engine_check.seekg(0, std::ios::end);
const size_t model_size = engine_check.tellg();
engine_check.seekg(0, std::ios::beg);
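
The collapsed lines between this hunk and the next read the cached engine into memory and deserialize it. That code is not shown in this diff; a typical shape, with names matching the surrounding context but otherwise assumed, would be:

    // Hypothetical sketch of the elided load path: read the cached .engine
    // bytes, then deserialize them into an engine (assumes <vector> is available).
    std::vector<char> engine_data(model_size);
    engine_check.read(engine_data.data(), model_size);

    _runtime = createInferRuntime(_logger);
    _engine = _runtime->deserializeCudaEngine(engine_data.data(), model_size);
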
@@ -51,6 +53,7 @@ void LearningInterface::_load_model() {
_context = _engine->createExecutionContext();

} else {
std::cout << "NO ENGINE" << std::endl;
// Build an engine from the .onnx model and save it as .engine
_build(_model_path);
_save_engine(engine_path);
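
engine_path itself is derived outside this hunk. A common convention, assumed here purely for illustration, swaps the .onnx suffix for .engine so the built engine is cached next to the model:

    // Hypothetical derivation of engine_path; the real one is outside this diff.
    std::string engine_path = _model_path;
    const size_t pos = engine_path.rfind(".onnx");
    if (pos != std::string::npos) {
        engine_path.replace(pos, 5, ".engine");
    }
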
@@ -90,25 +93,118 @@ void LearningInterface::_load_model() {
}

void LearningInterface::_build(std::string onnx_path) {
std::cout << "BUILDING ENGINE" << std::endl;

// Create the builder
auto builder = createInferBuilder(_logger);
if (!builder) {
throw std::runtime_error("Failed to create TensorRT builder.");
}

// Set up network with explicit batch flag
const auto explicit_batch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(explicit_batch);
if (!network) {
builder->destroy();
throw std::runtime_error("Failed to create TensorRT network definition.");
}

// Create builder configuration
IBuilderConfig* config = builder->createBuilderConfig();
if (!config) {
network->destroy();
builder->destroy();
throw std::runtime_error("Failed to create TensorRT builder configuration.");
}

// TODO: What about different hardware?
// Set configuration memory pool limit
std::cout << "SETTING CONFIG MEMORY LIMIT" << std::endl;
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, JETSON_MEM_LIMIT_B);
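
One way to resolve the TODO above is to size the workspace from the device that is actually present instead of a fixed Jetson constant. A sketch, keeping JETSON_MEM_LIMIT_B from this diff as the fallback:

    // Sketch: query the active device and cap the workspace accordingly.
    size_t free_bytes = 0, total_bytes = 0;
    size_t workspace_limit = JETSON_MEM_LIMIT_B;  // fallback from this commit
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
        workspace_limit = free_bytes / 2;  // leave headroom for activations
    }
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_limit);
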

// Create parser
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, _logger);
if (!parser) {
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("Failed to create TensorRT ONNX parser.");
}

// Parse the ONNX model
std::cout << "PARSING ONNX MODEL" << std::endl;
bool parsed = parser->parseFromFile(onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO));
if (!parsed) {
std::cerr << "Failed to parse ONNX model from file: " << onnx_path << std::endl;
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("ONNX model parsing failed.");
}
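
nvonnxparser records per-error diagnostics that this error path could surface before throwing; getNbErrors() and getError() are part of the IParser interface. A sketch:

    // Sketch: print each recorded parser error for easier debugging.
    for (int i = 0; i < parser->getNbErrors(); ++i) {
        std::cerr << "ONNX parse error " << i << ": "
                  << parser->getError(i)->desc() << std::endl;
    }
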

// Build the serialized network (engine plan)
std::cout << "BUILDING SERIALIZED NETWORK" << std::endl;
IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
if (!plan) {
std::cerr << "Failed to build serialized TensorRT engine plan." << std::endl;
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("Serialized network creation failed.");
}
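
The serialized plan already contains the engine bytes, so it could be written to disk at this point without first deserializing; this commit instead saves through _save_engine after the engine is built. A sketch, assuming an engine_path as in _load_model:

    // Sketch: persist the plan directly to the engine cache file.
    std::ofstream engine_file(engine_path, std::ios::binary);
    engine_file.write(static_cast<const char*>(plan->data()),
                      static_cast<std::streamsize>(plan->size()));
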

// Create runtime
std::cout << "CREATING RUNTIME" << std::endl;
_runtime = createInferRuntime(_logger);
if (!_runtime) {
std::cerr << "Failed to create TensorRT runtime." << std::endl;
plan->destroy();
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("Runtime creation failed.");
}

// Deserialize the engine from the plan
std::cout << "DESERIALIZING ENGINE" << std::endl;
_engine = _runtime->deserializeCudaEngine(plan->data(), plan->size());
if (!_engine) {
std::cerr << "Failed to deserialize CUDA engine from serialized plan." << std::endl;
_runtime->destroy();
plan->destroy();
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("CUDA engine deserialization failed.");
}

// Create execution context
std::cout << "CREATING EXECUTION CONTEXT" << std::endl;
_context = _engine->createExecutionContext();
if (!_context) {
std::cerr << "Failed to create execution context from CUDA engine." << std::endl;
_engine->destroy();
_runtime->destroy();
plan->destroy();
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();
throw std::runtime_error("Execution context creation failed.");
}

// Clean up resources
std::cout << "CLEANING UP RESOURCES" << std::endl;
plan->destroy();
parser->destroy();
config->destroy();
network->destroy();
builder->destroy();

std::cout << "ENGINE BUILD SUCCESSFUL" << std::endl;
}
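
Every error path above repeats the same destroy() chain. An alternative (not what this commit does) is an RAII wrapper, so TensorRT objects release themselves on scope exit:

    // Sketch: unique_ptr with a destroy()-calling deleter (needs <memory>).
    struct TrtDestroyer {
        template <typename T>
        void operator()(T* p) const { if (p) p->destroy(); }
    };
    template <typename T>
    using TrtUnique = std::unique_ptr<T, TrtDestroyer>;

    // Usage: objects are destroyed in reverse order automatically, so the
    // manual cleanup and per-branch destroy() calls disappear.
    TrtUnique<nvinfer1::IBuilder> builder{createInferBuilder(_logger)};
    TrtUnique<nvinfer1::INetworkDefinition> network{
        builder->createNetworkV2(explicit_batch)};
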

bool LearningInterface::_save_engine(const std::string& engine_path) {
