dav2: Working on jetson, output weird

ethz-asl · Nov 12, 2024 · a2db862 · a2db862
1 parent fc36a4a
commit a2db862
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 65 deletions.
diff --git a/include/usb_cam/learning/depth_anything_v2.hpp b/include/usb_cam/learning/depth_anything_v2.hpp
@@ -9,13 +9,19 @@
 
 class DepthAnythingV2 : public LearningInterface {
 public:
-    DepthAnythingV2(ros::NodeHandle* nh, std::string model_path, std::string topic) {
+    DepthAnythingV2(ros::NodeHandle* nh, std::string& model_path, std::string& metric_topic, std::string& vis_topic) {
         _INPUT_SIZE = cv::Size(_HEIGHT, _WIDTH);
         _model_path = model_path;
         _load_model();
 
         if (nh != nullptr) {
-            _depth_publication = nh->advertise<sensor_msgs::Image>(topic, 1);
+            if (!metric_topic.empty()) {
+                _depth_metric_pub = nh->advertise<sensor_msgs::Image>(metric_topic, 1);
+            }
+            if (!vis_topic.empty()) {
+                _depth_vis_pub = nh->advertise<sensor_msgs::Image>(vis_topic, 1);
+            }
+
         }
     }
 
@@ -35,27 +41,58 @@ class DepthAnythingV2 : public LearningInterface {
         }
         cv::Mat float_image;
         cv::merge(channels, float_image);
-        _input_data = float_image.reshape(1, 1).ptr<float>(0);
+
+        // Normalize the image using the mean and std for each channel
+        // Mean and std are typically for [R, G, B] channels respectively
+        std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+        std::vector<float> std = {0.229f, 0.224f, 0.225f};
+        for (int c = 0; c < 3; ++c) {
+            cv::Mat channel = channels[c];
+            channel = (channel - mean[c]) / std[c];
+        }
+
+        cv::merge(channels, float_image);
+        memcpy(_input_data, float_image.data, _input_size_float);
     }
 
-    void publish() override {
-        if (_depth_publication.getTopic() != "") {
-            cv::Mat depth_prediction = cv::Mat(_HEIGHT, _WIDTH, CV_32FC1, _output_data);
+void publish() override {
+    cv::Mat depth_prediction = cv::Mat(_HEIGHT, _WIDTH, CV_32FC1, _output_data);
+    if (_depth_metric_pub.getTopic() != "") {
+        // Raw depth prediction (in meters, CV_32FC1)
+        cv_bridge::CvImage depth_image;
+        depth_image.header.stamp = ros::Time::now();
+        depth_image.header.frame_id = "depth_frame";
+        depth_image.encoding = sensor_msgs::image_encodings::TYPE_32FC1; // 32-bit float
+        depth_image.image = depth_prediction;
+        _depth_metric_pub.publish(depth_image.toImageMsg());
 
-            cv_bridge::CvImage depth_image;
-            depth_image.header.stamp = ros::Time::now();
-            depth_image.header.frame_id = "depth_frame";
-            depth_image.encoding = sensor_msgs::image_encodings::TYPE_32FC1;
-            depth_image.image = depth_prediction;
-            _depth_publication.publish(depth_image.toImageMsg());
-        }
     }
 
+    if (_depth_vis_pub.getTopic() != "") {
+        cv::Mat depth_visualized;
+        double min_val, max_val;
+        cv::minMaxLoc(depth_prediction, &min_val, &max_val);
+
+        // Normalize depth image for visualization
+        // Ignore values <= 0 (i.e., no depth / invalid depth)
+        cv::normalize(depth_prediction, depth_visualized, 0, 255, cv::NORM_MINMAX);
+        depth_visualized.convertTo(depth_visualized, CV_8UC1);
+
+        cv_bridge::CvImage visualized_image;
+        visualized_image.header.stamp = ros::Time::now();
+        visualized_image.header.frame_id = "depth_frame";
+        visualized_image.encoding = sensor_msgs::image_encodings::MONO8; // 8-bit grayscale for visualization
+        visualized_image.image = depth_visualized;
+        _depth_vis_pub.publish(visualized_image.toImageMsg());
+    }
+}
+
 private:
     const size_t _HEIGHT = 518;
     const size_t _WIDTH = 518;
     cv::Size _INPUT_SIZE;
-    ros::Publisher _depth_publication;
+    ros::Publisher _depth_metric_pub;
+    ros::Publisher _depth_vis_pub;
 };
 
 #endif // DEPTH_ANYTHING_HPP_
diff --git a/include/usb_cam/learning/interface.hpp b/include/usb_cam/learning/interface.hpp
@@ -35,15 +35,17 @@ class LearningInterface {
     nvinfer1::IExecutionContext* _context;
     nvinfer1::INetworkDefinition* _network;
     nvinfer1::IRuntime* _runtime;
+    size_t _input_c, _input_h, _input_w, _input_size_float;
+    size_t _output_c, _output_h, _output_w, _output_size_float;
     std::string _model_path;
 
     void _load_model();
 
 private:
+    static constexpr size_t JETSON_MEM_LIMIT_B{3ULL * 1024 * 1024 * 1024};
+    static std::mutex predict_mutex;
     void* _buffers[2] = { nullptr, nullptr };
 
-    // Global mutex for prediction calls
-    static std::mutex predict_mutex;
 
     // TODO: static?
     class Logger : public nvinfer1::ILogger {

diff --git a/include/usb_cam/usb_cam.hpp b/include/usb_cam/usb_cam.hpp
@@ -120,7 +120,8 @@ typedef struct
   std::string pixel_format_name;
   std::string av_device_format;
   std::string dav2_file;
-  std::string dav2_topic;
+  std::string dav2_metric_topic;
+  std::string dav2_vis_topic;
   int image_width;
   int image_height;
   int framerate;

diff --git a/launch/two_mipi_rgb.launch b/launch/two_mipi_rgb.launch
@@ -18,8 +18,9 @@
     <param name="wb_green_gain" value="0.88"/>
     <param name="wb_red_gain" value="1.18"/>
 
-    <param name="depth_anything_v2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_v2_vits_16.onnx" />
-    <param name="depth_anything_v2_topic" value="/depth0" />
+    <param name="dav2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_vits14.onnx" />
+    <param name="dav2_metric_topic" value="/depth0/relative" />
+    <param name="dav2_vis_topic" value="/depth0/vis" />
   </node>
 
   <node name="cam1" pkg="usb_cam" type="usb_cam_node" output="screen" >
@@ -39,7 +40,8 @@
     <param name="wb_green_gain" value="0.88"/>
     <param name="wb_red_gain" value="1.18"/>
 
-    <!-- <param name="depth_anything_v2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_v2_vits_16.onnx" /> -->
-    <!-- <param name="depth_anything_v2_topic" value="/depth1" /> -->
+    <!-- <param name="dav2_file" value="/home/asl/Workspace/src/v4l2_camera/test/resources/depth_anything_v2_vits_16_indoor.onnx" /> -->
+    <!-- <param name="dav2_metric_topic" value="/depth1/relative" /> -->
+    <!-- <param name="dav2_vis_topic" value="/depth1/vis" /> -->
   </node>
 </launch>
diff --git a/src/interface.cpp b/src/interface.cpp
@@ -9,35 +9,44 @@ std::mutex LearningInterface::predict_mutex;
 void LearningInterface::_load_model() {
     std::ifstream file_check(_model_path);
     if (!file_check.good()) {
-        std::cerr << "Error: " << _model_path << "' does not exist." << std::endl;
+        std::cerr << "Error: " << _model_path << " does not exist." << std::endl;
         throw std::runtime_error("Model file not found");
     }
     file_check.close();
 
-    if (_model_path.find(".onnx") == std::string::npos) {
-        std::ifstream engine_stream(_model_path, std::ios::binary);
-        engine_stream.seekg(0, std::ios::end);
+    // Define the expected .engine path based on the .onnx model path
+    std::string engine_path = _model_path.substr(0, _model_path.find_last_of('.')) + ".engine";
 
-        const size_t model_size = engine_stream.tellg();
-        engine_stream.seekg(0, std::ios::beg);
+    if (_model_path.find(".onnx") != std::string::npos) {
+        // Check if the engine file already exists
+        std::ifstream engine_check(engine_path, std::ios::binary);
 
-        std::unique_ptr<char[]> engine_data(new char[model_size]);
-        engine_stream.read(engine_data.get(), model_size);
-        engine_stream.close();
+        if (engine_check.good()) {
+            engine_check.seekg(0, std::ios::end);
+            const size_t model_size = engine_check.tellg();
+            engine_check.seekg(0, std::ios::beg);
 
-        // Create tensorrt model
-        _runtime = nvinfer1::createInferRuntime(_logger);
-        _engine = _runtime->deserializeCudaEngine(engine_data.get(), model_size);
-        _context = _engine->createExecutionContext();
+            std::unique_ptr<char[]> engine_data(new char[model_size]);
+            engine_check.read(engine_data.get(), model_size);
+            engine_check.close();
+
+            // Create TensorRT runtime and load engine
+            _runtime = nvinfer1::createInferRuntime(_logger);
+            _engine = _runtime->deserializeCudaEngine(engine_data.get(), model_size);
+            _context = _engine->createExecutionContext();
+
+        } else {
+            // Build an engine from the .onnx model and save it as .engine
+            _build(_model_path);
+            _save_engine(engine_path);
+        }
 
     } else {
-        // Build an engine from an onnx model
-        _build(_model_path);
-        _save_engine(_model_path);
+        std::cerr << "Error: Only .onnx model files are supported." << std::endl;
+        throw std::runtime_error("Unsupported model format");
     }
 
     // Define input dimensions
-    // TODO: Check dimensions, set as members
 #if NV_TENSORRT_MAJOR < 10
     auto input_dims = _engine->getBindingDimensions(0);
     auto output_dims = _engine->getBindingDimensions(1);
@@ -46,23 +55,33 @@ void LearningInterface::_load_model() {
     auto output_dims = _engine->getTensorShape(_engine->getIOTensorName(1));
 #endif
 
-    const size_t input_h = input_dims.d[2];
-    const size_t input_w = input_dims.d[3];
-    ROS_INFO("%ld, %ld", input_h, input_w);
+    // TODO: THis does not generalize so well
+    _input_c = input_dims.d[1];
+    _input_h = input_dims.d[2];
+    _input_w = input_dims.d[3];
+    _output_c = output_dims.d[0];
+    _output_h = output_dims.d[1];
+    _output_w = output_dims.d[2];
+    _input_size_float = _input_c * _input_h * _input_w * sizeof(float);
+    _output_size_float = _output_c * _output_h * _output_w * sizeof(float);
 
     cudaStreamCreate(&_stream);
 
-    cudaMalloc(&_buffers[0], 3 * input_h * input_w * sizeof(float));
-    cudaMalloc(&_buffers[1], input_h * input_w * sizeof(float));
+    cudaMalloc(&_buffers[0], _input_size_float);
+    cudaMalloc(&_buffers[1], _output_size_float);
 
-    _output_data = new float[input_h * input_w];
+    _input_data = new float[_input_c * _input_h * _input_w];
+    _output_data = new float[_output_c * _output_h * _output_w];
 }
 
 void LearningInterface::_build(std::string onnx_path) {
     auto builder = createInferBuilder(_logger);
     const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
     INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
     IBuilderConfig* config = builder->createBuilderConfig();
+
+    // TODO: What about different hardware?
+    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, JETSON_MEM_LIMIT_B);
     config->setFlag(BuilderFlag::kFP16);
     nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, _logger);
     bool parsed = parser->parseFromFile(onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO));
@@ -78,16 +97,7 @@ void LearningInterface::_build(std::string onnx_path) {
     delete plan;
 }
 
-bool LearningInterface::_save_engine(const std::string& onnx_path) {
-    std::string engine_path;
-    size_t dot_index = onnx_path.find_last_of(".");
-    if (dot_index != std::string::npos) {
-        engine_path = onnx_path.substr(0, dot_index) + ".engine";
-
-    } else {
-        return false;
-    }
-
+bool LearningInterface::_save_engine(const std::string& engine_path) {
     if (_engine) {
         nvinfer1::IHostMemory* data = _engine->serialize();
         std::ofstream file;
@@ -107,9 +117,7 @@ bool LearningInterface::_save_engine(const std::string& onnx_path) {
 
 void LearningInterface::predict() {
     if (predict_mutex.try_lock()) {
-        ROS_INFO("INSIDE PREDICT");
-        // TODO: Use input, output size members to set memcpy size correctly
-        cudaMemcpyAsync(_buffers[0], _input_data, 0, cudaMemcpyHostToDevice, _stream);
+        cudaMemcpyAsync(_buffers[0], _input_data, _input_size_float , cudaMemcpyHostToDevice, _stream);
 
 #if NV_TENSORRT_MAJOR < 10
         _context->enqueueV2(_buffers, _stream, nullptr);
@@ -118,17 +126,17 @@ void LearningInterface::predict() {
 #endif
 
         cudaStreamSynchronize(_stream);
-        // TODO: Use input, output size members to set memcpy size correctly
-        cudaMemcpyAsync(_output_data, _buffers[1], 0, cudaMemcpyDeviceToHost);
+        cudaMemcpyAsync(_output_data, _buffers[1], _output_size_float, cudaMemcpyDeviceToHost);
         predict_mutex.unlock();
 
-    } else {
-        ROS_INFO("predict in progress, skipping");
     }
 }
 
 LearningInterface::~LearningInterface() {
     cudaFree(_stream);
     cudaFree(_buffers[0]);
     cudaFree(_buffers[1]);
+
+    delete[] _input_data;
+    delete[] _output_data;
 }
diff --git a/src/usb_cam_node.cpp b/src/usb_cam_node.cpp
@@ -124,11 +124,12 @@ class UsbCamNode {
         }
 
         // Setup the network that outputs derivates of the image captured
-        m_node.param("depth_anything_v2_file", m_parameters.dav2_file, std::string(""));
-        m_node.param("depth_anything_v2_topic", m_parameters.dav2_topic, std::string(""));
+        m_node.param("dav2_file", m_parameters.dav2_file, std::string(""));
+        m_node.param("dav2_metric_topic", m_parameters.dav2_metric_topic, std::string(""));
+        m_node.param("dav2_vis_topic", m_parameters.dav2_vis_topic, std::string(""));
 
-        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_topic.empty()) {
-            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_topic));
+        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_metric_topic.empty()) {
+            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_metric_topic, m_parameters.dav2_vis_topic));
         }
 
         ROS_INFO("Starting '%s' (%s) at %dx%d via %s (%s) at %i FPS",