diff --git a/include/usb_cam/learning/depth_anything_v2.hpp b/include/usb_cam/learning/depth_anything_v2.hpp
index 978d7a20..5bd58398 100644
--- a/include/usb_cam/learning/depth_anything_v2.hpp
+++ b/include/usb_cam/learning/depth_anything_v2.hpp
@@ -9,13 +9,19 @@
 class DepthAnythingV2 : public LearningInterface {
 public:
-    DepthAnythingV2(ros::NodeHandle* nh, std::string model_path, std::string topic) {
+    DepthAnythingV2(ros::NodeHandle* nh, std::string& model_path, std::string& metric_topic, std::string& vis_topic) {
         _INPUT_SIZE = cv::Size(_HEIGHT, _WIDTH);
         _model_path = model_path;
         _load_model();

         if (nh != nullptr) {
-            _depth_publication = nh->advertise<sensor_msgs::Image>(topic, 1);
+            if (!metric_topic.empty()) {
+                _depth_metric_pub = nh->advertise<sensor_msgs::Image>(metric_topic, 1);
+            }
+            if (!vis_topic.empty()) {
+                _depth_vis_pub = nh->advertise<sensor_msgs::Image>(vis_topic, 1);
+            }
         }
     }
@@ -35,27 +41,58 @@
         }
         cv::Mat float_image;
         cv::merge(channels, float_image);
-        _input_data = float_image.reshape(1, 1).ptr<float>(0);
+
+        // Normalize each channel with the ImageNet mean and std,
+        // applied to the [R, G, B] channels respectively
+        std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+        std::vector<float> std = {0.229f, 0.224f, 0.225f};
+        for (int c = 0; c < 3; ++c) {
+            channels[c] = (channels[c] - mean[c]) / std[c];
+        }
+
+        cv::merge(channels, float_image);
+        memcpy(_input_data, float_image.data, _input_size_float);
     }

-    void publish() override {
-        if (_depth_publication.getTopic() != "") {
-            cv::Mat depth_prediction = cv::Mat(_HEIGHT, _WIDTH, CV_32FC1, _output_data);
+    void publish() override {
+        cv::Mat depth_prediction = cv::Mat(_HEIGHT, _WIDTH, CV_32FC1, _output_data);
+        if (_depth_metric_pub.getTopic() != "") {
+            // Raw depth prediction (in meters, CV_32FC1)
+            cv_bridge::CvImage depth_image;
+            depth_image.header.stamp = ros::Time::now();
+            depth_image.header.frame_id = "depth_frame";
+            depth_image.encoding = sensor_msgs::image_encodings::TYPE_32FC1;  // 32-bit float
+            depth_image.image = depth_prediction;
+            _depth_metric_pub.publish(depth_image.toImageMsg());
+        }

-            cv_bridge::CvImage depth_image;
-            depth_image.header.stamp = ros::Time::now();
-            depth_image.header.frame_id = "depth_frame";
-            depth_image.encoding = sensor_msgs::image_encodings::TYPE_32FC1;
-            depth_image.image = depth_prediction;
-            _depth_publication.publish(depth_image.toImageMsg());
-        }
-    }
+        if (_depth_vis_pub.getTopic() != "") {
+            cv::Mat depth_visualized;
+            double min_val, max_val;
+            cv::minMaxLoc(depth_prediction, &min_val, &max_val);
+
+            // Scale the depth map to [0, 255] for an 8-bit preview image
+            cv::normalize(depth_prediction, depth_visualized, 0, 255, cv::NORM_MINMAX);
+            depth_visualized.convertTo(depth_visualized, CV_8UC1);
+
+            cv_bridge::CvImage visualized_image;
+            visualized_image.header.stamp = ros::Time::now();
+            visualized_image.header.frame_id = "depth_frame";
+            visualized_image.encoding = sensor_msgs::image_encodings::MONO8;  // 8-bit grayscale for visualization
+            visualized_image.image = depth_visualized;
+            _depth_vis_pub.publish(visualized_image.toImageMsg());
+        }
+    }

 private:
     const size_t _HEIGHT = 518;
     const size_t _WIDTH = 518;
     cv::Size _INPUT_SIZE;
-    ros::Publisher _depth_publication;
+    ros::Publisher _depth_metric_pub;
+    ros::Publisher _depth_vis_pub;
 };

 #endif // DEPTH_ANYTHING_HPP_
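Note on the normalization added above: OpenCV channel arithmetic only takes effect if the result is written back into channels[c]; assigning to a local cv::Mat copy of the header rebinds the local and leaves the merged image unnormalized. A minimal standalone sketch of the intended transform, assuming a CV_32FC3 input already scaled to [0, 1] (the helper name is illustrative, not part of the patch):

    #include <opencv2/opencv.hpp>

    // Apply ImageNet mean/std normalization per RGB channel.
    cv::Mat normalize_imagenet(const cv::Mat& rgb_float) {
        std::vector<cv::Mat> channels;
        cv::split(rgb_float, channels);
        const float mean[3]   = {0.485f, 0.456f, 0.406f};
        const float stddev[3] = {0.229f, 0.224f, 0.225f};
        for (int c = 0; c < 3; ++c) {
            // Write the result back into the channel so cv::merge sees it
            channels[c] = (channels[c] - mean[c]) / stddev[c];
        }
        cv::Mat normalized;
        cv::merge(channels, normalized);
        return normalized;
    }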
diff --git a/include/usb_cam/learning/interface.hpp b/include/usb_cam/learning/interface.hpp
index 336670fc..6960ee8c 100644
--- a/include/usb_cam/learning/interface.hpp
+++ b/include/usb_cam/learning/interface.hpp
@@ -35,15 +35,17 @@
     nvinfer1::IExecutionContext* _context;
     nvinfer1::INetworkDefinition* _network;
     nvinfer1::IRuntime* _runtime;
+    size_t _input_c, _input_h, _input_w, _input_size_float;
+    size_t _output_c, _output_h, _output_w, _output_size_float;
     std::string _model_path;

     void _load_model();

 private:
+    static constexpr size_t JETSON_MEM_LIMIT_B{3ULL * 1024 * 1024 * 1024};
+    static std::mutex predict_mutex;
     void* _buffers[2] = { nullptr, nullptr };

-    // Global mutex for prediction calls
-    static std::mutex predict_mutex; // TODO: static?

     class Logger : public nvinfer1::ILogger {
diff --git a/include/usb_cam/usb_cam.hpp b/include/usb_cam/usb_cam.hpp
index fdce6523..4c41be69 100644
--- a/include/usb_cam/usb_cam.hpp
+++ b/include/usb_cam/usb_cam.hpp
@@ -120,7 +120,8 @@ typedef struct
     std::string pixel_format_name;
     std::string av_device_format;
     std::string dav2_file;
-    std::string dav2_topic;
+    std::string dav2_metric_topic;
+    std::string dav2_vis_topic;
     int image_width;
     int image_height;
     int framerate;
diff --git a/launch/two_mipi_rgb.launch b/launch/two_mipi_rgb.launch
index 24fcfded..9120ddec 100644
--- a/launch/two_mipi_rgb.launch
+++ b/launch/two_mipi_rgb.launch
@@ -18,8 +18,9 @@
-        <param name="depth_anything_v2_file" type="string" value="..."/>
-        <param name="depth_anything_v2_topic" type="string" value="..."/>
+        <param name="dav2_file" type="string" value="..."/>
+        <param name="dav2_metric_topic" type="string" value="..."/>
+        <param name="dav2_vis_topic" type="string" value="..."/>
@@ -39,7 +40,8 @@
-        <param name="depth_anything_v2_file" type="string" value="..."/>
-        <param name="depth_anything_v2_topic" type="string" value="..."/>
+        <param name="dav2_file" type="string" value="..."/>
+        <param name="dav2_metric_topic" type="string" value="..."/>
+        <param name="dav2_vis_topic" type="string" value="..."/>
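Note on the engine cache introduced in src/interface.cpp below: the engine path is derived by swapping the model file's extension for .engine, so a previously built engine is picked up on the next start instead of rebuilding from the .onnx file. A minimal sketch of that derivation, with a hypothetical guard for extensionless paths (the patched code assumes the path always contains a dot):

    #include <string>

    std::string engine_path_for(const std::string& model_path) {
        const size_t dot = model_path.find_last_of('.');
        if (dot == std::string::npos) {
            // Hypothetical fallback; _load_model() below would index with npos here
            return model_path + ".engine";
        }
        return model_path.substr(0, dot) + ".engine";
    }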
diff --git a/src/interface.cpp b/src/interface.cpp
index 221c8684..8e528839 100644
--- a/src/interface.cpp
+++ b/src/interface.cpp
@@ -9,35 +9,44 @@ std::mutex LearningInterface::predict_mutex;
 void LearningInterface::_load_model() {
     std::ifstream file_check(_model_path);
     if (!file_check.good()) {
-        std::cerr << "Error: " << _model_path << "' does not exist." << std::endl;
+        std::cerr << "Error: " << _model_path << " does not exist." << std::endl;
         throw std::runtime_error("Model file not found");
     }
     file_check.close();

-    if (_model_path.find(".onnx") == std::string::npos) {
-        std::ifstream engine_stream(_model_path, std::ios::binary);
-        engine_stream.seekg(0, std::ios::end);
+    // Derive the expected .engine path from the .onnx model path
+    std::string engine_path = _model_path.substr(0, _model_path.find_last_of('.')) + ".engine";

-        const size_t model_size = engine_stream.tellg();
-        engine_stream.seekg(0, std::ios::beg);
+    if (_model_path.find(".onnx") != std::string::npos) {
+        // Check whether a serialized engine already exists
+        std::ifstream engine_check(engine_path, std::ios::binary);

-        std::unique_ptr<char[]> engine_data(new char[model_size]);
-        engine_stream.read(engine_data.get(), model_size);
-        engine_stream.close();
+        if (engine_check.good()) {
+            engine_check.seekg(0, std::ios::end);
+            const size_t model_size = engine_check.tellg();
+            engine_check.seekg(0, std::ios::beg);

-        // Create tensorrt model
-        _runtime = nvinfer1::createInferRuntime(_logger);
-        _engine = _runtime->deserializeCudaEngine(engine_data.get(), model_size);
-        _context = _engine->createExecutionContext();
+            std::unique_ptr<char[]> engine_data(new char[model_size]);
+            engine_check.read(engine_data.get(), model_size);
+            engine_check.close();
+
+            // Create the TensorRT runtime and deserialize the cached engine
+            _runtime = nvinfer1::createInferRuntime(_logger);
+            _engine = _runtime->deserializeCudaEngine(engine_data.get(), model_size);
+            _context = _engine->createExecutionContext();
+
+        } else {
+            // Build an engine from the .onnx model and save it as .engine
+            _build(_model_path);
+            _save_engine(engine_path);
+        }

     } else {
-        // Build an engine from an onnx model
-        _build(_model_path);
-        _save_engine(_model_path);
+        std::cerr << "Error: Only .onnx model files are supported." << std::endl;
+        throw std::runtime_error("Unsupported model format");
     }

     // Define input dimensions
-    // TODO: Check dimensions, set as members
 #if NV_TENSORRT_MAJOR < 10
     auto input_dims = _engine->getBindingDimensions(0);
     auto output_dims = _engine->getBindingDimensions(1);
@@ -46,16 +55,23 @@ void LearningInterface::_load_model() {
     auto output_dims = _engine->getTensorShape(_engine->getIOTensorName(1));
 #endif

-    const size_t input_h = input_dims.d[2];
-    const size_t input_w = input_dims.d[3];
-    ROS_INFO("%ld, %ld", input_h, input_w);
+    // TODO: This does not generalize well beyond this model's tensor layout
+    _input_c = input_dims.d[1];
+    _input_h = input_dims.d[2];
+    _input_w = input_dims.d[3];
+    _output_c = output_dims.d[0];
+    _output_h = output_dims.d[1];
+    _output_w = output_dims.d[2];
+    _input_size_float = _input_c * _input_h * _input_w * sizeof(float);
+    _output_size_float = _output_c * _output_h * _output_w * sizeof(float);

     cudaStreamCreate(&_stream);
-    cudaMalloc(&_buffers[0], 3 * input_h * input_w * sizeof(float));
-    cudaMalloc(&_buffers[1], input_h * input_w * sizeof(float));
+    cudaMalloc(&_buffers[0], _input_size_float);
+    cudaMalloc(&_buffers[1], _output_size_float);

-    _output_data = new float[input_h * input_w];
+    _input_data = new float[_input_c * _input_h * _input_w];
+    _output_data = new float[_output_c * _output_h * _output_w];
 }

@@ -63,6 +79,9 @@ void LearningInterface::_build(std::string onnx_path) {
     const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
     INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
     IBuilderConfig* config = builder->createBuilderConfig();
+
+    // TODO: Memory pool limit is sized for the Jetson; what about different hardware?
+    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, JETSON_MEM_LIMIT_B);
     config->setFlag(BuilderFlag::kFP16);
     nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, _logger);
     bool parsed = parser->parseFromFile(onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO));
@@ -78,16 +97,7 @@ void LearningInterface::_build(std::string onnx_path) {
     delete plan;
 }

-bool LearningInterface::_save_engine(const std::string& onnx_path) {
-    std::string engine_path;
-    size_t dot_index = onnx_path.find_last_of(".");
-    if (dot_index != std::string::npos) {
-        engine_path = onnx_path.substr(0, dot_index) + ".engine";
-
-    } else {
-        return false;
-    }
-
+bool LearningInterface::_save_engine(const std::string& engine_path) {
     if (_engine) {
         nvinfer1::IHostMemory* data = _engine->serialize();
         std::ofstream file;
@@ -107,9 +117,7 @@ bool LearningInterface::_save_engine(const std::string& engine_path) {

 void LearningInterface::predict() {
     if (predict_mutex.try_lock()) {
-        ROS_INFO("INSIDE PREDICT");
-        // TODO: Use input, output size members to set memcpy size correctly
-        cudaMemcpyAsync(_buffers[0], _input_data, 0, cudaMemcpyHostToDevice, _stream);
+        cudaMemcpyAsync(_buffers[0], _input_data, _input_size_float, cudaMemcpyHostToDevice, _stream);

 #if NV_TENSORRT_MAJOR < 10
         _context->enqueueV2(_buffers, _stream, nullptr);
@@ -118,12 +126,9 @@ void LearningInterface::predict() {
 #endif
         cudaStreamSynchronize(_stream);

-        // TODO: Use input, output size members to set memcpy size correctly
-        cudaMemcpyAsync(_output_data, _buffers[1], 0, cudaMemcpyDeviceToHost);
+        cudaMemcpyAsync(_output_data, _buffers[1], _output_size_float, cudaMemcpyDeviceToHost, _stream);
         predict_mutex.unlock();

-    } else {
-        ROS_INFO("predict in progress, skipping");
     }
 }

@@ -131,4 +136,7 @@ LearningInterface::~LearningInterface() {
     cudaFree(_stream);
     cudaFree(_buffers[0]);
     cudaFree(_buffers[1]);
+
+    delete[] _input_data;
+    delete[] _output_data;
 }
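Note on predict() above: a cudaMemcpyAsync issued on a stream is only guaranteed complete after a subsequent synchronization on that stream. The patch synchronizes after enqueue but issues the device-to-host copy afterwards, so a second synchronization (or moving the single one after the copy) is needed before _output_data is safe to read. A minimal sketch of the intended sequence, assuming TensorRT 10's enqueueV3 with tensor addresses already bound (function and parameter names are illustrative):

    #include <cuda_runtime.h>
    #include <NvInfer.h>

    // Copy input in, run inference, copy output back, then block until done.
    void run_inference(nvinfer1::IExecutionContext* context, cudaStream_t stream,
                       void* buffers[2], const float* input, float* output,
                       size_t in_bytes, size_t out_bytes) {
        cudaMemcpyAsync(buffers[0], input, in_bytes, cudaMemcpyHostToDevice, stream);
        context->enqueueV3(stream);
        cudaMemcpyAsync(output, buffers[1], out_bytes, cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);  // output is valid only after this returns
    }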
diff --git a/src/usb_cam_node.cpp b/src/usb_cam_node.cpp
index 1776330b..99f600b5 100644
--- a/src/usb_cam_node.cpp
+++ b/src/usb_cam_node.cpp
@@ -124,11 +124,12 @@ class UsbCamNode {
         }

         // Setup the network that outputs derivatives of the image captured
-        m_node.param("depth_anything_v2_file", m_parameters.dav2_file, std::string(""));
-        m_node.param("depth_anything_v2_topic", m_parameters.dav2_topic, std::string(""));
+        m_node.param("dav2_file", m_parameters.dav2_file, std::string(""));
+        m_node.param("dav2_metric_topic", m_parameters.dav2_metric_topic, std::string(""));
+        m_node.param("dav2_vis_topic", m_parameters.dav2_vis_topic, std::string(""));

-        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_topic.empty()) {
-            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_topic));
+        if (!m_parameters.dav2_file.empty() && !m_parameters.dav2_metric_topic.empty()) {
+            networks.push_back(std::make_unique<DepthAnythingV2>(&m_node, m_parameters.dav2_file, m_parameters.dav2_metric_topic, m_parameters.dav2_vis_topic));
        }

         ROS_INFO("Starting '%s' (%s) at %dx%d via %s (%s) at %i FPS",
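For context, the networks vector is presumably drained once per captured frame. Only predict() and publish() appear in this patch, so the input hook in the fragment below is an assumption (names hypothetical):

    for (auto& net : networks) {
        net->set_input(frame);  // assumed hook that fills _input_data (cf. depth_anything_v2.hpp)
        net->predict();         // try_lock makes this a no-op while a prediction is in flight
        net->publish();         // publishes the metric and/or visualization topic if advertised
    }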