diff --git a/docker/perception/camera_object_detection/camera_object_detection.Dockerfile b/docker/perception/camera_object_detection/camera_object_detection.Dockerfile
index 5c3ee385..933bddb6 100644
--- a/docker/perception/camera_object_detection/camera_object_detection.Dockerfile
+++ b/docker/perception/camera_object_detection/camera_object_detection.Dockerfile
@@ -7,6 +7,7 @@ WORKDIR ${AMENT_WS}/src

 # Copy in source code
 COPY src/perception/camera_object_detection camera_object_detection
+COPY src/wato_msgs/perception_msgs/camera_object_detection_msgs camera_object_detection_msgs

 # Scan for rosdeps
 RUN apt-get -qq update && rosdep update && \
@@ -32,7 +33,7 @@ RUN rm requirements.txt
 # Install Rosdep requirements
 COPY --from=source /tmp/colcon_install_list /tmp/colcon_install_list
 RUN apt-get update && apt-fast install -qq -y --no-install-recommends $(cat /tmp/colcon_install_list)
-
+RUN apt-get install -qq -y --no-install-recommends ros-$ROS_DISTRO-tf-transformations
 # Copy in source code from source stage
 WORKDIR ${AMENT_WS}
 COPY --from=source ${AMENT_WS}/src src
diff --git a/modules/docker-compose.perception.yaml b/modules/docker-compose.perception.yaml
index 99dc9a49..ef633077 100644
--- a/modules/docker-compose.perception.yaml
+++ b/modules/docker-compose.perception.yaml
@@ -34,7 +34,10 @@ services:
       - /mnt/wato-drive2/perception_models/yolov8m.pt:/perception_models/yolov8m.pt
       - /mnt/wato-drive2/perception_models/traffic_light.pt:/perception_models/traffic_light.pt
       - /mnt/wato-drive2/perception_models/traffic_signs_v0.pt:/perception_models/traffic_signs_v1.pt
-
+      - /mnt/wato-drive2/perception_models/tensorRT.onnx:/perception_models/tensorRT.onnx
+      - /mnt/wato-drive2/perception_models/tensorRT.engine:/perception_models/tensorRT.engine
+      - /mnt/wato-drive2/perception_models/eve.onnx:/perception_models/eve.onnx
+
   lidar_object_detection:
     build:
       context: ..
diff --git a/src/perception/camera_object_detection/camera_object_detection/yolov8_detection.py b/src/perception/camera_object_detection/camera_object_detection/yolov8_detection.py
index 05ceb431..8bd0482b 100755
--- a/src/perception/camera_object_detection/camera_object_detection/yolov8_detection.py
+++ b/src/perception/camera_object_detection/camera_object_detection/yolov8_detection.py
@@ -1,7 +1,8 @@
 import rclpy
 from rclpy.node import Node
 import os
-
+from message_filters import Subscriber, TimeSynchronizer, ApproximateTimeSynchronizer
+from camera_object_detection_msgs.msg import BatchDetection, EveBatchDetection
 from sensor_msgs.msg import Image, CompressedImage
 from vision_msgs.msg import (
     ObjectHypothesisWithPose,
@@ -9,6 +10,9 @@
     Detection2DArray,
 )
+from std_msgs.msg import Header
+from pathlib import Path
+from ultralytics import YOLO
 from ultralytics.nn.autobackend import AutoBackend
 from ultralytics.data.augment import LetterBox, CenterCrop
 from ultralytics.utils.ops import non_max_suppression
@@ -22,6 +26,20 @@
 import time
 import torch
+import onnx
+import tensorrt as trt
+from typing import List, Optional, Tuple, Union
+from dataclasses import dataclass
+from numpy import ndarray
+from cuda import cudart
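+
+
+# Pairs one TensorRT I/O binding's host (cpu) buffer with its CUDA device (gpu) allocation.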
+@dataclass
+class Tensor:
+    name: str
+    dtype: np.dtype
+    shape: Tuple
+    cpu: ndarray
+    gpu: int


 class CameraDetectionNode(Node):

@@ -30,21 +48,73 @@ def __init__(self):
         torch.zeros(1).cuda()

         super().__init__("camera_object_detection_node")
-        self.get_logger().info("Creating camera detection node...")
+        self.get_logger().info("Creating batched camera detection node...")
+
+        # Eve config params
         self.declare_parameter("camera_topic", "/camera/right/image_color")
+        self.declare_parameter("left_camera_topic", "/camera/left/image_color")
+        self.declare_parameter("center_camera_topic", "/camera/center/image_color")
+
+        # nuScenes config params
+        self.declare_parameter("front_center_camera_topic", "/CAM_FRONT/image_rect_compressed")
+        self.declare_parameter("front_right_camera_topic", "/CAM_FRONT_RIGHT/image_rect_compressed")
+        self.declare_parameter("front_left_camera_topic", "/CAM_FRONT_LEFT/image_rect_compressed")
+
+        self.declare_parameter("back_center_camera_topic", "/CAM_BACK/image_rect_compressed")
+        self.declare_parameter("back_right_camera_topic", "/CAM_BACK_RIGHT/image_rect_compressed")
+        self.declare_parameter("back_left_camera_topic", "/CAM_BACK_LEFT/image_rect_compressed")
+
+        # TensorRT model paths
+        self.declare_parameter("onnx_model_path", "/perception_models/tensorRT.onnx")
+        self.declare_parameter("tensorRT_model_path", "/perception_models/tensorRT.engine")
+        self.declare_parameter("eve_tensorRT_model_path", "/perception_models/eve.engine")
         self.declare_parameter("publish_vis_topic", "/annotated_img")
+        self.declare_parameter("batch_publish_vis_topic", "/batch_annotated_img")
+        self.declare_parameter("batch_publish_detection_topic", "/batch_detections")
         self.declare_parameter("publish_detection_topic", "/detections")
         self.declare_parameter("model_path", "/perception_models/yolov8m.pt")
+        self.declare_parameter("image_size", 1024)
         self.declare_parameter("compressed", False)
         self.declare_parameter("crop_mode", "LetterBox")
         self.declare_parameter("save_detections", False)
-
+
+        # Batch topics
+        self.declare_parameter("batch_inference_topic", "/batched_camera_message")
+        self.declare_parameter("eve_batch_inference_topic", "/eve_batched_camera_message")
+
+        self.batch_inference_topic = self.get_parameter("batch_inference_topic").value
+        self.eve_batch_inference_topic = self.get_parameter("eve_batch_inference_topic").value
+        self.batch_publish_detection_topic = self.get_parameter("batch_publish_detection_topic").value
+        self.batch_publish_vis_topic = self.get_parameter("batch_publish_vis_topic").value
+
+        self.onnx_model_path = self.get_parameter("onnx_model_path").value
+        self.tensorRT_model_path = self.get_parameter("tensorRT_model_path").value
+        self.eve_tensorRT_model_path = self.get_parameter("eve_tensorRT_model_path").value
+
+        # Camera topics for Eve
         self.camera_topic = self.get_parameter("camera_topic").value
+        self.left_camera_topic = self.get_parameter("left_camera_topic").value
+        self.center_camera_topic = self.get_parameter("center_camera_topic").value
+
+        # Camera topics for nuScenes
+        self.front_center_camera_topic = self.get_parameter("front_center_camera_topic").value
+        self.front_right_camera_topic = self.get_parameter("front_right_camera_topic").value
+        self.front_left_camera_topic = self.get_parameter("front_left_camera_topic").value
+
+        self.back_center_camera_topic = self.get_parameter("back_center_camera_topic").value
+        self.back_right_camera_topic = self.get_parameter("back_right_camera_topic").value
+        self.back_left_camera_topic = self.get_parameter("back_left_camera_topic").value
+
         self.publish_vis_topic = self.get_parameter("publish_vis_topic").value
         self.publish_detection_topic = self.get_parameter("publish_detection_topic").value
         self.model_path = self.get_parameter("model_path").value
+        self.image_size = self.get_parameter("image_size").value
         self.compressed = self.get_parameter("compressed").value
         self.crop_mode = self.get_parameter("crop_mode").value
@@ -57,21 +127,33 @@
         self.line_thickness = 1
         self.half = False
-        self.subscription = self.create_subscription(
-            Image if not self.compressed else CompressedImage,
-            self.camera_topic,
-            self.image_callback,
-            qos_profile=QoSProfile(
-                reliability=QoSReliabilityPolicy.RELIABLE,
-                history=QoSHistoryPolicy.KEEP_LAST,
-                depth=10,
-            ),
-        )
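+
+        # message_filters.Subscriber + ApproximateTimeSynchronizer: the registered callback fires
+        # once per set of messages whose timestamps fall within `slop` seconds of each other.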
+        # Subscriptions for nuScenes (CompressedImage topics)
+        self.front_center_camera_subscription = Subscriber(self, CompressedImage, self.front_center_camera_topic)
+        self.front_right_camera_subscription = Subscriber(self, CompressedImage, self.front_right_camera_topic)
+        self.front_left_camera_subscription = Subscriber(self, CompressedImage, self.front_left_camera_topic)
+        self.back_center_camera_subscription = Subscriber(self, CompressedImage, self.back_center_camera_topic)
+        self.back_right_camera_subscription = Subscriber(self, CompressedImage, self.back_right_camera_topic)
+        self.back_left_camera_subscription = Subscriber(self, CompressedImage, self.back_left_camera_topic)
+        self.ats = ApproximateTimeSynchronizer(
+            [self.front_left_camera_subscription, self.front_center_camera_subscription, self.front_right_camera_subscription],
+            queue_size=10,
+            slop=0.1,
+        )
+        self.ats.registerCallback(self.batch_inference_callback)
+
+        # Subscriptions for Eve cameras (raw Image topics)
+        self.right_camera_subscription = Subscriber(self, Image, self.camera_topic)
+        self.left_camera_subscription = Subscriber(self, Image, self.left_camera_topic)
+        self.center_camera_subscription = Subscriber(self, Image, self.center_camera_topic)
+        self.eve_ats = ApproximateTimeSynchronizer(
+            [self.right_camera_subscription, self.left_camera_subscription, self.center_camera_subscription],
+            queue_size=10,
+            slop=0.1,
+        )
+        self.eve_ats.registerCallback(self.eve_batch_inference_callback)
+        self.get_logger().info(f"TensorRT version: {trt.__version__}")

         self.orig_image_width = None
         self.orig_image_height = None

-        # set device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if torch.cuda.is_available():
             self.get_logger().info("Using GPU for inference")
@@ -80,228 +162,441 @@
         # CV bridge
         self.cv_bridge = CvBridge()
-
-        # load yolov8 model
-        self.model = AutoBackend(self.model_path, device=self.device, dnn=False, fp16=False)
-
-        self.names = self.model.module.names if hasattr(self.model, "module") else self.model.names
-
-        self.stride = int(self.model.stride)
-
-        # setup vis publishers
+
+        status, self.stream = cudart.cudaStreamCreate()
+        assert status.value == 0, "Failed to create CUDA stream"
+
+        # self.build_engine()  # only needed when the .engine file has not been generated yet
+        self.last_publish_time = self.get_clock().now()
+
+        # Batched-output publishers
+        self.batched_camera_message_publisher = self.create_publisher(BatchDetection, self.batch_inference_topic, 10)
+        self.eve_batched_camera_message_publisher = self.create_publisher(EveBatchDetection, self.eve_batch_inference_topic, 10)
+        self.num_cameras = 3  # adjust this based on the number of cameras
+
+        # nuScenes publishers
+        self.nuscenes_camera_names = ["/CAM_FRONT_LEFT", "/CAM_FRONT", "/CAM_FRONT_RIGHT"]
+        self.batch_vis_publishers = [
+            self.create_publisher(CompressedImage, f"{nuscenes}/viz", 10)
+            for nuscenes in self.nuscenes_camera_names
+        ]
+        self.batch_detection_publishers = [
+            self.create_publisher(Detection2DArray, f"{nuscenes}/detections", 10)
+            for nuscenes in self.nuscenes_camera_names
+        ]
+
+        # Eve publishers
+        self.eve_camera_names = [self.left_camera_topic, self.center_camera_topic, self.camera_topic]
+        self.eve_batch_vis_publishers = [
+            self.create_publisher(Image, f"{eve}/eve_batch_viz", 10)
+            for eve in self.eve_camera_names
+        ]
+        self.eve_batch_detection_publishers = [
+            self.create_publisher(Detection2DArray, f"{eve}/eve_batch_detections", 10)
+            for eve in self.eve_camera_names
+        ]
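+
+        # Publishers retained from the original single-camera pipeline; the batched callbacks
+        # publish to the per-camera topics created above instead.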
+
+        # Single-camera vis publishers
         self.vis_publisher = self.create_publisher(Image, self.publish_vis_topic, 10)
         self.detection_publisher = self.create_publisher(
             Detection2DArray, self.publish_detection_topic, 10
         )
-
-        self.get_logger().info(
+        self.get_logger().info(
             f"Successfully created node listening on camera topic: {self.camera_topic}..."
         )
-
-    def crop_image(self, cv_image):
-        if self.crop_mode == "LetterBox":
-            img = LetterBox(self.image_size, stride=self.stride)(image=cv_image)
-        elif self.crop_mode == "CenterCrop":
-            img = CenterCrop(self.image_size)(cv_image)
-        else:
-            raise Exception("Invalid crop mode, please choose either 'LetterBox' or 'CenterCrop'!")
-
-        return img
-
-    def convert_bboxes_to_orig_frame(self, bbox):
+
+    def build_engine(self):
+        # Only called when no serialized engine file exists yet: parses the ONNX model and
+        # writes the serialized TensorRT engine to self.tensorRT_model_path.
+        self.max_batch_size = 6
+        self.channels = 3
+        self.height = 640
+        self.width = 640
+        self.shape_input_model = [self.max_batch_size, self.channels, self.height, self.width]
+        self.logger = trt.Logger(trt.Logger.VERBOSE)
+        self.builder = trt.Builder(self.logger)
+        self.config = self.builder.create_builder_config()
+        self.cache = self.config.create_timing_cache(b"")
+        self.config.set_timing_cache(self.cache, ignore_mismatch=False)
+        self.total_memory = torch.cuda.get_device_properties(self.device).total_memory
+        self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, self.total_memory)
+        self.flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+        self.network = self.builder.create_network(self.flag)
+        self.parser = trt.OnnxParser(self.network, self.logger)
+        with open(self.onnx_model_path, "rb") as f:
+            if not self.parser.parse(f.read()):
+                self.get_logger().error("Cannot parse ONNX file")
+                for error in range(self.parser.num_errors):
+                    self.get_logger().error(f"{self.parser.get_error(error)}")
+        self.inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
+        self.outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
+        for input in self.inputs:
+            self.get_logger().info(f"Model {input.name} shape: {input.shape} {input.dtype}")
+        for output in self.outputs:
+            self.get_logger().info(f"Model {output.name} shape: {output.shape} {output.dtype}")
+        if self.max_batch_size > 1:
+            self.profile = self.builder.create_optimization_profile()
+            self.min_shape = [1, 3, 640, 640]
+            self.opt_shape = [3, 3, 640, 640]
+            self.max_shape = [6, 3, 640, 640]
+            for input in self.inputs:
+                self.profile.set_shape(input.name, self.min_shape, self.opt_shape, self.max_shape)
+            self.config.add_optimization_profile(self.profile)
+        self.half = True
+        self.int8 = False
+        if self.half:
+            self.config.set_flag(trt.BuilderFlag.FP16)
+        elif self.int8:
+            self.config.set_flag(trt.BuilderFlag.INT8)
+        self.engine_bytes = self.builder.build_serialized_network(self.network, self.config)
+        assert self.engine_bytes is not None, "Failed to build the TensorRT engine"
+        self.get_logger().info("Built the TensorRT engine")
+        with open(self.tensorRT_model_path, "wb") as f:
+            f.write(self.engine_bytes)
+        self.get_logger().info("Finished writing the engine to disk")
+
+    def initialize_engine(self, weight, batch_size, rgb, width, height):
         """
-        Converts bounding box coordinates from the scaled image frame back to the original image frame.
-
-        This function takes into account the original image dimensions and the scaling method used
-        (either "LetterBox" or "CenterCrop") to accurately map the bounding box coordinates back to
-        their original positions in the original image.
-
-        Parameters:
-            bbox (list): A list containing the bounding box coordinates in the format [x1, y1, w1, h1]
-                        in the scaled image frame.
-
-        Returns:
-            list: A list containing the bounding box coordinates in the format [x1, y1, w1, h1]
-                in the original image frame.
-
+        Deserializes the TensorRT engine file and creates an execution context.
+
+        Takes the engine file path, batch size, number of channels, width and height, sets the
+        input shape on the execution context, and records the engine's input and output names.
         """
-        width_scale = self.orig_image_width / self.image_size
-        height_scale = self.orig_image_height / self.image_size
-        if self.crop_mode == "LetterBox":
-            translation = (self.image_size - self.orig_image_height / width_scale) / 2
-            return [
-                bbox[0] * width_scale,
-                (bbox[1] - translation) * width_scale,
-                bbox[2] * width_scale,
-                bbox[3] * width_scale,
-            ]
-        elif self.crop_mode == "CenterCrop":
-            translation = (self.orig_image_width / height_scale - self.image_size) / 2
-            return [
-                (bbox[0] + translation) * height_scale,
-                bbox[1] * height_scale,
-                bbox[2] * height_scale,
-                bbox[3] * height_scale,
-            ]
-
-    def crop_and_convert_to_tensor(self, cv_image):
+        self.weight = Path(weight) if isinstance(weight, str) else weight
+        self.logger = trt.Logger(trt.Logger.WARNING)
+        trt.init_libnvinfer_plugins(self.logger, namespace="")
+        with trt.Runtime(self.logger) as runtime:
+            self.tensorRT_model = runtime.deserialize_cuda_engine(self.weight.read_bytes())
+        self.execution_context = self.tensorRT_model.create_execution_context()
+
+        self.num_io_tensors = self.tensorRT_model.num_io_tensors
+        self.input_tensor_name = self.tensorRT_model.get_tensor_name(0)
+        self.execution_context.set_input_shape(self.input_tensor_name, (batch_size, rgb, width, height))
+        self.inputShape = self.execution_context.get_tensor_shape(self.input_tensor_name)
+        self.names = [self.tensorRT_model.get_tensor_name(i) for i in range(self.num_io_tensors)]
+        self.bindings = [0] * self.num_io_tensors
+        self.num_inputs = 0
+        self.num_outputs = 0
+        self.input_names = []
+        self.output_names = []
+        for i in range(self.num_io_tensors):
+            self.tensor_name = self.tensorRT_model.get_tensor_name(i)
+            if self.tensorRT_model.get_tensor_mode(self.tensor_name) == trt.TensorIOMode.INPUT:
+                self.input_names.append(self.tensor_name)
+                self.num_inputs += 1
+            elif self.tensorRT_model.get_tensor_mode(self.tensor_name) == trt.TensorIOMode.OUTPUT:
+                self.output_names.append(self.tensor_name)
+                self.num_outputs += 1
+        self.num_inputs = len(self.input_names)
+        self.num_outputs = len(self.output_names)
+        self.input_names = self.names[:self.num_inputs]
+        self.output_names = [name for name in self.output_names if name not in self.input_names]  # drop any input names
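+
+    # initialize_tensors() allocates one host/device buffer pair (a Tensor record) per engine
+    # binding; tensorRT_inferencing() reuses those buffers for the actual inference.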
+    def initialize_tensors(self):
         """
-        Preprocess the image by resizing, padding and rearranging the dimensions.
-
-        Parameters:
-            cv_image: A numpy or cv2 image of shape (w,h,3)
-
-        Returns:
-            torch.Tensor image for model input of shape (1,3,w,h)
+        Allocates host and device buffers for every engine input and output.
+
+        Returns:
+            (input_info, output_info): lists of Tensor records holding each binding's name,
+            dtype, shape, host array and device pointer.
         """
-        img = self.crop_image(cv_image)
-
-        # Convert
-        img = img.transpose(2, 0, 1)
-
-        # Further conversion
-        img = torch.from_numpy(img).to(self.device)
-        img = img.half() if self.half else img.float()  # uint8 to fp16/32
-        img /= 255.0  # 0 - 255 to 0.0 - 1.0
-        img = img.unsqueeze(0)
+        self.dynamic = True
+        self.input_info = []
+        self.output_info = []
+        self.output_ptrs = []
+
+        # Initialize output tensors
+        for name in self.output_names:
+            self.tensorRT_output_shape = self.execution_context.get_tensor_shape(name)
+            self.outputDtype = trt.nptype(self.tensorRT_model.get_tensor_dtype(name))
+            self.output_cpu = np.empty(self.tensorRT_output_shape, dtype=self.outputDtype)
+            status, self.output_gpu = cudart.cudaMallocAsync(self.output_cpu.nbytes, self.stream)
+            assert status.value == 0, "Failed to allocate output buffer on the GPU"
+            cudart.cudaMemcpyAsync(self.output_gpu, self.output_cpu.ctypes.data, self.output_cpu.nbytes,
+                                   cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
+            self.output_ptrs.append(self.output_gpu)
+            self.output_info.append(Tensor(name, self.outputDtype, self.tensorRT_output_shape, self.output_cpu, self.output_gpu))
+
+        # Initialize input tensors
+        for i, name in enumerate(self.input_names):
+            if self.tensorRT_model.get_tensor_name(i) == name:
+                self.tensorRT_input_shape = tuple(self.inputShape)
+                self.dtype = trt.nptype(self.tensorRT_model.get_tensor_dtype(name))
+                self.input_cpu = np.empty(self.tensorRT_input_shape, self.dtype)
+                status, self.input_gpu = cudart.cudaMallocAsync(self.input_cpu.nbytes, self.stream)
+                assert status.value == 0, "Failed to allocate input buffer on the GPU"
+                cudart.cudaMemcpyAsync(
+                    self.input_gpu, self.input_cpu.ctypes.data, self.input_cpu.nbytes,
+                    cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
+                self.input_info.append(Tensor(name, self.dtype, self.tensorRT_input_shape, self.input_cpu, self.input_gpu))
+        return self.input_info, self.output_info
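+
+    # Inference flow: copy the batched host array to the GPU, bind input/output addresses,
+    # run execute_async_v3 on the CUDA stream, then copy the outputs back to host memory.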
+    def tensorRT_inferencing(self, batch_array):
+        """
+        Runs the TensorRT engine on a preprocessed image batch.
+
+        Returns a contiguous array per engine output (for this model, shape (batch, 84, 8400)).
+        """
+        assert batch_array.ndim == 4
+        batch_size = batch_array.shape[0]
+        assert batch_size == self.input_info[0].shape[0]
+
+        # Copy the batch into the pre-allocated input buffers and bind their device addresses
+        self.contiguous_inputs = [np.ascontiguousarray(batch_array)]
+        for i in range(self.num_inputs):
+            name = self.input_info[i].name
+            cudart.cudaMemcpyAsync(
+                self.input_info[i].gpu,
+                self.contiguous_inputs[i].ctypes.data,
+                self.contiguous_inputs[i].nbytes,
+                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
+                self.stream
+            )
+            self.execution_context.set_tensor_address(name, self.input_info[i].gpu)
+
+        self.output_gpu_ptrs = []
+        self.outputs_ndarray = []
+        for i in range(self.num_outputs):
+            output_shape = self.execution_context.get_tensor_shape(self.output_info[i].name)
+            # Reallocate the host output buffer if the shape changed
+            if self.output_info[i].cpu.shape != tuple(output_shape):
+                self.output_info[i].cpu = np.empty(output_shape, dtype=self.output_info[i].cpu.dtype)
+            self.outputs_ndarray.append(self.output_info[i].cpu)
+            self.output_gpu_ptrs.append(self.output_info[i].gpu)
+            self.execution_context.set_tensor_address(
+                self.output_info[i].name,
+                self.output_info[i].gpu
+            )
+
+        # Execute inference
+        status = self.execution_context.execute_async_v3(self.stream)
+        assert status, "Inference execution failed"
+
+        # Synchronize and copy results back to host memory
+        cudart.cudaStreamSynchronize(self.stream)
+        for i, gpu_ptr in enumerate(self.output_gpu_ptrs):
+            cudart.cudaMemcpyAsync(
+                self.outputs_ndarray[i].ctypes.data,
+                gpu_ptr,
+                self.outputs_ndarray[i].nbytes,
+                cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                self.stream
+            )
+        cudart.cudaStreamSynchronize(self.stream)
+
+        return tuple(self.outputs_ndarray) if len(self.outputs_ndarray) > 1 else self.outputs_ndarray[0]
-        return img
-
-    def postprocess_detections(self, detections, annotator):
+    # Called with the nuScenes rosbag cameras
+    def batch_inference_callback(self, msg1, msg2, msg3):
         """
-        Post-process draws bouningboxes on camera image.
-
-        Parameters:
-            detections: A list of dict with the format
-                {
-                    "label": str,
-                    "bbox": [float],
-                    "conf": float
-                }
-            annotator: A ultralytics.yolo.utils.plotting.Annotator for the current image
-
-        Returns:
-            processed_detections: filtered detections
-            annotator_img: image with bounding boxes drawn on
+        Preprocesses and batches the synchronized images, runs TensorRT inference,
+        parses the detections and publishes the visualized results.
         """
-        processed_detections = detections
-
-        for det in detections:
-            label = f'{det["label"]} {det["conf"]:.2f}'
-            x1, y1, w1, h1 = det["bbox"]
-            xyxy = [x1, y1, x1 + w1, y1 + h1]
-            annotator.box_label(xyxy, label, color=colors(1, True))
-
-        annotator_img = annotator.result()
-        return (processed_detections, annotator_img)
-
-    def publish_vis(self, annotated_img, msg, feed):
-        # Publish visualizations
-        imgmsg = self.cv_bridge.cv2_to_imgmsg(annotated_img, "bgr8")
-        imgmsg.header.stamp = msg.header.stamp
-        imgmsg.header.frame_id = msg.header.frame_id
-        self.vis_publisher.publish(imgmsg)
-
-    def publish_detections(self, detections, msg, feed):
-        # Publish detections to an detectionList message
-        detection2darray = Detection2DArray()
-
-        # fill header for detection list
-        detection2darray.header.stamp = msg.header.stamp
-        detection2darray.header.frame_id = msg.header.frame_id
-
-        # populate detection list
-        if detections is not None and len(detections):
-            for detection in detections:
-                detection2d = Detection2D()
-                detection2d.header.stamp = msg.header.stamp
-                detection2d.header.frame_id = msg.header.frame_id
-                detected_object = ObjectHypothesisWithPose()
-                detected_object.hypothesis.class_id = detection["label"]
-                detected_object.hypothesis.score = detection["conf"]
-                detection2d.results.append(detected_object)
-                detection2d.bbox.center.position.x = detection["bbox"][0]
-                detection2d.bbox.center.position.y = detection["bbox"][1]
-                detection2d.bbox.size_x = detection["bbox"][2]
-                detection2d.bbox.size_y = detection["bbox"][3]
-
-                # append detection to detection list
-                detection2darray.detections.append(detection2d)
-
-        self.detection_publisher.publish(detection2darray)
-
-    def image_callback(self, msg):
-        self.get_logger().debug("Received image")
-        if self.orig_image_width is None:
-            self.orig_image_width = msg.width
-            self.orig_image_height = msg.height
-
-        images = [msg]  # msg is a single sensor image
-        startTime = time.time()
-        for image in images:
-
-            # convert ros Image to cv::Mat
+        image_list = [msg1, msg2, msg3]
+        batched_list = []
+
+        # Preprocessing: decode, resize to the 640x640 network input, convert to CHW float32
+        for msg in image_list:
+            numpy_array = np.frombuffer(msg.data, np.uint8)
+            compressedImage = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
+            original_height, original_width = compressedImage.shape[:2]
+            resized_compressedImage = cv2.resize(compressedImage, (640, 640))
+            rgb_image = cv2.cvtColor(resized_compressedImage, cv2.COLOR_BGR2RGB)
+            normalized_image = rgb_image / 255.0
+            chw_image = np.transpose(normalized_image, (2, 0, 1))
+            float_image = chw_image.astype(np.float32)
+            batched_list.append(float_image)
+        batched_images = np.stack(batched_list, axis=0)
+        batch = batched_images.shape[0]
+
+        self.initialize_engine(self.tensorRT_model_path, batch, 3, 640, 640)
+        self.input_info, self.output_info = self.initialize_tensors()
+        detections = self.tensorRT_inferencing(batched_images)
+        decoded_results = self.parse_detections(detections)
+        self.publish_batch_nuscenes(image_list, decoded_results)
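+
+    # YOLOv8's raw output has shape (batch, 84, 8400): 4 box values (cx, cy, w, h) followed by
+    # 80 class scores for each of the 8400 anchors; parse_detections() converts these to
+    # xyxy boxes and filters them with non-max suppression.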
+    def parse_detections(self, detections):
+        detection_tensor = detections[0]
+        batch_size = detection_tensor.shape[0]
+        num_anchors = detection_tensor.shape[2]
+        decoded_results = []
+        for batch_idx in range(batch_size):
+            image_detections = detection_tensor[batch_idx]
+            valid_detections = []
+            for anchor_idx in range(num_anchors):
+                detection = image_detections[:, anchor_idx]
+                x_center, y_center, width, height = detection[:4]
+                class_probs = detection[4:]
+                predicted_class = np.argmax(class_probs)
+                confidence = class_probs[predicted_class]
+                x_min = x_center - width / 2
+                y_min = y_center - height / 2
+                x_max = x_center + width / 2
+                y_max = y_center + height / 2
+                valid_detections.append([
+                    x_min, y_min, x_max, y_max, confidence, predicted_class
+                ])
+            valid_detections = torch.tensor(valid_detections)
+            nms_results = non_max_suppression(
+                valid_detections.unsqueeze(0),  # add a batch dimension for NMS
+                conf_thres=0.5,  # confidence threshold
+                iou_thres=0.45  # IoU threshold
+            )
+            final_detections = []
+            for det in nms_results[0]:  # loop through detections in this image
+                x_min, y_min, x_max, y_max, confidence, predicted_class = det.cpu().numpy()
+                final_detections.append({
+                    "bbox": [x_min, y_min, x_max, y_max],
+                    "confidence": confidence,
+                    "class": int(predicted_class),
+                })
+            decoded_results.append(final_detections)
+        return decoded_results
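+
+    # Boxes from parse_detections() are in the 640x640 network frame; they are rescaled below to
+    # each camera's original resolution before annotation and publishing.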
+    def publish_batch_nuscenes(self, image_list, decoded_results):
+        batch_msg = BatchDetection()
+        batch_msg.header.stamp = self.get_clock().now().to_msg()
+        batch_msg.header.frame_id = "batch"
+
+        for idx, img_msg in enumerate(image_list):
+            numpy_image = np.frombuffer(img_msg.data, np.uint8)
+            image = cv2.imdecode(numpy_image, cv2.IMREAD_COLOR)
+            height, width = image.shape[:2]
+            annotator = Annotator(image, line_width=2, example="Class:0")
+            detection_array = Detection2DArray()
+            detection_array.header.stamp = batch_msg.header.stamp
+            detection_array.header.frame_id = f"camera_{idx}"
+            batch_detections = decoded_results[idx]
+            for detection in batch_detections:
+                # Extract values from the dictionary and rescale boxes to the original resolution
+                x_min, y_min, x_max, y_max = detection["bbox"]
+                confidence = detection["confidence"]
+                predicted_class = detection["class"]
+                x_min = int(x_min * width / 640)
+                x_max = int(x_max * width / 640)
+                y_min = int(y_min * height / 640)
+                y_max = int(y_max * height / 640)
+                label = f"Class: {predicted_class}, Conf: {confidence:.2f}"
+                annotator.box_label((x_min, y_min, x_max, y_max), label, color=(0, 255, 0))
+
+                detection2d = Detection2D()
+                detection2d.bbox.center.position.x = (x_min + x_max) / 2
+                detection2d.bbox.center.position.y = (y_min + y_max) / 2
+                detection2d.bbox.size_x = float(x_max - x_min)
+                detection2d.bbox.size_y = float(y_max - y_min)
+
+                detected_object = ObjectHypothesisWithPose()
+                detected_object.hypothesis.class_id = str(int(predicted_class))
+                detected_object.hypothesis.score = float(confidence)
+                detection2d.results.append(detected_object)
+                detection_array.detections.append(detection2d)
+            batch_msg.detections.append(detection_array)
+
+            # Publish the annotated image as a CompressedImage
+            annotated_image = annotator.result()
+            vis_compressed_image = CompressedImage()
+            vis_compressed_image.header.stamp = self.get_clock().now().to_msg()
+            vis_compressed_image.header.frame_id = f"camera_{idx}"
+            vis_compressed_image.format = "jpeg"
+            vis_compressed_image.data = cv2.imencode('.jpg', annotated_image, [cv2.IMWRITE_JPEG_QUALITY, 70])[1].tobytes()
+            self.batch_vis_publishers[idx].publish(vis_compressed_image)
+
+            # Publish the per-camera Detection2DArray
+            self.batch_detection_publishers[idx].publish(detection_array)
+
+        # Publish the batch detection message
+        self.batched_camera_message_publisher.publish(batch_msg)
+
+    def eve_batch_inference_callback(self, msg1, msg2, msg3):
+        image_list = [msg1, msg2, msg3]
+        batched_list = []
+        for msg in image_list:
             if self.compressed:
-                np_arr = np.frombuffer(msg.data, np.uint8)
-                cv_image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+                numpy_array = np.frombuffer(msg.data, np.uint8)
+                cv_image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
+                original_height, original_width = cv_image.shape[:2]
             else:
-                try:
-                    cv_image = self.cv_bridge.imgmsg_to_cv2(image, desired_encoding="passthrough")
-                except CvBridgeError as e:
-                    self.get_logger().error(str(e))
-                    return
-
-            # preprocess image and run through prediction
-            img = self.crop_and_convert_to_tensor(cv_image)
-            pred = self.model(img)
-
-            # nms function used same as yolov8 detect.py
-            pred = non_max_suppression(pred)
-            detections = []
-            for i, det in enumerate(pred):  # per image
-                if len(det):
-                    # Write results
-                    for *xyxy, conf, cls in reversed(det):
-                        label = self.names[int(cls)]
-
-                        bbox = [
-                            xyxy[0],
-                            xyxy[1],
-                            xyxy[2] - xyxy[0],
-                            xyxy[3] - xyxy[1],
-                        ]
-                        bbox = [b.item() for b in bbox]
-                        bbox = self.convert_bboxes_to_orig_frame(bbox)
-
-                        detections.append(
-                            {
-                                "label": label,
-                                "conf": conf.item(),
-                                "bbox": bbox,
-                            }
-                        )
-                        self.get_logger().debug(f"{label}: {bbox}")
-
-            annotator = Annotator(
-                cv_image,
-                line_width=self.line_thickness,
-                example=str(self.names),
-            )
-            (detections, annotated_img) = self.postprocess_detections(detections, annotator)
-
-            # Currently we support a single camera so we pass an empty string
-            feed = ""
-            self.publish_vis(annotated_img, msg, feed)
-            self.publish_detections(detections, msg, feed)
-
-            if self.save_detections:
-                cv2.imwrite(f"detections/{self.counter}.jpg", annotated_img)
-                self.counter += 1
-
-        self.get_logger().info(
-            f"Finished in: {time.time() - startTime}, {1/(time.time() - startTime)} Hz"
-        )
+                cv_image = self.cv_bridge.imgmsg_to_cv2(msg, desired_encoding="passthrough")
+            resized_image = cv2.resize(cv_image, (640, 640))  # can also be 1024x1024
+            rgb_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)
+            normalized_image = rgb_image / 255.0
+            chw_image = np.transpose(normalized_image, (2, 0, 1))
+            float_image = chw_image.astype(np.float32)
+            batched_list.append(float_image)
+        batched_images = np.stack(batched_list, axis=0)
+        self.get_logger().info(f"Batched image shape: {batched_images.shape}")
+
+        self.initialize_engine(self.tensorRT_model_path, 3, 3, 640, 640)
+        self.input_info, self.output_info = self.initialize_tensors()
+        detections = self.tensorRT_inferencing(batched_images)
+        decoded_results = self.parse_detections(detections)
+        self.publish_batch_eve(image_list, decoded_results)
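+
+    # Eve counterpart of publish_batch_nuscenes(): annotates the raw Image frames and publishes
+    # through the eve_* publishers rather than the nuScenes CompressedImage publishers.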
+    def publish_batch_eve(self, image_list, decoded_results):
+        batch_msg = EveBatchDetection()
+        batch_msg.header.stamp = self.get_clock().now().to_msg()
+        batch_msg.header.frame_id = "batch"
+
+        for idx, img_msg in enumerate(image_list):
+            if self.compressed:
+                numpy_array = np.frombuffer(img_msg.data, np.uint8)
+                image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
+            else:
+                image = self.cv_bridge.imgmsg_to_cv2(img_msg, desired_encoding="passthrough")
+            height, width = image.shape[:2]
+            annotator = Annotator(image, line_width=2, example="Class:0")
+            detection_array = Detection2DArray()
+            detection_array.header.stamp = batch_msg.header.stamp
+            detection_array.header.frame_id = f"camera_{idx}"
+            batch_detections = decoded_results[idx]
+            for detection in batch_detections:
+                # Extract values from the dictionary and rescale boxes to the original resolution
+                x_min, y_min, x_max, y_max = detection["bbox"]
+                confidence = detection["confidence"]
+                predicted_class = detection["class"]
+                x_min = int(x_min * width / 640)
+                x_max = int(x_max * width / 640)
+                y_min = int(y_min * height / 640)
+                y_max = int(y_max * height / 640)
+                label = f"Class: {predicted_class}, Conf: {confidence:.2f}"
+                annotator.box_label((x_min, y_min, x_max, y_max), label, color=(0, 255, 0))
+
+                detection2d = Detection2D()
+                detection2d.bbox.center.position.x = (x_min + x_max) / 2
+                detection2d.bbox.center.position.y = (y_min + y_max) / 2
+                detection2d.bbox.size_x = float(x_max - x_min)
+                detection2d.bbox.size_y = float(y_max - y_min)
+
+                detected_object = ObjectHypothesisWithPose()
+                detected_object.hypothesis.class_id = str(int(predicted_class))
+                detected_object.hypothesis.score = float(confidence)
+                detection2d.results.append(detected_object)
+                detection_array.detections.append(detection2d)
+            batch_msg.detections.append(detection_array)
+
+            # Publish the annotated image as a raw Image
+            annotated_image = annotator.result()
+            vis_image = self.cv_bridge.cv2_to_imgmsg(annotated_image, encoding="bgr8")
+            vis_image.header.stamp = self.get_clock().now().to_msg()
+            vis_image.header.frame_id = f"camera_{idx}"
+            self.eve_batch_vis_publishers[idx].publish(vis_image)
+
+            # Publish the per-camera Detection2DArray
+            self.eve_batch_detection_publishers[idx].publish(detection_array)
+
+        # Publish the batch detection message
+        self.eve_batched_camera_message_publisher.publish(batch_msg)

 def main(args=None):
     rclpy.init(args=args)
diff --git a/src/perception/camera_object_detection/package.xml b/src/perception/camera_object_detection/package.xml
index a8b24405..fc078392 100755
--- a/src/perception/camera_object_detection/package.xml
+++ b/src/perception/camera_object_detection/package.xml
@@ -9,10 +9,13 @@
   <license>TODO: License declaration</license>

   <depend>cv_bridge</depend>
+  <depend>rclpy</depend>
   <depend>sensor_msgs</depend>
   <depend>vision_msgs</depend>
   <depend>std_msgs</depend>
-
+  <depend>camera_object_detection_msgs</depend>
+  <test_depend>python3-pytest</test_depend>
+
   <test_depend>ament_copyright</test_depend>
   <test_depend>ament_flake8</test_depend>
   <test_depend>ament_pep257</test_depend>
diff --git a/src/perception/camera_object_detection/requirements.txt b/src/perception/camera_object_detection/requirements.txt
index 0807da67..877493f7 100644
--- a/src/perception/camera_object_detection/requirements.txt
+++ b/src/perception/camera_object_detection/requirements.txt
@@ -4,4 +4,11 @@ setuptools==65.5.1
 easydict
 huggingface
 huggingface_hub
-numpy==1.24.2
\ No newline at end of file
+numpy==1.24.2
+onnx
+onnxsim
+onnxruntime-gpu
+tensorrt==10.6.0
+tensorrt_lean
+tensorrt_dispatch
+cuda-python
\ No newline at end of file
diff --git a/src/wato_msgs/perception_msgs/camera_object_detection_msgs/CMakeLists.txt b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/CMakeLists.txt
new file mode 100644
index 00000000..7a9097e0
--- /dev/null
+++ b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.8)
+project(camera_object_detection_msgs)
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+find_package(ament_cmake REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(vision_msgs REQUIRED)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+  "msg/BatchDetection.msg"
+  "msg/EveBatchDetection.msg"
+  DEPENDENCIES std_msgs sensor_msgs vision_msgs
+)
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
\ No newline at end of file
diff --git a/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/BatchDetection.msg b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/BatchDetection.msg
new file mode 100644
index 00000000..e8ecdf73
--- /dev/null
+++ b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/BatchDetection.msg
@@ -0,0 +1,3 @@
+std_msgs/Header header
+sensor_msgs/CompressedImage[] images         # Array of compressed images in the batch
+vision_msgs/Detection2DArray[] detections    # Array of detections for each image
diff --git a/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/EveBatchDetection.msg b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/EveBatchDetection.msg
new file mode 100644
index 00000000..4d8f55e3
--- /dev/null
+++ b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/msg/EveBatchDetection.msg
@@ -0,0 +1,3 @@
+std_msgs/Header header
+sensor_msgs/Image[] images                   # Array of raw images in the batch
+vision_msgs/Detection2DArray[] detections    # Array of detections for each image
diff --git a/src/wato_msgs/perception_msgs/camera_object_detection_msgs/package.xml b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/package.xml
new file mode 100644
index 00000000..62cae926
--- /dev/null
+++ b/src/wato_msgs/perception_msgs/camera_object_detection_msgs/package.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>camera_object_detection_msgs</name>
+  <version>0.0.0</version>
+  <description>TODO: Package description</description>
+  <maintainer>Arfan12630</maintainer>
+  <license>TODO: License declaration</license>
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <depend>std_msgs</depend>
+  <depend>vision_msgs</depend>
+  <depend>geometry_msgs</depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
\ No newline at end of file
diff --git a/watod-config.sh b/watod-config.sh
index 3255dd40..96033167 100755
--- a/watod-config.sh
+++ b/watod-config.sh
@@ -15,7 +15,7 @@
 ##   - simulation   : starts simulation
 ##   - samples      : starts sample ROS2 pubsub nodes

-# ACTIVE_MODULES=""
+ACTIVE_MODULES="perception infrastructure"

 ################################# MODE OF OPERATION #################################
 ## Possible modes of operation when running watod.