
Commit

Add TensorRT10 support for YOLOv8
mpj1234 committed Jul 16, 2024
1 parent 040d053 commit f6fff4d
Showing 36 changed files with 7,684 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -30,3 +30,9 @@
*.exe
*.out
*.app

build/
models/
cmake-build-debug/
cmake-build-release/
.idea/
103 changes: 103 additions & 0 deletions CMakeLists-win.txt
@@ -0,0 +1,103 @@
cmake_minimum_required(VERSION 3.28)
project(yolov8_trtx_v10)

set(CMAKE_CXX_STANDARD 11)
# Compile .cu files with UTF-8 encoding under nvcc
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /utf-8")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")

enable_language(CUDA)

# Target multiple CUDA architectures
set(CMAKE_CUDA_ARCHITECTURES 75 86 89)
message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")

# OpenCV
set(OpenCV_DIR E:\\Opencv\\install\\opencv-4.8.0\\build)
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
link_directories(${OpenCV_LIB_DIR})

# CUDA
set(CUDA_TOOLKIT_ROOT_DIR C:\\Program\ Files\\NVIDIA\ GPU\ Computing\ Toolkit\\CUDA\\v11.8)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)

# TensorRT
#set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-8.6.1.6)
set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-10.2.0.19)
include_directories(${TENSORRT_ROOT}/include)
link_directories(${TENSORRT_ROOT}/lib)

# Detect the TensorRT major version from the headers under TENSORRT_ROOT
# Locate the version header
file(GLOB TENSORRT_VERSION_FILES "${TENSORRT_ROOT}/include/NvInferVersion.h")
# Read the version header
file(STRINGS ${TENSORRT_VERSION_FILES} TENSORRT_VERSION_LINES
LIMIT_COUNT 1 # read only the first matching line
REGEX "#define NV_TENSORRT_MAJOR [0-9]+" # match the major version define
)
message(STATUS " TENSORRT_VERSION_LINES: ${TENSORRT_VERSION_LINES}")
# Parse the major version number
string(REGEX REPLACE "#define NV_TENSORRT_MAJOR ([0-9]+)" "\\1" TENSORRT_VERSION_MAJOR ${TENSORRT_VERSION_LINES})
message(STATUS " TENSORRT_VERSION_MAJOR: ${TENSORRT_VERSION_MAJOR}")
# Check whether the major version is >= 10
if (TENSORRT_VERSION_MAJOR GREATER_EQUAL 10)
message(STATUS " TensorRT version is greater than or equal to 10.")
link_libraries(
opencv_core
opencv_highgui
opencv_imgproc
opencv_imgcodecs
cudart
cublas
nvinfer_10
)
else ()
message(STATUS " TensorRT version is less than 10.")
link_libraries(
opencv_core
opencv_highgui
opencv_imgproc
opencv_imgcodecs
cudart
cublas
nvinfer
)
endif ()

include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/plugin)
include_directories(${CMAKE_SOURCE_DIR}/src)
link_directories(${CMAKE_SOURCE_DIR}/lib)

add_definitions(-DNOMINMAX)

add_definitions(-DAPI_EXPORTS)

file(GLOB_RECURSE SRCS ${CMAKE_SOURCE_DIR}/src/*.cpp ${CMAKE_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

add_library(myplugins SHARED ${PLUGIN_SRCS})
target_link_libraries(myplugins nvinfer_10 nvinfer_plugin_10 cudart)

add_executable(yolov8_cls yolov8_cls.cpp ${SRCS})
target_link_libraries(yolov8_cls myplugins)

add_executable(yolov8_det yolov8_det.cpp ${SRCS})
target_link_libraries(yolov8_det nvinfer_10)
target_link_libraries(yolov8_det cudart)
target_link_libraries(yolov8_det myplugins)
target_link_libraries(yolov8_det ${OpenCV_LIBS})

add_executable(yolov8_seg yolov8_seg.cpp ${SRCS})
target_link_libraries(yolov8_seg nvinfer_10)
target_link_libraries(yolov8_seg cudart)
target_link_libraries(yolov8_seg myplugins)
target_link_libraries(yolov8_seg ${OpenCV_LIBS})

add_executable(yolov8_pose yolov8_pose.cpp ${SRCS})
target_link_libraries(yolov8_pose nvinfer_10)
target_link_libraries(yolov8_pose cudart)
target_link_libraries(yolov8_pose myplugins)
target_link_libraries(yolov8_pose ${OpenCV_LIBS})
57 changes: 57 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,57 @@
cmake_minimum_required(VERSION 3.10)

project(yolov8)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of CUDA and TensorRT; adapt them if yours differ
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message("embed_platform on")
include_directories(/usr/local/cuda/targets/aarch64-linux/include)
link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
message("embed_platform off")

# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# tensorrt
include_directories(/workspace/shared/TensorRT-10.2.0.19/include/)
link_directories(/workspace/shared/TensorRT-10.2.0.19/lib/)

# include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
# link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)
endif()

add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov8_det ${PROJECT_SOURCE_DIR}/yolov8_det.cpp ${SRCS})

target_link_libraries(yolov8_det nvinfer)
target_link_libraries(yolov8_det cudart)
target_link_libraries(yolov8_det myplugins)
target_link_libraries(yolov8_det ${OpenCV_LIBS})

add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS})
target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS})
target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})
143 changes: 143 additions & 0 deletions README.md
@@ -0,0 +1,143 @@
## Introduction

This project adds TensorRT-10 support for the YOLOv8 model family.

## Environment

* CUDA: 11.8
* cuDNN: 8.9.1.23
* TensorRT: 10.2.0.19

## Support

* [x] YOLOv8-cls supports FP32/FP16/INT8 and the Python/C++ API
* [x] YOLOv8-det supports FP32/FP16/INT8 and the Python/C++ API
* [x] YOLOv8-seg supports FP32/FP16/INT8 and the Python/C++ API
* [x] YOLOv8-pose supports FP32/FP16/INT8 and the Python/C++ API

## Config

* Choose the YOLOv8 sub-model (n/s/m/l/x/n6/s6/m6/l6/x6) via command-line arguments.
* For other configuration options, see [src/config.h](src/config.h).

## Build and Run

1. Generate a .wts file from the PyTorch .pt weights, or download a .wts file from the model zoo

```shell
git clone https://gitclone.com/github.com/ultralytics/ultralytics.git
git clone https://github.com/mpj1234/YOLOv8-series-TensorRT10.git
cd YOLOv8-series-TensorRT10/
wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-cls.pt
wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt
wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-seg.pt
wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-pose.pt
cp [PATH-TO-YOLOv8-series-TensorRT10]/yolov8/gen_wts.py .
python gen_wts.py -w yolov8n-cls.pt -o yolov8n-cls.wts -t cls
python gen_wts.py -w yolov8n.pt -o yolov8n.wts
python gen_wts.py -w yolov8n-seg.pt -o yolov8n-seg.wts -t seg
python gen_wts.py -w yolov8n-pose.pt -o yolov8n-pose.wts -t pose
# The files yolov8n-cls.wts, yolov8n.wts, yolov8n-seg.wts and yolov8n-pose.wts will be generated.
```
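For reference, the `.wts` format written by `gen_wts.py` (included in this commit) is plain text: the first line holds the tensor count, and each following line holds a tensor name, its element count, and that many big-endian float32 values encoded as hex. The sketch below is not part of the repo; it is just one way to sanity-check a generated file before building an engine:

```python
import struct
import sys


def check_wts(path):
    """Quick sanity check of a .wts file written by gen_wts.py."""
    with open(path) as f:
        expected = int(f.readline())  # first line: number of tensors
        tensors = 0
        for line in f:
            parts = line.split()
            name, count, values = parts[0], int(parts[1]), parts[2:]
            assert len(values) == count, f'{name}: expected {count} values, got {len(values)}'
            if values:
                # each value is a float32 stored big-endian as 8 hex characters
                struct.unpack('>f', bytes.fromhex(values[0]))
            tensors += 1
        assert tensors == expected, f'expected {expected} tensors, found {tensors}'
    print(f'{path}: {tensors} tensors, format looks consistent')


if __name__ == '__main__':
    check_wts(sys.argv[1] if len(sys.argv) > 1 else 'yolov8n.wts')
```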

2. Build YOLOv8-series-TensorRT10 and run

#### Classification

```shell
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov8]/yolov8n-cls.wts .
cmake ..
make

# Download ImageNet labels
wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt

# Build and serialize TensorRT engine
./yolov8_cls -s yolov8n-cls.wts yolov8n-cls.engine [n/s/m/l/x]

# Run inference
./yolov8_cls -d yolov8n-cls.engine ../images
# The results are displayed in the console
```

3. Optional: load and run the TensorRT model in Python
```shell
# Install python-tensorrt, pycuda, etc.
# Make sure yolov8n-cls.engine has already been built
python yolov8_cls_trt.py ./build/yolov8n-cls.engine ../images
# FAQ: pycuda._driver.LogicError on Windows, or "Segmentation fault" on Linux,
# can be fixed by adding the following imports to the Python script:
# import pycuda.autoinit
# import pycuda.driver as cuda
```
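For reference, the minimal pattern below is a sketch (not the repo's `yolov8_cls_trt.py`) showing the `pycuda.autoinit` fix together with deserializing the engine through the TensorRT-10 Python API, which addresses I/O by tensor name instead of the old bindings interface:

```python
import tensorrt as trt
import pycuda.autoinit  # noqa: F401  initializes a CUDA context (fixes the errors above)
import pycuda.driver as cuda  # noqa: F401

logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)

# Deserialize the engine built by ./yolov8_cls -s ...
with open('./build/yolov8n-cls.engine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# TensorRT 10 enumerates named I/O tensors rather than binding indices
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name))

# Inference would then set tensor addresses on the context and call
# context.execute_async_v3(stream_handle), as the repo's Python script does.
```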

#### Detection

```shell
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov8]/yolov8n.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov8_det -s yolov8n.wts yolov8n.engine [n/s/m/l/x]

# Run inference
./yolov8_det -d yolov8n.engine ../images [c/g]
# The results are displayed in the console
```

#### Segmentation

```shell
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov8]/yolov8n-seg.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov8_seg -s yolov8n-seg.wts yolov8n-seg.engine [n/s/m/l/x]

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference
./yolov8_seg -d yolov8n-seg.engine ../images [c/g] coco.txt
# The results are displayed in the console
```

#### Pose

```shell
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov8]/yolov8n-pose.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine [n/s/m/l/x]

# Run inference
./yolov8_pose -d yolov8n-pose.engine ../images c
# The results are displayed in the console
```

## INT8 Quantization
1. Prepare calibration images; you can randomly select about 1000 images from your training set (a sampling sketch follows this list). For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. Unzip it into yolov8_trt10/build
3. Set the macro `USE_INT8` in src/config.h and build again
4. Serialize the model and test
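As a convenience for step 1, here is a hedged Python sketch for sampling roughly 1000 calibration images into the build directory; the source path is a placeholder, and the destination folder should match the calibration image directory expected by src/config.h:

```python
import random
import shutil
from pathlib import Path

train_dir = Path('/path/to/your/train/images')  # placeholder: your training images
calib_dir = Path('build/coco_calib')            # assumption: match the calibration dir in src/config.h
calib_dir.mkdir(parents=True, exist_ok=True)

# Collect candidate images and copy a reproducible random sample
images = sorted(p for p in train_dir.iterdir()
                if p.suffix.lower() in {'.jpg', '.jpeg', '.png'})
random.seed(0)
sample = random.sample(images, min(1000, len(images)))
for p in sample:
    shutil.copy(p, calib_dir / p.name)
print(f'copied {len(sample)} calibration images into {calib_dir}')
```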
57 changes: 57 additions & 0 deletions gen_wts.py
@@ -0,0 +1,57 @@
import sys # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose'],
        help='determines whether the model is detect, cls, seg or pose')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# Initialize
device = 'cpu'

# Load model
model = torch.load(pt_file, map_location=device)['model'].float() # load to FP32

if m_type in ['detect', 'seg', 'pose']:
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]

    delattr(model.model[-1], 'anchors')

model.to(device).eval()

with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')
Binary file added images/bus.jpg
Binary file added images/cat.jpg
Binary file added images/dog.jpg
Binary file added images/zidane.jpg