Ar-Ray-code · Li-Kane · Jun 17, 2025 · Jun 17, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__/
 weights/*.tar
-weights/*.pth
+# weights/*.pth
 YOLOX_outputs/
 *.pth
 *.onnx

diff --git a/README.md b/README.md
@@ -1,5 +1,16 @@
 # YOLOX-ROS
 
+## TR setup 
+### before you build 
+`sudo apt install ros-humble-generate-parameter-library`
+
+### building
+colcon build --cmake-args -DYOLOX_USE_TENSORRT=ON
+
+### running
+`ros2 launch yolox_ros_cpp yolox_tensorrt.launch.py`
+
+
 ![](https://img.shields.io/github/stars/Ar-Ray-code/YOLOX-ROS)
 
 [![iron](https://github.com/Ar-Ray-code/YOLOX-ROS/actions/workflows/ci_iron.yml/badge.svg?branch=iron)](https://github.com/Ar-Ray-code/YOLOX-ROS/actions/workflows/ci_iron.yml)

diff --git a/weights/onnx/armor_s.onnx b/weights/onnx/armor_s.onnx
diff --git a/weights/onnx/armor_sim_s.onnx b/weights/onnx/armor_sim_s.onnx
diff --git a/weights/onnx/armor_tiny.onnx b/weights/onnx/armor_tiny.onnx
diff --git a/weights/tensorrt/armor_s.trt b/weights/tensorrt/armor_s.trt
diff --git a/weights/tensorrt/armor_sim_s.trt b/weights/tensorrt/armor_sim_s.trt
diff --git a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.14)
 project(yolox_cpp)
 
 if(NOT CMAKE_CXX_STANDARD)
@@ -11,6 +11,11 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   add_compile_options(-Wall -Wextra -Wpedantic)
 endif()
 
+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math -DNDEBUG")
+set(CMAKE_CUDA_ARCHITECTURES 87)
+project(yolox_cpp CXX CUDA)
+
 find_package(ament_cmake_auto REQUIRED)
 ament_auto_find_build_dependencies()
 
@@ -46,15 +51,15 @@ endif()
 
 if(YOLOX_USE_TENSORRT)
   find_package(CUDA REQUIRED)
+  enable_language(CUDA) 
   if (NOT CUDA_TOOLKIT_ROOT_DIR)
     set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
   endif()
   find_path(TENSORRT_INCLUDE_DIR NvInfer.h
     HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR}
     PATH_SUFFIXES include)
-
   set(ENABLE_TENSORRT ON)
-  set(TARGET_SRC ${TARGET_SRC} src/yolox_tensorrt.cpp)
+  set(TARGET_SRC ${TARGET_SRC} src/yolox_tensorrt.cpp src/cuda_utils.cu)
   set(TARGET_LIBS ${TARGET_LIBS} nvinfer nvinfer_plugin)
   set(TARGET_DPENDENCIES ${TARGET_DPENDENCIES} CUDA)
 endif()
@@ -85,6 +90,11 @@ configure_file(
 )
 
 ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC})
+target_include_directories(yolox_cpp PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+)
+
 ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES})
 ament_export_dependencies(${TARGET_DPENDENCIES})
 target_link_libraries(yolox_cpp ${TARGET_LIBS})

diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
@@ -39,7 +39,8 @@ namespace yolox_cpp
               num_classes_(num_classes), p6_(p6), model_version_(model_version)
         {
         }
-        virtual std::vector<Object> inference(const cv::Mat &frame) = 0;
+        virtual std::vector<Object> inference(const cv::Mat &frame, uchar3* d_image, float* d_output, 
+                                                cudaStream_t copy_stream_, cudaStream_t resize_stream_) = 0;
 
     protected:
         int input_w_;
@@ -76,35 +77,20 @@ namespace yolox_cpp
         // for NCHW
         void blobFromImage(const cv::Mat &img, float *blob_data)
         {
-            const size_t channels = 3;
-            const size_t img_h = img.rows;
-            const size_t img_w = img.cols;
-            const size_t img_hw = img_h * img_w;
-            float *blob_data_ch0 = blob_data;
-            float *blob_data_ch1 = blob_data + img_hw;
-            float *blob_data_ch2 = blob_data + img_hw * 2;
-            // HWC -> CHW
-            if (this->model_version_ == "0.1.0")
-            {
-                for (size_t i = 0; i < img_hw; ++i)
-                {
-                    // blob = (img / 255.0 - mean) / std
-                    const size_t src_idx = i * channels;
-                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]) * this->std255_inv_[0] + this->mean_std_[0];
-                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]) * this->std255_inv_[1] + this->mean_std_[1];
-                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]) * this->std255_inv_[2] + this->mean_std_[2];
-                }
-            }
-            else
-            {
-                for (size_t i = 0; i < img_hw; ++i)
-                {
-                    const size_t src_idx = i * channels;
-                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]);
-                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]);
-                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]);
-                }
-            }
+            cv::Scalar mean = cv::Scalar(0, 0, 0);
+            cv::Scalar std_inv = cv::Scalar(1.0, 1.0, 1.0); // No normalization
+
+            cv::Mat blob = cv::dnn::blobFromImage(
+                img,
+                1.0,        // dummy scalefactor (will apply manually below)
+                cv::Size(), // use original size
+                mean,       // mean subtraction
+                true,       // swapRB = true
+                false       // crop = false
+            );
+
+            // Copy to output buffer
+            std::memcpy(blob_data, blob.ptr<float>(), sizeof(float) * 3 * img.rows * img.cols);
         }
 
         // for NHWC

diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cuh b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <opencv2/core/types.hpp>
+
+__global__ void gpuResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                            int input_width, int input_height, float w_ratio, float h_ratio);
+extern "C" void launchGPUResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                    int input_width, int input_height, float w_ratio, float h_ratio, cudaStream_t stream);