diff --git a/.gitignore b/.gitignore
index 134ceb9..d6610cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__/
 weights/*.tar
-weights/*.pth
+# weights/*.pth
 YOLOX_outputs/
 *.pth
 *.onnx
diff --git a/README.md b/README.md
index 38434df..7b91a20 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,16 @@
 # YOLOX-ROS
 
+## TR setup 
+### before you build 
+`sudo apt install ros-humble-generate-parameter-library`
+
+### building
+colcon build --cmake-args -DYOLOX_USE_TENSORRT=ON
+
+### running
+`ros2 launch yolox_ros_cpp yolox_tensorrt.launch.py`
+
+
 ![](https://img.shields.io/github/stars/Ar-Ray-code/YOLOX-ROS)
 
 [![iron](https://github.com/Ar-Ray-code/YOLOX-ROS/actions/workflows/ci_iron.yml/badge.svg?branch=iron)](https://github.com/Ar-Ray-code/YOLOX-ROS/actions/workflows/ci_iron.yml)
diff --git a/weights/onnx/armor_s.onnx b/weights/onnx/armor_s.onnx
new file mode 100644
index 0000000..0d6064b
Binary files /dev/null and b/weights/onnx/armor_s.onnx differ
diff --git a/weights/onnx/armor_sim_s.onnx b/weights/onnx/armor_sim_s.onnx
new file mode 100644
index 0000000..e109778
Binary files /dev/null and b/weights/onnx/armor_sim_s.onnx differ
diff --git a/weights/onnx/armor_tiny.onnx b/weights/onnx/armor_tiny.onnx
new file mode 100644
index 0000000..0cdf5b1
Binary files /dev/null and b/weights/onnx/armor_tiny.onnx differ
diff --git a/weights/tensorrt/armor_s.trt b/weights/tensorrt/armor_s.trt
new file mode 100644
index 0000000..f033e89
Binary files /dev/null and b/weights/tensorrt/armor_s.trt differ
diff --git a/weights/tensorrt/armor_sim_s.trt b/weights/tensorrt/armor_sim_s.trt
new file mode 100644
index 0000000..c1c41d3
Binary files /dev/null and b/weights/tensorrt/armor_sim_s.trt differ
diff --git a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
index db849f4..d9834ce 100644
--- a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
+++ b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.14)
 project(yolox_cpp)
 
 if(NOT CMAKE_CXX_STANDARD)
@@ -11,6 +11,11 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   add_compile_options(-Wall -Wextra -Wpedantic)
 endif()
 
+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math -DNDEBUG")
+set(CMAKE_CUDA_ARCHITECTURES 87)
+project(yolox_cpp CXX CUDA)
+
 find_package(ament_cmake_auto REQUIRED)
 ament_auto_find_build_dependencies()
 
@@ -46,15 +51,15 @@ endif()
 
 if(YOLOX_USE_TENSORRT)
   find_package(CUDA REQUIRED)
+  enable_language(CUDA) 
   if (NOT CUDA_TOOLKIT_ROOT_DIR)
     set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
   endif()
   find_path(TENSORRT_INCLUDE_DIR NvInfer.h
     HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR}
     PATH_SUFFIXES include)
-
   set(ENABLE_TENSORRT ON)
-  set(TARGET_SRC ${TARGET_SRC} src/yolox_tensorrt.cpp)
+  set(TARGET_SRC ${TARGET_SRC} src/yolox_tensorrt.cpp src/cuda_utils.cu)
   set(TARGET_LIBS ${TARGET_LIBS} nvinfer nvinfer_plugin)
   set(TARGET_DPENDENCIES ${TARGET_DPENDENCIES} CUDA)
 endif()
@@ -85,6 +90,11 @@ configure_file(
 )
 
 ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC})
+target_include_directories(yolox_cpp PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+)
+
 ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES})
 ament_export_dependencies(${TARGET_DPENDENCIES})
 target_link_libraries(yolox_cpp ${TARGET_LIBS})
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
index 5360908..217afd1 100644
--- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
@@ -39,7 +39,8 @@ namespace yolox_cpp
               num_classes_(num_classes), p6_(p6), model_version_(model_version)
         {
         }
-        virtual std::vector<Object> inference(const cv::Mat &frame) = 0;
+        virtual std::vector<Object> inference(const cv::Mat &frame, uchar3* d_image, float* d_output, 
+                                                cudaStream_t copy_stream_, cudaStream_t resize_stream_) = 0;
 
     protected:
         int input_w_;
@@ -76,35 +77,20 @@ namespace yolox_cpp
         // for NCHW
         void blobFromImage(const cv::Mat &img, float *blob_data)
         {
-            const size_t channels = 3;
-            const size_t img_h = img.rows;
-            const size_t img_w = img.cols;
-            const size_t img_hw = img_h * img_w;
-            float *blob_data_ch0 = blob_data;
-            float *blob_data_ch1 = blob_data + img_hw;
-            float *blob_data_ch2 = blob_data + img_hw * 2;
-            // HWC -> CHW
-            if (this->model_version_ == "0.1.0")
-            {
-                for (size_t i = 0; i < img_hw; ++i)
-                {
-                    // blob = (img / 255.0 - mean) / std
-                    const size_t src_idx = i * channels;
-                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]) * this->std255_inv_[0] + this->mean_std_[0];
-                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]) * this->std255_inv_[1] + this->mean_std_[1];
-                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]) * this->std255_inv_[2] + this->mean_std_[2];
-                }
-            }
-            else
-            {
-                for (size_t i = 0; i < img_hw; ++i)
-                {
-                    const size_t src_idx = i * channels;
-                    blob_data_ch0[i] = static_cast<float>(img.data[src_idx + 0]);
-                    blob_data_ch1[i] = static_cast<float>(img.data[src_idx + 1]);
-                    blob_data_ch2[i] = static_cast<float>(img.data[src_idx + 2]);
-                }
-            }
+            cv::Scalar mean = cv::Scalar(0, 0, 0);
+            cv::Scalar std_inv = cv::Scalar(1.0, 1.0, 1.0); // No normalization
+
+            cv::Mat blob = cv::dnn::blobFromImage(
+                img,
+                1.0,        // dummy scalefactor (will apply manually below)
+                cv::Size(), // use original size
+                mean,       // mean subtraction
+                true,       // swapRB = true
+                false       // crop = false
+            );
+
+            // Copy to output buffer
+            std::memcpy(blob_data, blob.ptr<float>(), sizeof(float) * 3 * img.rows * img.cols);
         }
 
         // for NHWC
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cuh b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cuh
new file mode 100644
index 0000000..87b2b0e
--- /dev/null
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <opencv2/core/types.hpp>
+
+__global__ void gpuResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                            int input_width, int input_height, float w_ratio, float h_ratio);
+extern "C" void launchGPUResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                    int input_width, int input_height, float w_ratio, float h_ratio, cudaStream_t stream);
\ No newline at end of file
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/tensorrt_logging.h b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/tensorrt_logging.h
index abea1ec..0a75ded 100644
--- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/tensorrt_logging.h
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/tensorrt_logging.h
@@ -26,503 +26,507 @@
 #include <sstream>
 #include <string>
 
-namespace yolox_cpp
-{
-
-    using Severity = nvinfer1::ILogger::Severity;
-
-    class LogStreamConsumerBuffer : public std::stringbuf
-    {
-    public:
-        LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
-            : mOutput(stream)
-            , mPrefix(prefix)
-            , mShouldLog(shouldLog)
-        {
-        }
-
-        LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
-            : mOutput(other.mOutput)
-            , mPrefix(other.mPrefix)
-            , mShouldLog(other.mShouldLog)
-        {
-        }
-        LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete;
-        LogStreamConsumerBuffer() = delete;
-        LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete;
-        LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete;
-
-        ~LogStreamConsumerBuffer() override
-        {
-            // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
-            // std::streambuf::pptr() gives a pointer to the current position of the output sequence
-            // if the pointer to the beginning is not equal to the pointer to the current position,
-            // call putOutput() to log the output to the stream
-            if (pbase() != pptr())
-            {
-                putOutput();
-            }
-        }
-
-        //!
-        //! synchronizes the stream buffer and returns 0 on success
-        //! synchronizing the stream buffer consists of inserting the buffer contents into the stream,
-        //! resetting the buffer and flushing the stream
-        //!
-        int32_t sync() override
-        {
+namespace yolox_cpp {
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf {
+  public:
+    LogStreamConsumerBuffer(std::ostream &stream, const std::string &prefix,
+                            bool shouldLog)
+        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}
+
+    LogStreamConsumerBuffer(LogStreamConsumerBuffer &&other) noexcept
+        : mOutput(other.mOutput), mPrefix(other.mPrefix),
+          mShouldLog(other.mShouldLog) {}
+    LogStreamConsumerBuffer(const LogStreamConsumerBuffer &other) = delete;
+    LogStreamConsumerBuffer() = delete;
+    LogStreamConsumerBuffer &
+    operator=(const LogStreamConsumerBuffer &) = delete;
+    LogStreamConsumerBuffer &operator=(LogStreamConsumerBuffer &&) = delete;
+
+    ~LogStreamConsumerBuffer() override {
+        // std::streambuf::pbase() gives a pointer to the beginning of the
+        // buffered part of the output sequence std::streambuf::pptr() gives a
+        // pointer to the current position of the output sequence if the pointer
+        // to the beginning is not equal to the pointer to the current position,
+        // call putOutput() to log the output to the stream
+        if (pbase() != pptr()) {
             putOutput();
-            return 0;
         }
-
-        void putOutput()
-        {
-            if (mShouldLog)
-            {
-                // prepend timestamp
-                std::time_t timestamp = std::time(nullptr);
-                tm* tm_local = std::localtime(&timestamp);
-                mOutput << "[";
-                mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
-                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
-                mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
-                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
-                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
-                mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
-                // std::stringbuf::str() gets the string contents of the buffer
-                // insert the buffer contents pre-appended by the appropriate prefix into the stream
-                mOutput << mPrefix << str();
-            }
-            // set the buffer to empty
-            str("");
-            // flush the stream
-            mOutput.flush();
-        }
-
-        void setShouldLog(bool shouldLog)
-        {
-            mShouldLog = shouldLog;
-        }
-
-    private:
-        std::ostream& mOutput;
-        std::string mPrefix;
-        bool mShouldLog{};
-    }; // class LogStreamConsumerBuffer
+    }
 
     //!
-    //! \class LogStreamConsumerBase
-    //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
+    //! synchronizes the stream buffer and returns 0 on success
+    //! synchronizing the stream buffer consists of inserting the buffer
+    //! contents into the stream, resetting the buffer and flushing the stream
     //!
-    class LogStreamConsumerBase
-    {
-    public:
-        LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
-            : mBuffer(stream, prefix, shouldLog)
-        {
-        }
+    int32_t sync() override {
+        putOutput();
+        return 0;
+    }
 
-    protected:
-        LogStreamConsumerBuffer mBuffer;
-    }; // class LogStreamConsumerBase
-
-    //!
-    //! \class LogStreamConsumer
-    //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
-    //!  Order of base classes is LogStreamConsumerBase and then std::ostream.
-    //!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
-    //!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
-    //!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
-    //!  Please do not change the order of the parent classes.
-    //!
-    class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
-    {
-    public:
-        //!
-        //! \brief Creates a LogStreamConsumer which logs messages with level severity.
-        //!  Reportable severity determines if the messages are severe enough to be logged.
-        //!
-        LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity)
-            : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
-            , std::ostream(&mBuffer) // links the stream buffer with the stream
-            , mShouldLog(severity <= reportableSeverity)
-            , mSeverity(severity)
-        {
+    void putOutput() {
+        if (mShouldLog) {
+            // prepend timestamp
+            std::time_t timestamp = std::time(nullptr);
+            tm *tm_local = std::localtime(&timestamp);
+            mOutput << "[";
+            mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon
+                    << "/";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday
+                    << "/";
+            mOutput << std::setw(4) << std::setfill('0')
+                    << 1900 + tm_local->tm_year << "-";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour
+                    << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min
+                    << ":";
+            mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec
+                    << "] ";
+            // std::stringbuf::str() gets the string contents of the buffer
+            // insert the buffer contents pre-appended by the appropriate prefix
+            // into the stream
+            mOutput << mPrefix << str();
         }
+        // set the buffer to empty
+        str("");
+        // flush the stream
+        mOutput.flush();
+    }
 
-        LogStreamConsumer(LogStreamConsumer&& other) noexcept
-            : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
-            , std::ostream(&mBuffer) // links the stream buffer with the stream
-            , mShouldLog(other.mShouldLog)
-            , mSeverity(other.mSeverity)
-        {
-        }
-        LogStreamConsumer(const LogStreamConsumer& other) = delete;
-        LogStreamConsumer() = delete;
-        ~LogStreamConsumer() = default;
-        LogStreamConsumer& operator=(const LogStreamConsumer&) = delete;
-        LogStreamConsumer& operator=(LogStreamConsumer&&) = delete;
-
-        void setReportableSeverity(Severity reportableSeverity)
-        {
-            mShouldLog = mSeverity <= reportableSeverity;
-            mBuffer.setShouldLog(mShouldLog);
-        }
+    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }
+
+  private:
+    std::ostream &mOutput;
+    std::string mPrefix;
+    bool mShouldLog{};
+}; // class LogStreamConsumerBuffer
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before
+//! std::ostream in LogStreamConsumer
+//!
+class LogStreamConsumerBase {
+  public:
+    LogStreamConsumerBase(std::ostream &stream, const std::string &prefix,
+                          bool shouldLog)
+        : mBuffer(stream, prefix, shouldLog) {}
+
+  protected:
+    LogStreamConsumerBuffer mBuffer;
+}; // class LogStreamConsumerBase
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when
+//! logging messages.
+//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
+//!  This is because the LogStreamConsumerBase class is used to initialize the
+//!  LogStreamConsumerBuffer member field in LogStreamConsumer and then the
+//!  address of the buffer is passed to std::ostream. This is necessary to
+//!  prevent the address of an uninitialized buffer from being passed to
+//!  std::ostream. Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
+  public:
+    //!
+    //! \brief Creates a LogStreamConsumer which logs messages with level
+    //! severity.
+    //!  Reportable severity determines if the messages are severe enough to be
+    //!  logged.
+    //!
+    LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity,
+                      nvinfer1::ILogger::Severity severity)
+        : LogStreamConsumerBase(severityOstream(severity),
+                                severityPrefix(severity),
+                                severity <= reportableSeverity),
+          std::ostream(&mBuffer) // links the stream buffer with the stream
+          ,
+          mShouldLog(severity <= reportableSeverity), mSeverity(severity) {}
+
+    LogStreamConsumer(LogStreamConsumer &&other) noexcept
+        : LogStreamConsumerBase(severityOstream(other.mSeverity),
+                                severityPrefix(other.mSeverity),
+                                other.mShouldLog),
+          std::ostream(&mBuffer) // links the stream buffer with the stream
+          ,
+          mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {}
+    LogStreamConsumer(const LogStreamConsumer &other) = delete;
+    LogStreamConsumer() = delete;
+    ~LogStreamConsumer() = default;
+    LogStreamConsumer &operator=(const LogStreamConsumer &) = delete;
+    LogStreamConsumer &operator=(LogStreamConsumer &&) = delete;
+
+    void setReportableSeverity(Severity reportableSeverity) {
+        mShouldLog = mSeverity <= reportableSeverity;
+        mBuffer.setShouldLog(mShouldLog);
+    }
 
-    private:
-        static std::ostream& severityOstream(Severity severity)
-        {
-            return severity >= Severity::kINFO ? std::cout : std::cerr;
-        }
+  private:
+    static std::ostream &severityOstream(Severity severity) {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
+    }
 
-        static std::string severityPrefix(Severity severity)
-        {
-            switch (severity)
-            {
-            case Severity::kINTERNAL_ERROR: return "[F] ";
-            case Severity::kERROR: return "[E] ";
-            case Severity::kWARNING: return "[W] ";
-            case Severity::kINFO: return "[I] ";
-            case Severity::kVERBOSE: return "[V] ";
-            default: assert(0); return "";
-            }
+    static std::string severityPrefix(Severity severity) {
+        switch (severity) {
+        case Severity::kINTERNAL_ERROR:
+            return "[F] ";
+        case Severity::kERROR:
+            return "[E] ";
+        case Severity::kWARNING:
+            return "[W] ";
+        case Severity::kINFO:
+            return "[I] ";
+        case Severity::kVERBOSE:
+            return "[V] ";
+        default:
+            assert(0);
+            return "";
         }
+    }
 
-        bool mShouldLog;
-        Severity mSeverity;
-    }; // class LogStreamConsumer
+    bool mShouldLog;
+    Severity mSeverity;
+}; // class LogStreamConsumer
+
+//!
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and
+//! samples to log information to the console, and supports logging two types of
+//! messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or
+//! internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to
+//! emitting directly to stdout/stderr is that the logic for controlling the
+//! verbosity and formatting of sample output is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results
+//! to a file in some standard format (for example, JUnit XML), and providing
+//! additional metadata (e.g. timing the duration of a test run).
+//!
+//! TODO: For backwards compatibility with existing samples, this class inherits
+//! directly from the nvinfer1::ILogger interface, which is problematic since
+//! there isn't a clean separation between messages coming from the TensorRT
+//! library and messages coming from the sample.
+//!
+//! In the future (once all samples are updated to use Logger::getTRTLogger() to
+//! access the ILogger) we can refactor the class to eliminate the inheritance
+//! and instead make the nvinfer1::ILogger implementation a member of the Logger
+//! object.
+//!
+class Logger : public nvinfer1::ILogger {
+  public:
+    explicit Logger(Severity severity = Severity::kWARNING)
+        : mReportableSeverity(severity) {}
+
+    //!
+    //! \enum TestResult
+    //! \brief Represents the state of a given test
+    //!
+    enum class TestResult {
+        kRUNNING, //!< The test is running
+        kPASSED,  //!< The test passed
+        kFAILED,  //!< The test failed
+        kWAIVED   //!< The test was waived
+    };
+
+    //!
+    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger
+    //! associated with this Logger \return The nvinfer1::ILogger associated
+    //! with this Logger
+    //!
+    //! TODO Once all samples are updated to use this method to register the
+    //! logger with TensorRT, we can eliminate the inheritance of Logger from
+    //! ILogger
+    //!
+    nvinfer1::ILogger &getTRTLogger() noexcept { return *this; }
+
+    //!
+    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
+    //!
+    //! Note samples should not be calling this function directly; it will
+    //! eventually go away once we eliminate the inheritance from
+    //! nvinfer1::ILogger
+    //!
+    void log(Severity severity, const char *msg) noexcept override {
+        LogStreamConsumer(mReportableSeverity, severity)
+            << "[TRT] " << std::string(msg) << std::endl;
+    }
 
     //!
-    //! \class Logger
+    //! \brief Method for controlling the verbosity of logging output
     //!
-    //! \brief Class which manages logging of TensorRT tools and samples
+    //! \param severity The logger will only emit messages that have severity of
+    //! this level or higher.
     //!
-    //! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
-    //! and supports logging two types of messages:
-    //!
-    //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
-    //! - Test pass/fail messages
-    //!
-    //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
-    //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
-    //!
-    //! In the future, this class could be extended to support dumping test results to a file in some standard format
-    //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
+    void setReportableSeverity(Severity severity) noexcept {
+        mReportableSeverity = severity;
+    }
+
     //!
-    //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
-    //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
-    //! library and messages coming from the sample.
+    //! \brief Opaque handle that holds logging information for a particular
+    //! test
     //!
-    //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
-    //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
-    //! object.
+    //! This object is an opaque handle to information used by the Logger to
+    //! print test results. The sample must call Logger::defineTest() in order
+    //! to obtain a TestAtom that can be used with
+    //! Logger::reportTest{Start,End}().
     //!
-    class Logger : public nvinfer1::ILogger
-    {
-    public:
-        explicit Logger(Severity severity = Severity::kWARNING)
-            : mReportableSeverity(severity)
-        {
-        }
-
-        //!
-        //! \enum TestResult
-        //! \brief Represents the state of a given test
-        //!
-        enum class TestResult
-        {
-            kRUNNING, //!< The test is running
-            kPASSED,  //!< The test passed
-            kFAILED,  //!< The test failed
-            kWAIVED   //!< The test was waived
-        };
-
-        //!
-        //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
-        //! \return The nvinfer1::ILogger associated with this Logger
-        //!
-        //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
-        //! we can eliminate the inheritance of Logger from ILogger
-        //!
-        nvinfer1::ILogger& getTRTLogger() noexcept
-        {
-            return *this;
-        }
-
-        //!
-        //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
-        //!
-        //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
-        //! inheritance from nvinfer1::ILogger
-        //!
-        void log(Severity severity, const char* msg) noexcept override
-        {
-            LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
-        }
-
-        //!
-        //! \brief Method for controlling the verbosity of logging output
-        //!
-        //! \param severity The logger will only emit messages that have severity of this level or higher.
-        //!
-        void setReportableSeverity(Severity severity) noexcept
-        {
-            mReportableSeverity = severity;
-        }
+    class TestAtom {
+      public:
+        TestAtom(TestAtom &&) = default;
 
-        //!
-        //! \brief Opaque handle that holds logging information for a particular test
-        //!
-        //! This object is an opaque handle to information used by the Logger to print test results.
-        //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
-        //! with Logger::reportTest{Start,End}().
-        //!
-        class TestAtom
-        {
-        public:
-            TestAtom(TestAtom&&) = default;
-
-        private:
-            friend class Logger;
-
-            TestAtom(bool started, const std::string& name, const std::string& cmdline)
-                : mStarted(started)
-                , mName(name)
-                , mCmdline(cmdline)
-            {
-            }
-
-            bool mStarted;
-            std::string mName;
-            std::string mCmdline;
-        };
-
-        //!
-        //! \brief Define a test for logging
-        //!
-        //! \param[in] name The name of the test.  This should be a string starting with
-        //!                  "TensorRT" and containing dot-separated strings containing
-        //!                  the characters [A-Za-z0-9_].
-        //!                  For example, "TensorRT.sample_googlenet"
-        //! \param[in] cmdline The command line used to reproduce the test
-        //
-        //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
-        //!
-        static TestAtom defineTest(const std::string& name, const std::string& cmdline)
-        {
-            return TestAtom(false, name, cmdline);
-        }
-
-        //!
-        //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
-        //!        as input
-        //!
-        //! \param[in] name The name of the test
-        //! \param[in] argc The number of command-line arguments
-        //! \param[in] argv The array of command-line arguments (given as C strings)
-        //!
-        //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
-        //!
-        static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv)
-        {
-            // Append TensorRT version as info
-            const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]";
-            auto cmdline = genCmdlineString(argc, argv);
-            return defineTest(vname, cmdline);
-        }
-
-        //!
-        //! \brief Report that a test has started.
-        //!
-        //! \pre reportTestStart() has not been called yet for the given testAtom
-        //!
-        //! \param[in] testAtom The handle to the test that has started
-        //!
-        static void reportTestStart(TestAtom& testAtom)
-        {
-            reportTestResult(testAtom, TestResult::kRUNNING);
-            assert(!testAtom.mStarted);
-            testAtom.mStarted = true;
-        }
-
-        //!
-        //! \brief Report that a test has ended.
-        //!
-        //! \pre reportTestStart() has been called for the given testAtom
-        //!
-        //! \param[in] testAtom The handle to the test that has ended
-        //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
-        //!                   TestResult::kFAILED, TestResult::kWAIVED
-        //!
-        static void reportTestEnd(TestAtom const& testAtom, TestResult result)
-        {
-            assert(result != TestResult::kRUNNING);
-            assert(testAtom.mStarted);
-            reportTestResult(testAtom, result);
-        }
+      private:
+        friend class Logger;
 
-        static int32_t reportPass(TestAtom const& testAtom)
-        {
-            reportTestEnd(testAtom, TestResult::kPASSED);
-            return EXIT_SUCCESS;
-        }
+        TestAtom(bool started, const std::string &name,
+                 const std::string &cmdline)
+            : mStarted(started), mName(name), mCmdline(cmdline) {}
 
-        static int32_t reportFail(TestAtom const& testAtom)
-        {
-            reportTestEnd(testAtom, TestResult::kFAILED);
-            return EXIT_FAILURE;
-        }
+        bool mStarted;
+        std::string mName;
+        std::string mCmdline;
+    };
 
-        static int32_t reportWaive(TestAtom const& testAtom)
-        {
-            reportTestEnd(testAtom, TestResult::kWAIVED);
-            return EXIT_SUCCESS;
-        }
-
-        static int32_t reportTest(TestAtom const& testAtom, bool pass)
-        {
-            return pass ? reportPass(testAtom) : reportFail(testAtom);
-        }
-
-        Severity getReportableSeverity() const
-        {
-            return mReportableSeverity;
-        }
-
-    private:
-        //!
-        //! \brief returns an appropriate string for prefixing a log message with the given severity
-        //!
-        static const char* severityPrefix(Severity severity)
-        {
-            switch (severity)
-            {
-            case Severity::kINTERNAL_ERROR: return "[F] ";
-            case Severity::kERROR: return "[E] ";
-            case Severity::kWARNING: return "[W] ";
-            case Severity::kINFO: return "[I] ";
-            case Severity::kVERBOSE: return "[V] ";
-            default: assert(0); return "";
-            }
-        }
-
-        //!
-        //! \brief returns an appropriate string for prefixing a test result message with the given result
-        //!
-        static const char* testResultString(TestResult result)
-        {
-            switch (result)
-            {
-            case TestResult::kRUNNING: return "RUNNING";
-            case TestResult::kPASSED: return "PASSED";
-            case TestResult::kFAILED: return "FAILED";
-            case TestResult::kWAIVED: return "WAIVED";
-            default: assert(0); return "";
-            }
-        }
-
-        //!
-        //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
-        //!
-        static std::ostream& severityOstream(Severity severity)
-        {
-            return severity >= Severity::kINFO ? std::cout : std::cerr;
-        }
-
-        //!
-        //! \brief method that implements logging test results
-        //!
-        static void reportTestResult(TestAtom const& testAtom, TestResult result)
-        {
-            severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
-                                            << testAtom.mCmdline << std::endl;
-        }
-
-        //!
-        //! \brief generate a command line string from the given (argc, argv) values
-        //!
-        static std::string genCmdlineString(int32_t argc, char const* const* argv)
-        {
-            std::stringstream ss;
-            for (int32_t i = 0; i < argc; i++)
-            {
-                if (i > 0)
-                {
-                    ss << " ";
-                }
-                ss << argv[i];
-            }
-            return ss.str();
-        }
-
-        Severity mReportableSeverity;
-    }; // class Logger
-
-} // namespace yolox_cpp
+    //!
+    //! \brief Define a test for logging
+    //!
+    //! \param[in] name The name of the test.  This should be a string starting
+    //! with
+    //!                  "TensorRT" and containing dot-separated strings
+    //!                  containing the characters [A-Za-z0-9_]. For example,
+    //!                  "TensorRT.sample_googlenet"
+    //! \param[in] cmdline The command line used to reproduce the test
+    //
+    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+    //!
+    static TestAtom defineTest(const std::string &name,
+                               const std::string &cmdline) {
+        return TestAtom(false, name, cmdline);
+    }
 
-namespace
-{
-    using namespace yolox_cpp;
     //!
-    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
+    //! \brief A convenience overloaded version of defineTest() that accepts an
+    //! array of command-line arguments
+    //!        as input
     //!
-    //! Example usage:
+    //! \param[in] name The name of the test
+    //! \param[in] argc The number of command-line arguments
+    //! \param[in] argv The array of command-line arguments (given as C strings)
     //!
-    //!     LOG_VERBOSE(logger) << "hello world" << std::endl;
+    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
     //!
-    inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
-    {
-        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
+    static TestAtom defineTest(const std::string &name, int32_t argc,
+                               char const *const *argv) {
+        // Append TensorRT version as info
+        const std::string vname =
+            name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]";
+        auto cmdline = genCmdlineString(argc, argv);
+        return defineTest(vname, cmdline);
     }
 
     //!
-    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
+    //! \brief Report that a test has started.
     //!
-    //! Example usage:
+    //! \pre reportTestStart() has not been called yet for the given testAtom
     //!
-    //!     LOG_INFO(logger) << "hello world" << std::endl;
+    //! \param[in] testAtom The handle to the test that has started
     //!
-    inline LogStreamConsumer LOG_INFO(const Logger& logger)
-    {
-        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
+    static void reportTestStart(TestAtom &testAtom) {
+        reportTestResult(testAtom, TestResult::kRUNNING);
+        assert(!testAtom.mStarted);
+        testAtom.mStarted = true;
     }
 
     //!
-    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
+    //! \brief Report that a test has ended.
     //!
-    //! Example usage:
+    //! \pre reportTestStart() has been called for the given testAtom
     //!
-    //!     LOG_WARN(logger) << "hello world" << std::endl;
+    //! \param[in] testAtom The handle to the test that has ended
+    //! \param[in] result The result of the test. Should be one of
+    //! TestResult::kPASSED,
+    //!                   TestResult::kFAILED, TestResult::kWAIVED
     //!
-    inline LogStreamConsumer LOG_WARN(const Logger& logger)
-    {
-        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
+    static void reportTestEnd(TestAtom const &testAtom, TestResult result) {
+        assert(result != TestResult::kRUNNING);
+        assert(testAtom.mStarted);
+        reportTestResult(testAtom, result);
+    }
+
+    static int32_t reportPass(TestAtom const &testAtom) {
+        reportTestEnd(testAtom, TestResult::kPASSED);
+        return EXIT_SUCCESS;
+    }
+
+    static int32_t reportFail(TestAtom const &testAtom) {
+        reportTestEnd(testAtom, TestResult::kFAILED);
+        return EXIT_FAILURE;
+    }
+
+    static int32_t reportWaive(TestAtom const &testAtom) {
+        reportTestEnd(testAtom, TestResult::kWAIVED);
+        return EXIT_SUCCESS;
+    }
+
+    static int32_t reportTest(TestAtom const &testAtom, bool pass) {
+        return pass ? reportPass(testAtom) : reportFail(testAtom);
+    }
+
+    Severity getReportableSeverity() const { return mReportableSeverity; }
+
+  private:
+    //!
+    //! \brief returns an appropriate string for prefixing a log message with
+    //! the given severity
+    //!
+    static const char *severityPrefix(Severity severity) {
+        switch (severity) {
+        case Severity::kINTERNAL_ERROR:
+            return "[F] ";
+        case Severity::kERROR:
+            return "[E] ";
+        case Severity::kWARNING:
+            return "[W] ";
+        case Severity::kINFO:
+            return "[I] ";
+        case Severity::kVERBOSE:
+            return "[V] ";
+        default:
+            assert(0);
+            return "";
+        }
     }
 
     //!
-    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
-    //!
-    //! Example usage:
+    //! \brief returns an appropriate string for prefixing a test result message
+    //! with the given result
+    //!
+    static const char *testResultString(TestResult result) {
+        switch (result) {
+        case TestResult::kRUNNING:
+            return "RUNNING";
+        case TestResult::kPASSED:
+            return "PASSED";
+        case TestResult::kFAILED:
+            return "FAILED";
+        case TestResult::kWAIVED:
+            return "WAIVED";
+        default:
+            assert(0);
+            return "";
+        }
+    }
+
     //!
-    //!     LOG_ERROR(logger) << "hello world" << std::endl;
+    //! \brief returns an appropriate output stream (cout or cerr) to use with
+    //! the given severity
     //!
-    inline LogStreamConsumer LOG_ERROR(const Logger& logger)
-    {
-        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
+    static std::ostream &severityOstream(Severity severity) {
+        return severity >= Severity::kINFO ? std::cout : std::cerr;
     }
 
     //!
-    //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
-    //!        ("fatal" severity)
+    //! \brief method that implements logging test results
     //!
-    //! Example usage:
+    static void reportTestResult(TestAtom const &testAtom, TestResult result) {
+        severityOstream(Severity::kINFO)
+            << "&&&& " << testResultString(result) << " " << testAtom.mName
+            << " # " << testAtom.mCmdline << std::endl;
+    }
+
     //!
-    //!     LOG_FATAL(logger) << "hello world" << std::endl;
+    //! \brief generate a command line string from the given (argc, argv) values
     //!
-    inline LogStreamConsumer LOG_FATAL(const Logger& logger)
-    {
-        return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
+    static std::string genCmdlineString(int32_t argc, char const *const *argv) {
+        std::stringstream ss;
+        for (int32_t i = 0; i < argc; i++) {
+            if (i > 0) {
+                ss << " ";
+            }
+            ss << argv[i];
+        }
+        return ss.str();
     }
+
+    Severity mReportableSeverity;
+}; // class Logger
+
+} // namespace yolox_cpp
+
+namespace {
+using namespace yolox_cpp;
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kVERBOSE
+//!
+//! Example usage:
+//!
+//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_VERBOSE(const Logger &logger) {
+    return LogStreamConsumer(logger.getReportableSeverity(),
+                             Severity::kVERBOSE);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kINFO
+//!
+//! Example usage:
+//!
+//!     LOG_INFO(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_INFO(const Logger &logger) {
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kWARNING
+//!
+//! Example usage:
+//!
+//!     LOG_WARN(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_WARN(const Logger &logger) {
+    return LogStreamConsumer(logger.getReportableSeverity(),
+                             Severity::kWARNING);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kERROR
+//!
+//! Example usage:
+//!
+//!     LOG_ERROR(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_ERROR(const Logger &logger) {
+    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages
+//! of severity kINTERNAL_ERROR
+//!        ("fatal" severity)
+//!
+//! Example usage:
+//!
+//!     LOG_FATAL(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_FATAL(const Logger &logger) {
+    return LogStreamConsumer(logger.getReportableSeverity(),
+                             Severity::kINTERNAL_ERROR);
+}
 } // anonymous namespace
 
 #endif // TENSORRT_LOGGING_H
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
index 15ee452..912c7e6 100644
--- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <fstream>
 #include <opencv2/opencv.hpp>
+#include <opencv2/core/cuda.hpp>
 
 #include <cuda_runtime_api.h>
 #include <NvInfer.h>
@@ -37,10 +38,10 @@ namespace yolox_cpp{
                           float nms_th=0.45, float conf_th=0.3, const std::string &model_version="0.1.1rc0",
                           int num_classes=80, bool p6=false);
             ~YoloXTensorRT();
-            std::vector<Object> inference(const cv::Mat& frame) override;
-
+            std::vector<Object> inference(const cv::Mat &frame, uchar3* d_image, float* d_output, 
+                                            cudaStream_t copy_stream_, cudaStream_t resize_stream_);
         private:
-            void doInference(const float* input, float* output);
+            void doInference(const float* input, float* output, cudaStream_t copy_stream_);
 
             int DEVICE_ = 0;
             Logger gLogger_;
diff --git a/yolox_ros_cpp/yolox_cpp/src/cuda_utils.cu b/yolox_ros_cpp/yolox_cpp/src/cuda_utils.cu
new file mode 100644
index 0000000..5bec46d
--- /dev/null
+++ b/yolox_ros_cpp/yolox_cpp/src/cuda_utils.cu
@@ -0,0 +1,58 @@
+#include <cstdio>
+#include <yolox_cpp/cuda_utils.cuh>
+
+__global__ void gpuResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                            int input_width, int input_height, float w_ratio, float h_ratio) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= r_width || y >= r_height) return;
+
+    // Look into the input image, grab all the pixels
+    // Do math to determine the color of the output pixel; for bilinear scaling
+    float sy = (y + 0.5) * h_ratio - 0.5;
+    float sx = (x + 0.5) * w_ratio - 0.5;
+
+    int y0 = (int) sy;
+    int x0 = (int) sx;
+    int y1 = y0 + 1;
+    int x1 = x0 + 1;
+
+    y0 = max(y0, 0);
+    x0 = max(x0, 0);
+    y1 = min(y1, input_height - 1);
+    x1 = min(x1, input_width - 1);
+
+    int src_idx = y0 * input_width + x0;
+    if (src_idx >= input_width * input_height) return;
+
+    // Weights for each input pixel to contribute
+    float w00 = (1 - (sy - y0)) * (1 - (sx - x0)); // For y = 0 x = 0
+    float w01 = (1 - (sy - y0)) * (sx - x0); // For y = 0 x = 1
+    float w10 = (sy - y0) * (1 - (sx - x0)); // For y = 1 x = 0 dy(1-dx)
+    float w11 = (sy - y0) * (sx - x0); // For y = 1 x = 1
+
+    int r_idx = 0 * r_height * r_width + y * r_width + x;
+    int g_idx = 1 * r_height * r_width + y * r_width + x;
+    int b_idx = 2 * r_height * r_width + y * r_width + x;
+    
+    uchar3 p00 = image_data[y0 * input_width + x0];
+    uchar3 p01 = image_data[y0 * input_width + x1];
+    uchar3 p10 = image_data[y1 * input_width + x0];
+    uchar3 p11 = image_data[y1 * input_width + x1];
+
+    // Red channel (uchar3.z)
+    output[r_idx] = w00 * p00.z + w01 * p01.z + w10 * p10.z + w11 * p11.z;
+    // Green channel (uchar3.y)
+    output[g_idx] = w00 * p00.y + w01 * p01.y + w10 * p10.y + w11 * p11.y;
+    // Blue channel (uchar3.x)
+    output[b_idx] = w00 * p00.x + w01 * p01.x + w10 * p10.x + w11 * p11.x;
+}
+
+extern "C" void launchGPUResizeAndBlobFromImage(uchar3* image_data, float* output, int r_width, int r_height, 
+                                    int input_width, int input_height, float w_ratio, float h_ratio, cudaStream_t stream) {
+    dim3 block(16, 16);
+    dim3 grid((r_width + block.x - 1) / block.x, 
+             (r_height + block.y - 1) / block.y);
+    gpuResizeAndBlobFromImage<<<grid, block, 0, stream>>>(image_data, output, r_width, r_height, input_width, input_height, w_ratio, h_ratio);
+}
\ No newline at end of file
diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_onnxruntime.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_onnxruntime.cpp
index d3f97a0..430e837 100644
--- a/yolox_ros_cpp/yolox_cpp/src/yolox_onnxruntime.cpp
+++ b/yolox_ros_cpp/yolox_cpp/src/yolox_onnxruntime.cpp
@@ -1,150 +1,147 @@
 #include "yolox_cpp/yolox_onnxruntime.hpp"
 
-namespace yolox_cpp{
-
-    YoloXONNXRuntime::YoloXONNXRuntime(const file_name_t &path_to_model,
-                                       int intra_op_num_threads, int inter_op_num_threads,
-                                       bool use_cuda, int device_id, bool use_parallel,
-                                       float nms_th, float conf_th, const std::string &model_version,
-                                       int num_classes, bool p6)
-    :AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
-     intra_op_num_threads_(intra_op_num_threads), inter_op_num_threads_(inter_op_num_threads),
-     use_cuda_(use_cuda), device_id_(device_id), use_parallel_(use_parallel)
-    {
-        try
-        {
-            Ort::SessionOptions session_options;
-
-            session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
-            if(this->use_parallel_)
-            {
-                session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
-                session_options.SetInterOpNumThreads(this->inter_op_num_threads_);
-            }
-            else
-            {
-                session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
-            }
-            session_options.SetIntraOpNumThreads(this->intra_op_num_threads_);
-
-            if(this->use_cuda_)
-            {
-                OrtCUDAProviderOptions cuda_option;
-                cuda_option.device_id = this->device_id_;
-                session_options.AppendExecutionProvider_CUDA(cuda_option);
-            }
-
-            this->session_ = Ort::Session(this->env_,
-                                          path_to_model.c_str(),
-                                          session_options);
-        }
-        catch (std::exception &e)
-        {
-            std::cerr << e.what() << std::endl;
-            throw e;
+namespace yolox_cpp {
+
+YoloXONNXRuntime::YoloXONNXRuntime(const file_name_t &path_to_model,
+                                   int intra_op_num_threads,
+                                   int inter_op_num_threads, bool use_cuda,
+                                   int device_id, bool use_parallel,
+                                   float nms_th, float conf_th,
+                                   const std::string &model_version,
+                                   int num_classes, bool p6)
+    : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
+      intra_op_num_threads_(intra_op_num_threads),
+      inter_op_num_threads_(inter_op_num_threads), use_cuda_(use_cuda),
+      device_id_(device_id), use_parallel_(use_parallel) {
+    try {
+        Ort::SessionOptions session_options;
+
+        session_options.SetGraphOptimizationLevel(
+            GraphOptimizationLevel::ORT_ENABLE_ALL);
+        if (this->use_parallel_) {
+            session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+            session_options.SetInterOpNumThreads(this->inter_op_num_threads_);
+        } else {
+            session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
         }
+        session_options.SetIntraOpNumThreads(this->intra_op_num_threads_);
 
-        Ort::AllocatorWithDefaultOptions ort_alloc;
-
-        // Allocate input memory buffer
-        std::cout << "input:" << std::endl;
-        this->input_name_ = std::string(this->session_.GetInputNameAllocated(0, ort_alloc).get());
-        // this->input_name_ = this->session_.GetInputName(0, ort_alloc);
-        std::cout << " name: " << this->input_name_ << std::endl;
-        auto input_info = this->session_.GetInputTypeInfo(0);
-        auto input_shape_info = input_info.GetTensorTypeAndShapeInfo();
-        std::vector<int64_t> input_shape = input_shape_info.GetShape();
-        ONNXTensorElementDataType  input_tensor_type = input_shape_info.GetElementType();
-        this->input_h_ = input_shape[2];
-        this->input_w_ = input_shape[3];
-
-        std::cout << " shape:" << std::endl;
-        for (size_t i = 0; i < input_shape.size(); i++)
-        {
-            std::cout << "   - " << input_shape[i] << std::endl;
-        }
-        std::cout << " tensor_type: " << input_tensor_type << std::endl;
-
-        size_t input_byte_count = sizeof(float) * input_shape_info.GetElementCount();
-        std::unique_ptr<uint8_t[]> input_buffer = std::make_unique<uint8_t[]>(input_byte_count);
-        // auto input_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-        auto input_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
-
-        this->input_tensor_ = Ort::Value::CreateTensor(input_memory_info,
-                                                       input_buffer.get(), input_byte_count,
-                                                       input_shape.data(), input_shape.size(),
-                                                       input_tensor_type);
-        this->input_buffer_.emplace_back(std::move(input_buffer));
-
-        // Allocate output memory buffer
-        std::cout << "outputs" << std::endl;
-        this->output_name_ = std::string(this->session_.GetOutputNameAllocated(0, ort_alloc).get());
-        // this->output_name_ = this->session_.GetOutputName(0, ort_alloc);
-        std::cout << " name: " << this->output_name_ << std::endl;
-
-        auto output_info = this->session_.GetOutputTypeInfo(0);
-        auto output_shape_info = output_info.GetTensorTypeAndShapeInfo();
-        auto output_shape = output_shape_info.GetShape();
-        auto output_tensor_type = output_shape_info.GetElementType();
-
-        std::cout << " shape:" << std::endl;
-        for (size_t i = 0; i < output_shape.size(); i++)
-        {
-            std::cout << "   - " << output_shape[i] << std::endl;
-        }
-        std::cout << " tensor_type: " << output_tensor_type << std::endl;
-
-        size_t output_byte_count = sizeof(float) * output_shape_info.GetElementCount();
-        std::unique_ptr<uint8_t[]> output_buffer = std::make_unique<uint8_t[]>(output_byte_count);
-        // auto output_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-        auto output_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
-
-        this->output_tensor_ = Ort::Value::CreateTensor(output_memory_info,
-                                                        output_buffer.get(), output_byte_count,
-                                                        output_shape.data(), output_shape.size(),
-                                                        output_tensor_type);
-        this->output_buffer_.emplace_back(std::move(output_buffer));
-
-        // Prepare GridAndStrides
-        if(this->p6_)
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_p6_, this->grid_strides_);
-        }
-        else
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_, this->grid_strides_);
+        if (this->use_cuda_) {
+            OrtCUDAProviderOptions cuda_option;
+            cuda_option.device_id = this->device_id_;
+            session_options.AppendExecutionProvider_CUDA(cuda_option);
         }
+
+        this->session_ =
+            Ort::Session(this->env_, path_to_model.c_str(), session_options);
+    } catch (std::exception &e) {
+        std::cerr << e.what() << std::endl;
+        throw e;
     }
 
-    std::vector<Object> YoloXONNXRuntime::inference(const cv::Mat& frame)
-    {
-        // preprocess
-        cv::Mat pr_img = static_resize(frame);
-
-        float *blob_data = (float *)(this->input_buffer_[0].get());
-        blobFromImage(pr_img, blob_data);
-
-        const char* input_names_[] = {this->input_name_.c_str()};
-        const char* output_names_[] = {this->output_name_.c_str()};
-
-        // Inference
-        Ort::RunOptions run_options;
-        this->session_.Run(run_options,
-                           input_names_,
-                           &this->input_tensor_, 1,
-                           output_names_,
-                           &this->output_tensor_, 1);
-
-        float* net_pred = (float *)this->output_buffer_[0].get();
-
-        // postprocess
-        const float scale = std::min(
-            static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
-            static_cast<float>(this->input_h_) / static_cast<float>(frame.rows)
-        );
-        std::vector<Object> objects;
-        decode_outputs(net_pred, this->grid_strides_, objects, this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
-        return objects;
+    Ort::AllocatorWithDefaultOptions ort_alloc;
+
+    // Allocate input memory buffer
+    std::cout << "input:" << std::endl;
+    this->input_name_ =
+        std::string(this->session_.GetInputNameAllocated(0, ort_alloc).get());
+    // this->input_name_ = this->session_.GetInputName(0, ort_alloc);
+    std::cout << " name: " << this->input_name_ << std::endl;
+    auto input_info = this->session_.GetInputTypeInfo(0);
+    auto input_shape_info = input_info.GetTensorTypeAndShapeInfo();
+    std::vector<int64_t> input_shape = input_shape_info.GetShape();
+    ONNXTensorElementDataType input_tensor_type =
+        input_shape_info.GetElementType();
+    this->input_h_ = input_shape[2];
+    this->input_w_ = input_shape[3];
+
+    std::cout << " shape:" << std::endl;
+    for (size_t i = 0; i < input_shape.size(); i++) {
+        std::cout << "   - " << input_shape[i] << std::endl;
+    }
+    std::cout << " tensor_type: " << input_tensor_type << std::endl;
+
+    size_t input_byte_count =
+        sizeof(float) * input_shape_info.GetElementCount();
+    std::unique_ptr<uint8_t[]> input_buffer =
+        std::make_unique<uint8_t[]>(input_byte_count);
+    // auto input_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator,
+    // OrtMemTypeDefault);
+    auto input_memory_info =
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+
+    this->input_tensor_ = Ort::Value::CreateTensor(
+        input_memory_info, input_buffer.get(), input_byte_count,
+        input_shape.data(), input_shape.size(), input_tensor_type);
+    this->input_buffer_.emplace_back(std::move(input_buffer));
+
+    // Allocate output memory buffer
+    std::cout << "outputs" << std::endl;
+    this->output_name_ =
+        std::string(this->session_.GetOutputNameAllocated(0, ort_alloc).get());
+    // this->output_name_ = this->session_.GetOutputName(0, ort_alloc);
+    std::cout << " name: " << this->output_name_ << std::endl;
+
+    auto output_info = this->session_.GetOutputTypeInfo(0);
+    auto output_shape_info = output_info.GetTensorTypeAndShapeInfo();
+    auto output_shape = output_shape_info.GetShape();
+    auto output_tensor_type = output_shape_info.GetElementType();
+
+    std::cout << " shape:" << std::endl;
+    for (size_t i = 0; i < output_shape.size(); i++) {
+        std::cout << "   - " << output_shape[i] << std::endl;
+    }
+    std::cout << " tensor_type: " << output_tensor_type << std::endl;
+
+    size_t output_byte_count =
+        sizeof(float) * output_shape_info.GetElementCount();
+    std::unique_ptr<uint8_t[]> output_buffer =
+        std::make_unique<uint8_t[]>(output_byte_count);
+    // auto output_memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator,
+    // OrtMemTypeDefault);
+    auto output_memory_info =
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+
+    this->output_tensor_ = Ort::Value::CreateTensor(
+        output_memory_info, output_buffer.get(), output_byte_count,
+        output_shape.data(), output_shape.size(), output_tensor_type);
+    this->output_buffer_.emplace_back(std::move(output_buffer));
+
+    // Prepare GridAndStrides
+    if (this->p6_) {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_p6_, this->grid_strides_);
+    } else {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_, this->grid_strides_);
     }
+}
+
+std::vector<Object> YoloXONNXRuntime::inference(const cv::Mat &frame) {
+    // preprocess
+    cv::Mat pr_img = static_resize(frame);
+
+    float *blob_data = (float *)(this->input_buffer_[0].get());
+    blobFromImage(pr_img, blob_data);
+
+    const char *input_names_[] = {this->input_name_.c_str()};
+    const char *output_names_[] = {this->output_name_.c_str()};
 
+    // Inference
+    Ort::RunOptions run_options;
+    this->session_.Run(run_options, input_names_, &this->input_tensor_, 1,
+                       output_names_, &this->output_tensor_, 1);
+
+    float *net_pred = (float *)this->output_buffer_[0].get();
+
+    // postprocess
+    const float scale = std::min(
+        static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
+        static_cast<float>(this->input_h_) / static_cast<float>(frame.rows));
+    std::vector<Object> objects;
+    decode_outputs(net_pred, this->grid_strides_, objects,
+                   this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
+    return objects;
 }
+
+} // namespace yolox_cpp
diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_openvino.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_openvino.cpp
index a6c9df1..883d182 100644
--- a/yolox_ros_cpp/yolox_cpp/src/yolox_openvino.cpp
+++ b/yolox_ros_cpp/yolox_cpp/src/yolox_openvino.cpp
@@ -1,84 +1,84 @@
 #include "yolox_cpp/yolox_openvino.hpp"
 
-namespace yolox_cpp{
-    YoloXOpenVINO::YoloXOpenVINO(const file_name_t &path_to_model, std::string device_name,
-                                 float nms_th, float conf_th, const std::string &model_version,
-                                 int num_classes, bool p6)
-    :AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
-     device_name_(device_name)
-    {
-        // Step 1. Initialize inference engine core
-        std::cout << "Initialize Inference engine core" << std::endl;
-        ov::Core ie;
+namespace yolox_cpp {
+YoloXOpenVINO::YoloXOpenVINO(const file_name_t &path_to_model,
+                             std::string device_name, float nms_th,
+                             float conf_th, const std::string &model_version,
+                             int num_classes, bool p6)
+    : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
+      device_name_(device_name) {
+    // Step 1. Initialize inference engine core
+    std::cout << "Initialize Inference engine core" << std::endl;
+    ov::Core ie;
 
-        // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and
-        // .bin files) or ONNX (.onnx file) format
-        std::cout << "Read a model in OpenVINO Intermediate Representation: " << path_to_model << std::endl;
-        const auto network = ie.read_model(path_to_model);
+    // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and
+    // .bin files) or ONNX (.onnx file) format
+    std::cout << "Read a model in OpenVINO Intermediate Representation: "
+              << path_to_model << std::endl;
+    const auto network = ie.read_model(path_to_model);
 
-        //  Step 3. Loading a model to the device
-        std::vector<std::string> available_devices = ie.get_available_devices();
-        std::cout << "======= AVAILABLE DEVICES FOR OPENVINO =======" << std::endl;
-        for (auto device : available_devices) {
-            std::cout << "- " << device << std::endl;
-        }
-        std::cout << "==============================================" << std::endl;
-        std::cout << "Loading a model to the device: " << device_name_ << std::endl;
-        auto compiled_model = ie.compile_model(network, device_name);
+    //  Step 3. Loading a model to the device
+    std::vector<std::string> available_devices = ie.get_available_devices();
+    std::cout << "======= AVAILABLE DEVICES FOR OPENVINO =======" << std::endl;
+    for (auto device : available_devices) {
+        std::cout << "- " << device << std::endl;
+    }
+    std::cout << "==============================================" << std::endl;
+    std::cout << "Loading a model to the device: " << device_name_ << std::endl;
+    auto compiled_model = ie.compile_model(network, device_name);
 
-        // Step 4. Create an infer request
-        std::cout << "Create an infer request" << std::endl;
-        this->infer_request_ = compiled_model.create_infer_request();
+    // Step 4. Create an infer request
+    std::cout << "Create an infer request" << std::endl;
+    this->infer_request_ = compiled_model.create_infer_request();
 
-        // Step 5. Configure input & output
-        std::cout << "Configuring input and output blobs" << std::endl;
-        this->input_shape_ = compiled_model.input(0).get_shape();
-        /* Mark input as resizable by setting of a resize algorithm.
-        * In this case we will be able to set an input blob of any shape to an
-        * infer request. Resize and layout conversions are executed automatically
-        * during inference */
-        this->blob_.resize(
-            this->input_shape_.at(0) * this->input_shape_.at(1) *
-            this->input_shape_.at(2) * this->input_shape_.at(3));
-        this->input_h_ = this->input_shape_.at(2);
-        this->input_w_ = this->input_shape_.at(3);
-        std::cout << "INPUT_HEIGHT: " << this->input_h_ << std::endl;
-        std::cout << "INPUT_WIDTH: " << this->input_w_ << std::endl;
+    // Step 5. Configure input & output
+    std::cout << "Configuring input and output blobs" << std::endl;
+    this->input_shape_ = compiled_model.input(0).get_shape();
+    /* Mark input as resizable by setting of a resize algorithm.
+     * In this case we will be able to set an input blob of any shape to an
+     * infer request. Resize and layout conversions are executed automatically
+     * during inference */
+    this->blob_.resize(this->input_shape_.at(0) * this->input_shape_.at(1) *
+                       this->input_shape_.at(2) * this->input_shape_.at(3));
+    this->input_h_ = this->input_shape_.at(2);
+    this->input_w_ = this->input_shape_.at(3);
+    std::cout << "INPUT_HEIGHT: " << this->input_h_ << std::endl;
+    std::cout << "INPUT_WIDTH: " << this->input_w_ << std::endl;
 
-        // Prepare GridAndStrides
-        if(this->p6_)
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_p6_, this->grid_strides_);
-        }
-        else
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_, this->grid_strides_);
-        }
+    // Prepare GridAndStrides
+    if (this->p6_) {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_p6_, this->grid_strides_);
+    } else {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_, this->grid_strides_);
     }
+}
 
-    std::vector<Object> YoloXOpenVINO::inference(const cv::Mat& frame)
-    {
-        // preprocess
-        cv::Mat pr_img = static_resize(frame);
-        // locked memory holder should be alive all time while access to its buffer happens
-        blobFromImage(pr_img, this->blob_.data());
+std::vector<Object> YoloXOpenVINO::inference(const cv::Mat &frame) {
+    // preprocess
+    cv::Mat pr_img = static_resize(frame);
+    // locked memory holder should be alive all time while access to its buffer
+    // happens
+    blobFromImage(pr_img, this->blob_.data());
 
-        // do inference
-        /* Running the request synchronously */
-        this->infer_request_.set_input_tensor(
-            ov::Tensor{ov::element::f32, this->input_shape_, reinterpret_cast<float *>(this->blob_.data())});
-        infer_request_.infer();
+    // do inference
+    /* Running the request synchronously */
+    this->infer_request_.set_input_tensor(
+        ov::Tensor{ov::element::f32, this->input_shape_,
+                   reinterpret_cast<float *>(this->blob_.data())});
+    infer_request_.infer();
 
-        const auto &output_tensor = this->infer_request_.get_output_tensor();
-        const float* net_pred = reinterpret_cast<float *>(output_tensor.data());
+    const auto &output_tensor = this->infer_request_.get_output_tensor();
+    const float *net_pred = reinterpret_cast<float *>(output_tensor.data());
 
-        const float scale = std::min(
-            static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
-            static_cast<float>(this->input_h_) / static_cast<float>(frame.rows)
-        );
+    const float scale = std::min(
+        static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
+        static_cast<float>(this->input_h_) / static_cast<float>(frame.rows));
 
-        std::vector<Object> objects;
-        decode_outputs(net_pred, this->grid_strides_, objects, this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
-        return objects;
-    }
+    std::vector<Object> objects;
+    decode_outputs(net_pred, this->grid_strides_, objects,
+                   this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
+    return objects;
 }
+} // namespace yolox_cpp
diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
index f5c73ad..8fa5dc5 100644
--- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
+++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
@@ -1,149 +1,174 @@
 #include "yolox_cpp/yolox_tensorrt.hpp"
-
-namespace yolox_cpp
-{
-
-    YoloXTensorRT::YoloXTensorRT(const file_name_t &path_to_engine, int device,
-                                 float nms_th, float conf_th, const std::string &model_version,
-                                 int num_classes, bool p6)
-        : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
-          DEVICE_(device)
-    {
-        cudaSetDevice(this->DEVICE_);
-        // create a model using the API directly and serialize it to a stream
-        std::vector<char> trtModelStream;
-        size_t size{0};
-
-        std::ifstream file(path_to_engine, std::ios::binary);
-        if (file.good())
-        {
-            file.seekg(0, file.end);
-            size = file.tellg();
-            file.seekg(0, file.beg);
-            trtModelStream.resize(size);
-            file.read(trtModelStream.data(), size);
-            file.close();
-        }
-        else
-        {
-            std::string msg = "invalid arguments path_to_engine: ";
-            msg += path_to_engine;
-            throw std::runtime_error(msg.c_str());
-        }
-
-        this->runtime_ = std::unique_ptr<IRuntime>(createInferRuntime(this->gLogger_));
-        assert(this->runtime_ != nullptr);
-        this->engine_ = std::unique_ptr<ICudaEngine>(this->runtime_->deserializeCudaEngine(trtModelStream.data(), size));
-        assert(this->engine_ != nullptr);
-        this->context_ = std::unique_ptr<IExecutionContext>(this->engine_->createExecutionContext());
-        assert(this->context_ != nullptr);
-
-        const auto input_name = this->engine_->getIOTensorName(this->inputIndex_);
-        const auto input_dims = this->engine_->getTensorShape(input_name);
-        this->input_h_ = input_dims.d[2];
-        this->input_w_ = input_dims.d[3];
-        std::cout << "INPUT_HEIGHT: " << this->input_h_ << std::endl;
-        std::cout << "INPUT_WIDTH: " << this->input_w_ << std::endl;
-
-        const auto output_name = this->engine_->getIOTensorName(this->outputIndex_);
-        auto output_dims = this->engine_->getTensorShape(output_name);
-        this->output_size_ = 1;
-        for (int j = 0; j < output_dims.nbDims; ++j)
-        {
-            this->output_size_ *= output_dims.d[j];
-        }
-
-        // allocate buffer
-        this->input_blob_.resize(this->input_h_ * this->input_w_ * 3);
-        this->output_blob_.resize(this->output_size_);
-
-        // Pointers to input and output device buffers to pass to engine.
-        // Engine requires exactly IEngine::getNbBindings() number of buffers.
-        assert(this->engine_->getNbIOTensors() == 2);
-        // In order to bind the buffers, we need to know the names of the input and output tensors.
-        // Note that indices are guaranteed to be less than IEngine::getNbBindings()
-        assert(this->engine_->getTensorDataType(input_name) == nvinfer1::DataType::kFLOAT);
-        assert(this->engine_->getTensorDataType(output_name) == nvinfer1::DataType::kFLOAT);
-
-        // Create GPU buffers on device
-        CHECK(cudaMalloc(&this->inference_buffers_[this->inputIndex_], 3 * this->input_h_ * this->input_w_ * sizeof(float)));
-        CHECK(cudaMalloc(&this->inference_buffers_[this->outputIndex_], this->output_size_ * sizeof(float)));
-
-        assert(this->context_->setInputShape(input_name, input_dims));
-        assert(this->context_->allInputDimensionsSpecified());
-
-        assert(this->context_->setInputTensorAddress(input_name, this->inference_buffers_[this->inputIndex_]));
-        assert(this->context_->setOutputTensorAddress(output_name, this->inference_buffers_[this->outputIndex_]));
-
-        // Prepare GridAndStrides
-        if (this->p6_)
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_p6_, this->grid_strides_);
-        }
-        else
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_, this->grid_strides_);
-        }
-    }
-
-    YoloXTensorRT::~YoloXTensorRT()
-    {
-        CHECK(cudaFree(inference_buffers_[this->inputIndex_]));
-        CHECK(cudaFree(inference_buffers_[this->outputIndex_]));
+#include <vector_types.h>
+
+extern "C" void launchGPUResizeAndBlobFromImage(uchar3 *image_data,
+                                                float *output, int r_width,
+                                                int r_height, int input_width,
+                                                int input_height, float w_ratio,
+                                                float h_ratio,
+                                                cudaStream_t stream);
+
+namespace yolox_cpp {
+
+YoloXTensorRT::YoloXTensorRT(const file_name_t &path_to_engine, int device,
+                             float nms_th, float conf_th,
+                             const std::string &model_version, int num_classes,
+                             bool p6)
+    : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
+      DEVICE_(device) {
+    cudaSetDevice(this->DEVICE_);
+    // create a model using the API directly and serialize it to a stream
+    std::vector<char> trtModelStream;
+    size_t size{0};
+
+    std::ifstream file(path_to_engine, std::ios::binary);
+    if (file.good()) {
+        file.seekg(0, file.end);
+        size = file.tellg();
+        file.seekg(0, file.beg);
+        trtModelStream.resize(size);
+        file.read(trtModelStream.data(), size);
+        file.close();
+    } else {
+        std::string msg = "invalid arguments path_to_engine: ";
+        msg += path_to_engine;
+        throw std::runtime_error(msg.c_str());
     }
 
-    std::vector<Object> YoloXTensorRT::inference(const cv::Mat &frame)
-    {
-        // preprocess
-        auto pr_img = static_resize(frame);
-        blobFromImage(pr_img, input_blob_.data());
-
-        // inference
-        this->doInference(input_blob_.data(), output_blob_.data());
-
-        // postprocess
-        const float scale = std::min(
-            static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
-            static_cast<float>(this->input_h_) / static_cast<float>(frame.rows)
-        );
-
-        std::vector<Object> objects;
-        decode_outputs(
-            output_blob_.data(), this->grid_strides_, objects,
-            this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
-
-        return objects;
+    this->runtime_ =
+        std::unique_ptr<IRuntime>(createInferRuntime(this->gLogger_));
+    assert(this->runtime_ != nullptr);
+    this->engine_ = std::unique_ptr<ICudaEngine>(
+        this->runtime_->deserializeCudaEngine(trtModelStream.data(), size));
+    assert(this->engine_ != nullptr);
+    this->context_ = std::unique_ptr<IExecutionContext>(
+        this->engine_->createExecutionContext());
+    assert(this->context_ != nullptr);
+
+    const auto input_name = this->engine_->getIOTensorName(this->inputIndex_);
+    const auto input_dims = this->engine_->getTensorShape(input_name);
+    this->input_h_ = input_dims.d[2];
+    this->input_w_ = input_dims.d[3];
+    std::cout << "INPUT_HEIGHT: " << this->input_h_ << std::endl;
+    std::cout << "INPUT_WIDTH: " << this->input_w_ << std::endl;
+
+    const auto output_name = this->engine_->getIOTensorName(this->outputIndex_);
+    auto output_dims = this->engine_->getTensorShape(output_name);
+    this->output_size_ = 1;
+    for (int j = 0; j < output_dims.nbDims; ++j) {
+        this->output_size_ *= output_dims.d[j];
     }
 
-    void YoloXTensorRT::doInference(const float *input, float *output)
-    {
-        // Create stream
-        cudaStream_t stream;
-        CHECK(cudaStreamCreate(&stream));
-
-        // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
-        CHECK(
-            cudaMemcpyAsync(
-                this->inference_buffers_[this->inputIndex_],
-                input,
-                3 * this->input_h_ * this->input_w_ * sizeof(float),
-                cudaMemcpyHostToDevice, stream));
-
-        bool success = context_->executeV2(this->inference_buffers_);
-        if (!success)
-            throw std::runtime_error("failed inference");
-
-        CHECK(
-            cudaMemcpyAsync(
-                output,
-                this->inference_buffers_[this->outputIndex_],
-                this->output_size_ * sizeof(float),
-                cudaMemcpyDeviceToHost, stream));
-
-        CHECK(cudaStreamSynchronize(stream));
-
-        // Release stream
-        CHECK(cudaStreamDestroy(stream));
+    // allocate buffer
+    this->input_blob_.resize(this->input_h_ * this->input_w_ * 3);
+    this->output_blob_.resize(this->output_size_);
+
+    // Pointers to input and output device buffers to pass to engine.
+    // Engine requires exactly IEngine::getNbBindings() number of buffers.
+    assert(this->engine_->getNbIOTensors() == 2);
+    // In order to bind the buffers, we need to know the names of the input and
+    // output tensors. Note that indices are guaranteed to be less than
+    // IEngine::getNbBindings()
+    assert(this->engine_->getTensorDataType(input_name) ==
+           nvinfer1::DataType::kFLOAT);
+    assert(this->engine_->getTensorDataType(output_name) ==
+           nvinfer1::DataType::kFLOAT);
+
+    // Create GPU buffers on device
+    CHECK(cudaMalloc(&this->inference_buffers_[this->inputIndex_],
+                     3 * this->input_h_ * this->input_w_ * sizeof(float)));
+    CHECK(cudaMalloc(&this->inference_buffers_[this->outputIndex_],
+                     this->output_size_ * sizeof(float)));
+
+    assert(this->context_->setInputShape(input_name, input_dims));
+    assert(this->context_->allInputDimensionsSpecified());
+
+    assert(this->context_->setInputTensorAddress(
+        input_name, this->inference_buffers_[this->inputIndex_]));
+    assert(this->context_->setOutputTensorAddress(
+        output_name, this->inference_buffers_[this->outputIndex_]));
+
+    // Prepare GridAndStrides
+    if (this->p6_) {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_p6_, this->grid_strides_);
+    } else {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_, this->grid_strides_);
     }
+}
+
+YoloXTensorRT::~YoloXTensorRT() {
+    CHECK(cudaFree(inference_buffers_[this->inputIndex_]));
+    CHECK(cudaFree(inference_buffers_[this->outputIndex_]));
+}
+
+std::vector<yolox_cpp::Object>
+YoloXTensorRT::inference(const cv::Mat &frame, uchar3 *d_image, float *d_output,
+                         cudaStream_t copy_stream_,
+                         cudaStream_t resize_stream_) {
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    cudaMemcpy2DAsync(d_image, frame.cols * sizeof(uchar3), frame.data,
+                      frame.step, frame.cols * sizeof(uchar3), frame.rows,
+                      cudaMemcpyHostToDevice, copy_stream_);
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+    auto elapsed_us =
+        std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+    printf("Copy Time: %5ld us\n", elapsed_us);
+
+    float w_r = static_cast<float>(frame.cols) / this->input_w_;
+    float h_r = static_cast<float>(frame.rows) / this->input_h_;
+
+    t0 = std::chrono::high_resolution_clock::now();
+    launchGPUResizeAndBlobFromImage(d_image, d_output, this->input_w_,
+                                    this->input_h_, frame.cols, frame.rows, w_r,
+                                    h_r, resize_stream_);
+    cudaStreamSynchronize(resize_stream_);
+    t1 = std::chrono::high_resolution_clock::now();
+    elapsed_us =
+        std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+    printf("resize time on GPU: %5ld us\n", elapsed_us);
+
+    t0 = std::chrono::high_resolution_clock::now();
+    this->doInference(d_output, output_blob_.data(), copy_stream_);
+    t1 = std::chrono::high_resolution_clock::now();
+    elapsed_us =
+        std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+    printf("Inference time on GPU: %5ld us\n", elapsed_us);
+
+    const float scale =
+        std::min(static_cast<float>(this->input_w_) / frame.cols,
+                 static_cast<float>(this->input_h_) / frame.rows);
+
+    std::vector<Object> objects;
+    t0 = std::chrono::high_resolution_clock::now();
+    decode_outputs(output_blob_.data(), this->grid_strides_, objects,
+                   this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
+    t1 = std::chrono::high_resolution_clock::now();
+    elapsed_us =
+        std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+    printf("Post time: %5ld us\n", elapsed_us);
+
+    return objects;
+}
+
+void YoloXTensorRT::doInference(const float *input, float *output,
+                                cudaStream_t copy_stream_) {
+    // DMA input batch data to device, infer on the batch asynchronously, and
+    // DMA output back to host Change to Device to Device copy
+    this->inference_buffers_[this->inputIndex_] = const_cast<float *>(input);
+
+    bool success = context_->executeV2(this->inference_buffers_);
+    if (!success)
+        throw std::runtime_error("failed inference");
+
+    CHECK(cudaMemcpyAsync(output, this->inference_buffers_[this->outputIndex_],
+                          this->output_size_ * sizeof(float),
+                          cudaMemcpyDeviceToHost, copy_stream_));
+
+    CHECK(cudaStreamSynchronize(copy_stream_));
+}
 
 } // namespace yolox_cpp
diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tflite.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tflite.cpp
index 5182150..c03f4ed 100644
--- a/yolox_ros_cpp/yolox_cpp/src/yolox_tflite.cpp
+++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tflite.cpp
@@ -1,182 +1,162 @@
 #include "yolox_cpp/yolox_tflite.hpp"
 
-namespace yolox_cpp
-{
-
-    YoloXTflite::YoloXTflite(const file_name_t &path_to_model, int num_threads,
-                             float nms_th, float conf_th, const std::string &model_version,
-                             int num_classes, bool p6, bool is_nchw)
-        : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6), is_nchw_(is_nchw)
-    {
-        TfLiteStatus status;
-        this->model_ = tflite::FlatBufferModel::BuildFromFile(path_to_model.c_str());
-        TFLITE_MINIMAL_CHECK(model_);
-
-        this->resolver_ = std::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
-        this->interpreter_ = std::make_unique<tflite::Interpreter>();
+namespace yolox_cpp {
+
+YoloXTflite::YoloXTflite(const file_name_t &path_to_model, int num_threads,
+                         float nms_th, float conf_th,
+                         const std::string &model_version, int num_classes,
+                         bool p6, bool is_nchw)
+    : AbcYoloX(nms_th, conf_th, model_version, num_classes, p6),
+      is_nchw_(is_nchw) {
+    TfLiteStatus status;
+    this->model_ =
+        tflite::FlatBufferModel::BuildFromFile(path_to_model.c_str());
+    TFLITE_MINIMAL_CHECK(model_);
+
+    this->resolver_ =
+        std::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+    this->interpreter_ = std::make_unique<tflite::Interpreter>();
+
+    tflite::InterpreterBuilder builder(*model_, *this->resolver_);
+    builder(&this->interpreter_);
+    TFLITE_MINIMAL_CHECK(this->interpreter_ != nullptr);
+
+    TFLITE_MINIMAL_CHECK(this->interpreter_->AllocateTensors() == kTfLiteOk);
+    // tflite::PrintInterpreterState(this->interpreter_.get());
+
+    status = this->interpreter_->SetNumThreads(num_threads);
+    if (status != TfLiteStatus::kTfLiteOk) {
+        std::string msg = "Failed to SetNumThreads.";
+        throw std::runtime_error(msg.c_str());
+    }
 
-        tflite::InterpreterBuilder builder(*model_, *this->resolver_);
-        builder(&this->interpreter_);
-        TFLITE_MINIMAL_CHECK(this->interpreter_ != nullptr);
+    // XNNPACK Delegate
+    auto xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
+    xnnpack_options.num_threads = num_threads;
+    this->delegate_ = TfLiteXNNPackDelegateCreate(&xnnpack_options);
+    status = this->interpreter_->ModifyGraphWithDelegate(this->delegate_);
+    if (status != TfLiteStatus::kTfLiteOk) {
+        std::string msg = "Failed to ModifyGraphWithDelegate.";
+        throw std::runtime_error(msg.c_str());
+    }
 
-        TFLITE_MINIMAL_CHECK(this->interpreter_->AllocateTensors() == kTfLiteOk);
-        // tflite::PrintInterpreterState(this->interpreter_.get());
+    // // GPU Delegate
+    // auto gpu_options = TfLiteGpuDelegateOptionsV2Default();
+    // gpu_options.inference_preference =
+    // TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
+    // gpu_options.inference_priority1 =
+    // TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY; this->delegate_ =
+    // TfLiteGpuDelegateV2Create(&gpu_options); status =
+    // this->interpreter_->ModifyGraphWithDelegate(this->delegate_); if (status
+    // != TfLiteStatus::kTfLiteOk)
+    // {
+    //     std::cerr << "Failed to ModifyGraphWithDelegate." << std::endl;
+    //     exit(1);
+    // }
+
+    // // NNAPI Delegate
+    // tflite::StatefulNnApiDelegate::Options nnapi_options;
+    // nnapi_options.execution_preference =
+    // tflite::StatefulNnApiDelegate::Options::kSustainedSpeed;
+    // nnapi_options.allow_fp16 = true;
+    // nnapi_options.disallow_nnapi_cpu = true;
+    // this->delegate_ = new tflite::StatefulNnApiDelegate(nnapi_options);
+    // status = this->interpreter_->ModifyGraphWithDelegate(this->delegate_);
+    // if (status != TfLiteStatus::kTfLiteOk)
+    // {
+    //     std::cerr << "Failed to ModifyGraphWithDelegate." << std::endl;
+    //     exit(1);
+    // }
+
+    if (this->interpreter_->AllocateTensors() != TfLiteStatus::kTfLiteOk) {
+        std::string msg = "Failed to allocate tensors.";
+        throw std::runtime_error(msg.c_str());
+    }
 
-        status = this->interpreter_->SetNumThreads(num_threads);
-        if (status != TfLiteStatus::kTfLiteOk)
-        {
-            std::string msg = "Failed to SetNumThreads.";
-            throw std::runtime_error(msg.c_str());
+    {
+        TfLiteTensor *tensor = this->interpreter_->input_tensor(0);
+        std::cout << "input:" << std::endl;
+        std::cout << " name: " << tensor->name << std::endl;
+        if (this->is_nchw_ == true) {
+            // NCHW
+            this->input_h_ = tensor->dims->data[2];
+            this->input_w_ = tensor->dims->data[3];
+        } else {
+            // NHWC
+            this->input_h_ = tensor->dims->data[1];
+            this->input_w_ = tensor->dims->data[2];
         }
 
-        // XNNPACK Delegate
-        auto xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
-        xnnpack_options.num_threads = num_threads;
-        this->delegate_ = TfLiteXNNPackDelegateCreate(&xnnpack_options);
-        status = this->interpreter_->ModifyGraphWithDelegate(this->delegate_);
-        if (status != TfLiteStatus::kTfLiteOk)
-        {
-            std::string msg = "Failed to ModifyGraphWithDelegate.";
-            throw std::runtime_error(msg.c_str());
+        std::cout << " shape:" << std::endl;
+        if (tensor->type == kTfLiteUInt8) {
+            this->input_size_ = sizeof(uint8_t);
+        } else {
+            this->input_size_ = sizeof(float);
         }
-
-        // // GPU Delegate
-        // auto gpu_options = TfLiteGpuDelegateOptionsV2Default();
-        // gpu_options.inference_preference = TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
-        // gpu_options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
-        // this->delegate_ = TfLiteGpuDelegateV2Create(&gpu_options);
-        // status = this->interpreter_->ModifyGraphWithDelegate(this->delegate_);
-        // if (status != TfLiteStatus::kTfLiteOk)
-        // {
-        //     std::cerr << "Failed to ModifyGraphWithDelegate." << std::endl;
-        //     exit(1);
-        // }
-
-        // // NNAPI Delegate
-        // tflite::StatefulNnApiDelegate::Options nnapi_options;
-        // nnapi_options.execution_preference = tflite::StatefulNnApiDelegate::Options::kSustainedSpeed;
-        // nnapi_options.allow_fp16 = true;
-        // nnapi_options.disallow_nnapi_cpu = true;
-        // this->delegate_ = new tflite::StatefulNnApiDelegate(nnapi_options);
-        // status = this->interpreter_->ModifyGraphWithDelegate(this->delegate_);
-        // if (status != TfLiteStatus::kTfLiteOk)
-        // {
-        //     std::cerr << "Failed to ModifyGraphWithDelegate." << std::endl;
-        //     exit(1);
-        // }
-
-        if (this->interpreter_->AllocateTensors() != TfLiteStatus::kTfLiteOk)
-        {
-            std::string msg = "Failed to allocate tensors.";
-            throw std::runtime_error(msg.c_str());
+        for (int i = 0; i < tensor->dims->size; i++) {
+            this->input_size_ *= tensor->dims->data[i];
+            std::cout << "   - " << tensor->dims->data[i] << std::endl;
         }
+        std::cout << " input_h: " << this->input_h_ << std::endl;
+        std::cout << " input_w: " << this->input_w_ << std::endl;
+        std::cout << " tensor_type: " << tensor->type << std::endl;
+    }
 
-        {
-            TfLiteTensor *tensor = this->interpreter_->input_tensor(0);
-            std::cout << "input:" << std::endl;
-            std::cout << " name: " << tensor->name << std::endl;
-            if (this->is_nchw_ == true)
-            {
-                // NCHW
-                this->input_h_ = tensor->dims->data[2];
-                this->input_w_ = tensor->dims->data[3];
-            }
-            else
-            {
-                // NHWC
-                this->input_h_ = tensor->dims->data[1];
-                this->input_w_ = tensor->dims->data[2];
-            }
-
-            std::cout << " shape:" << std::endl;
-            if (tensor->type == kTfLiteUInt8)
-            {
-                this->input_size_ = sizeof(uint8_t);
-            }
-            else
-            {
-                this->input_size_ = sizeof(float);
-            }
-            for (int i = 0; i < tensor->dims->size; i++)
-            {
-                this->input_size_ *= tensor->dims->data[i];
-                std::cout << "   - " << tensor->dims->data[i] << std::endl;
-            }
-            std::cout << " input_h: " << this->input_h_ << std::endl;
-            std::cout << " input_w: " << this->input_w_ << std::endl;
-            std::cout << " tensor_type: " << tensor->type << std::endl;
+    {
+        TfLiteTensor *tensor = this->interpreter_->output_tensor(0);
+        std::cout << "output:" << std::endl;
+        std::cout << " name: " << tensor->name << std::endl;
+        std::cout << " shape:" << std::endl;
+        if (tensor->type == kTfLiteUInt8) {
+            this->output_size_ = sizeof(uint8_t);
+        } else {
+            this->output_size_ = sizeof(float);
         }
-
-        {
-            TfLiteTensor *tensor = this->interpreter_->output_tensor(0);
-            std::cout << "output:" << std::endl;
-            std::cout << " name: " << tensor->name << std::endl;
-            std::cout << " shape:" << std::endl;
-            if (tensor->type == kTfLiteUInt8)
-            {
-                this->output_size_ = sizeof(uint8_t);
-            }
-            else
-            {
-                this->output_size_ = sizeof(float);
-            }
-            for (int i = 0; i < tensor->dims->size; i++)
-            {
-                this->output_size_ *= tensor->dims->data[i];
-                std::cout << "   - " << tensor->dims->data[i] << std::endl;
-            }
-            std::cout << " tensor_type: " << tensor->type << std::endl;
+        for (int i = 0; i < tensor->dims->size; i++) {
+            this->output_size_ *= tensor->dims->data[i];
+            std::cout << "   - " << tensor->dims->data[i] << std::endl;
         }
+        std::cout << " tensor_type: " << tensor->type << std::endl;
+    }
 
-        // Prepare GridAndStrides
-        if(this->p6_)
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_p6_, this->grid_strides_);
-        }
-        else
-        {
-            generate_grids_and_stride(this->input_w_, this->input_h_, this->strides_, this->grid_strides_);
-        }
+    // Prepare GridAndStrides
+    if (this->p6_) {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_p6_, this->grid_strides_);
+    } else {
+        generate_grids_and_stride(this->input_w_, this->input_h_,
+                                  this->strides_, this->grid_strides_);
     }
-    YoloXTflite::~YoloXTflite()
-    {
-        TfLiteXNNPackDelegateDelete(this->delegate_);
+}
+YoloXTflite::~YoloXTflite() { TfLiteXNNPackDelegateDelete(this->delegate_); }
+std::vector<Object> YoloXTflite::inference(const cv::Mat &frame) {
+    // preprocess
+    cv::Mat pr_img = static_resize(frame);
+
+    float *input_blob = this->interpreter_->typed_input_tensor<float>(0);
+    if (this->is_nchw_ == true) {
+        blobFromImage(pr_img, input_blob);
+    } else {
+        blobFromImage_nhwc(pr_img, input_blob);
     }
-    std::vector<Object> YoloXTflite::inference(const cv::Mat &frame)
-    {
-        // preprocess
-        cv::Mat pr_img = static_resize(frame);
 
-        float *input_blob = this->interpreter_->typed_input_tensor<float>(0);
-        if (this->is_nchw_ == true)
-        {
-            blobFromImage(pr_img, input_blob);
-        }
-        else
-        {
-            blobFromImage_nhwc(pr_img, input_blob);
-        }
-
-        // inference
-        TfLiteStatus ret = this->interpreter_->Invoke();
-        if (ret != TfLiteStatus::kTfLiteOk)
-        {
-            std::cerr << "Failed to invoke." << std::endl;
-            return std::vector<Object>();
-        }
-
-        // postprocess
-        const float scale = std::min(
-            static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
-            static_cast<float>(this->input_h_) / static_cast<float>(frame.rows)
-        );
-        std::vector<Object> objects;
-        decode_outputs(
-            this->interpreter_->typed_output_tensor<float>(0),
-            this->grid_strides_, objects,
-            this->bbox_conf_thresh_, scale, frame.cols, frame.rows);
-
-        return objects;
+    // inference
+    TfLiteStatus ret = this->interpreter_->Invoke();
+    if (ret != TfLiteStatus::kTfLiteOk) {
+        std::cerr << "Failed to invoke." << std::endl;
+        return std::vector<Object>();
     }
 
+    // postprocess
+    const float scale = std::min(
+        static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
+        static_cast<float>(this->input_h_) / static_cast<float>(frame.rows));
+    std::vector<Object> objects;
+    decode_outputs(this->interpreter_->typed_output_tensor<float>(0),
+                   this->grid_strides_, objects, this->bbox_conf_thresh_, scale,
+                   frame.cols, frame.rows);
+
+    return objects;
+}
+
 } // namespace yolox_cpp
diff --git a/yolox_ros_cpp/yolox_param/CMakeLists.txt b/yolox_ros_cpp/yolox_param/CMakeLists.txt
index 59aea79..3689f2a 100644
--- a/yolox_ros_cpp/yolox_param/CMakeLists.txt
+++ b/yolox_ros_cpp/yolox_param/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.14)
 project(yolox_param)
 
 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
diff --git a/yolox_ros_cpp/yolox_ros_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_ros_cpp/CMakeLists.txt
index d087feb..c51d79c 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/CMakeLists.txt
+++ b/yolox_ros_cpp/yolox_ros_cpp/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.14)
 project(yolox_ros_cpp)
 
 if(NOT CMAKE_CXX_STANDARD)
@@ -7,11 +7,19 @@ if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_EXTENSIONS OFF)
 endif()
 
+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math -DNDEBUG")
+set(CMAKE_CUDA_ARCHITECTURES 87)
+project(yolox_ros_cpp CXX CUDA)
+
 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   add_compile_options(-Wall -Wextra -Wpedantic)
 endif()
 
 find_package(ament_cmake_auto REQUIRED)
+find_package(tr_messages REQUIRED)
+
+
 ament_auto_find_build_dependencies()
 
 if(NOT yolox_cpp_FOUND)
@@ -37,3 +45,5 @@ ament_auto_package(
   INSTALL_TO_SHARE
   launch
 )
+
+ament_target_dependencies(yolox_ros_cpp tr_messages)
diff --git a/yolox_ros_cpp/yolox_ros_cpp/include/yolox_ros_cpp/yolox_ros_cpp.hpp b/yolox_ros_cpp/yolox_ros_cpp/include/yolox_ros_cpp/yolox_ros_cpp.hpp
index 4253237..80329a9 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/include/yolox_ros_cpp/yolox_ros_cpp.hpp
+++ b/yolox_ros_cpp/yolox_ros_cpp/include/yolox_ros_cpp/yolox_ros_cpp.hpp
@@ -22,6 +22,8 @@
 #include "yolox_cpp/utils.hpp"
 #include "yolox_param/yolox_param.hpp"
 
+#include "tr_messages/msg/det_with_img.hpp"
+
 namespace yolox_ros_cpp{
     class YoloXNode : public rclcpp::Node
     {
@@ -30,22 +32,31 @@ namespace yolox_ros_cpp{
     private:
         void onInit();
         void colorImageCallback(const sensor_msgs::msg::Image::ConstSharedPtr &);
-
         static bboxes_ex_msgs::msg::BoundingBoxes objects_to_bboxes(const cv::Mat &, const std::vector<yolox_cpp::Object> &, const std_msgs::msg::Header &);
         static vision_msgs::msg::Detection2DArray objects_to_detection2d(const std::vector<yolox_cpp::Object> &, const std_msgs::msg::Header &);
-
     protected:
         std::shared_ptr<yolox_parameters::ParamListener> param_listener_;
         yolox_parameters::Params params_;
     private:
+        bool init;
+
+        cudaStream_t copy_stream_;
+        cudaStream_t resize_stream_;
+
+        uchar3* d_image_; 
+        float* d_output_;
+
         std::unique_ptr<yolox_cpp::AbcYoloX> yolox_;
         std::vector<std::string> class_names_;
 
+        rclcpp::CallbackGroup::SharedPtr callback_group_reentrant_;
+        std::shared_ptr<rclcpp::SubscriptionOptions> sub_options_;
+
         rclcpp::TimerBase::SharedPtr init_timer_;
         image_transport::Subscriber sub_image_;
 
         rclcpp::Publisher<bboxes_ex_msgs::msg::BoundingBoxes>::SharedPtr pub_bboxes_;
-        rclcpp::Publisher<vision_msgs::msg::Detection2DArray>::SharedPtr pub_detection2d_;
+        rclcpp::Publisher<tr_messages::msg::DetWithImg>::SharedPtr pub_detection2d_;
         image_transport::Publisher pub_image_;
     };
 }
diff --git a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_onnxruntime.launch.py b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_onnxruntime.launch.py
index 61d0802..6013229 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_onnxruntime.launch.py
+++ b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_onnxruntime.launch.py
@@ -19,150 +19,152 @@
 from launch_ros.actions import ComposableNodeContainer
 from launch_ros.descriptions import ComposableNode
 
+
 def generate_launch_description():
     launch_args = [
         DeclareLaunchArgument(
-            'video_device',
-            default_value='/dev/video0',
-            description='input video source'
-        ),
-        DeclareLaunchArgument(
-            'model_path',
-            default_value='./src/YOLOX-ROS/weights/onnx/yolox_tiny.onnx',
-            description='yolox model path.'
-        ),
-        DeclareLaunchArgument(
-            'p6',
-            default_value='false',
-            description='with p6.'
-        ),
-        DeclareLaunchArgument(
-            'class_labels_path',
-            default_value='',
-            description='if use custom model, set class name labels. '
+            "video_device",
+            default_value="/dev/video0",
+            description="input video source",
         ),
         DeclareLaunchArgument(
-            'num_classes',
-            default_value='80',
-            description='num classes.'
+            "model_path",
+            default_value="./src/YOLOX-ROS/weights/onnx/yolox_tiny.onnx",
+            description="yolox model path.",
         ),
+        DeclareLaunchArgument("p6", default_value="false", description="with p6."),
         DeclareLaunchArgument(
-            'model_version',
-            default_value='0.1.1rc0',
-            description='yolox model version.'
+            "class_labels_path",
+            default_value="",
+            description="if use custom model, set class name labels. ",
         ),
         DeclareLaunchArgument(
-            'onnxruntime_use_cuda',
-            default_value='true',
-            description='onnxruntime use cuda.'
+            "num_classes", default_value="80", description="num classes."
         ),
         DeclareLaunchArgument(
-            'onnxruntime_device_id',
-            default_value='0',
-            description='onnxruntime gpu device id.'
+            "model_version",
+            default_value="0.1.1rc0",
+            description="yolox model version.",
         ),
         DeclareLaunchArgument(
-            'onnxruntime_use_parallel',
-            default_value='false',
-            description='if use_parallel is true, you can set inter_op_num_threads.'
+            "onnxruntime_use_cuda",
+            default_value="true",
+            description="onnxruntime use cuda.",
         ),
         DeclareLaunchArgument(
-            'onnxruntime_inter_op_num_threads',
-            default_value='1'
+            "onnxruntime_device_id",
+            default_value="0",
+            description="onnxruntime gpu device id.",
         ),
         DeclareLaunchArgument(
-            'onnxruntime_intra_op_num_threads',
-            default_value='1',
-            description='ontrols the number of threads to use to run the model.'
+            "onnxruntime_use_parallel",
+            default_value="false",
+            description="if use_parallel is true, you can set inter_op_num_threads.",
         ),
+        DeclareLaunchArgument("onnxruntime_inter_op_num_threads", default_value="1"),
         DeclareLaunchArgument(
-            'conf',
-            default_value='0.30',
-            description='yolox confidence threshold.'
+            "onnxruntime_intra_op_num_threads",
+            default_value="1",
+            description="ontrols the number of threads to use to run the model.",
         ),
         DeclareLaunchArgument(
-            'nms',
-            default_value='0.45',
-            description='yolox nms threshold'
+            "conf", default_value="0.30", description="yolox confidence threshold."
         ),
         DeclareLaunchArgument(
-            'imshow_isshow',
-            default_value='true',
-            description=''
+            "nms", default_value="0.45", description="yolox nms threshold"
         ),
+        DeclareLaunchArgument("imshow_isshow", default_value="true", description=""),
         DeclareLaunchArgument(
-            'src_image_topic_name',
-            default_value='/image_raw',
-            description='topic name for source image'
+            "src_image_topic_name",
+            default_value="/image_raw",
+            description="topic name for source image",
         ),
         DeclareLaunchArgument(
-            'publish_image_topic_name',
-            default_value='/yolox/image_raw',
-            description='topic name for publishing image with bounding box drawn'
+            "publish_image_topic_name",
+            default_value="/yolox/image_raw",
+            description="topic name for publishing image with bounding box drawn",
         ),
         DeclareLaunchArgument(
-            'publish_boundingbox_topic_name',
-            default_value='/yolox/bounding_boxes',
-            description='topic name for publishing bounding box message.'
+            "publish_boundingbox_topic_name",
+            default_value="/yolox/bounding_boxes",
+            description="topic name for publishing bounding box message.",
         ),
         DeclareLaunchArgument(
-            'use_bbox_ex_msgs',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
+            "use_bbox_ex_msgs",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
         DeclareLaunchArgument(
-            'publish_resized_image',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
+            "publish_resized_image",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
     ]
     container = ComposableNodeContainer(
-        name='yolox_container',
-        namespace='',
-        package='rclcpp_components',
-        executable='component_container',
+        name="yolox_container",
+        namespace="",
+        package="rclcpp_components",
+        executable="component_container",
         composable_node_descriptions=[
             ComposableNode(
-                package='usb_cam',
-                plugin='usb_cam::UsbCamNode',
-                name='usb_cam_node',
-                parameters=[{
-                    'video_device': LaunchConfiguration('video_device'),
-                    'brightness': 100
-                }]),
+                package="usb_cam",
+                plugin="usb_cam::UsbCamNode",
+                name="usb_cam_node",
+                parameters=[
+                    {
+                        "video_device": LaunchConfiguration("video_device"),
+                        "brightness": 100,
+                    }
+                ],
+            ),
             ComposableNode(
-                package='yolox_ros_cpp',
-                plugin='yolox_ros_cpp::YoloXNode',
-                name='yolox_ros_cpp',
-                parameters=[{
-                    'model_path': LaunchConfiguration('model_path'),
-                    'p6': LaunchConfiguration('p6'),
-                    'class_labels_path': LaunchConfiguration('class_labels_path'),
-                    'num_classes': LaunchConfiguration('num_classes'),
-                    'model_type': 'onnxruntime',
-                    'model_version': LaunchConfiguration('model_version'),
-                    'onnxruntime_use_cuda': LaunchConfiguration('onnxruntime_use_cuda'),
-                    'onnxruntime_device_id': LaunchConfiguration('onnxruntime_device_id'),
-                    'onnxruntime_use_parallel': LaunchConfiguration('onnxruntime_use_parallel'),
-                    'onnxruntime_inter_op_num_threads': LaunchConfiguration('onnxruntime_inter_op_num_threads'),
-                    'onnxruntime_intra_op_num_threads': LaunchConfiguration('onnxruntime_intra_op_num_threads'),
-                    'conf': LaunchConfiguration('conf'),
-                    'nms': LaunchConfiguration('nms'),
-                    'imshow_isshow': LaunchConfiguration('imshow_isshow'),
-                    'src_image_topic_name': LaunchConfiguration('src_image_topic_name'),
-                    'publish_image_topic_name': LaunchConfiguration('publish_image_topic_name'),
-                    'publish_boundingbox_topic_name': LaunchConfiguration('publish_boundingbox_topic_name'),
-                    'publish_resized_image': LaunchConfiguration('publish_resized_image'),
-                    'use_bbox_ex_msgs': LaunchConfiguration('use_bbox_ex_msgs'),
-                }],
-                ),
+                package="yolox_ros_cpp",
+                plugin="yolox_ros_cpp::YoloXNode",
+                name="yolox_ros_cpp",
+                parameters=[
+                    {
+                        "model_path": LaunchConfiguration("model_path"),
+                        "p6": LaunchConfiguration("p6"),
+                        "class_labels_path": LaunchConfiguration("class_labels_path"),
+                        "num_classes": LaunchConfiguration("num_classes"),
+                        "model_type": "onnxruntime",
+                        "model_version": LaunchConfiguration("model_version"),
+                        "onnxruntime_use_cuda": LaunchConfiguration(
+                            "onnxruntime_use_cuda"
+                        ),
+                        "onnxruntime_device_id": LaunchConfiguration(
+                            "onnxruntime_device_id"
+                        ),
+                        "onnxruntime_use_parallel": LaunchConfiguration(
+                            "onnxruntime_use_parallel"
+                        ),
+                        "onnxruntime_inter_op_num_threads": LaunchConfiguration(
+                            "onnxruntime_inter_op_num_threads"
+                        ),
+                        "onnxruntime_intra_op_num_threads": LaunchConfiguration(
+                            "onnxruntime_intra_op_num_threads"
+                        ),
+                        "conf": LaunchConfiguration("conf"),
+                        "nms": LaunchConfiguration("nms"),
+                        "imshow_isshow": LaunchConfiguration("imshow_isshow"),
+                        "src_image_topic_name": LaunchConfiguration(
+                            "src_image_topic_name"
+                        ),
+                        "publish_image_topic_name": LaunchConfiguration(
+                            "publish_image_topic_name"
+                        ),
+                        "publish_boundingbox_topic_name": LaunchConfiguration(
+                            "publish_boundingbox_topic_name"
+                        ),
+                        "publish_resized_image": LaunchConfiguration(
+                            "publish_resized_image"
+                        ),
+                        "use_bbox_ex_msgs": LaunchConfiguration("use_bbox_ex_msgs"),
+                    }
+                ],
+            ),
         ],
-        output='screen',
+        output="screen",
     )
 
-    return launch.LaunchDescription(
-        launch_args +
-        [
-            container
-        ]
-    )
+    return launch.LaunchDescription(launch_args + [container])
diff --git a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_openvino.launch.py b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_openvino.launch.py
index 8b18767..09d44c0 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_openvino.launch.py
+++ b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_openvino.launch.py
@@ -4,127 +4,122 @@
 from launch_ros.actions import ComposableNodeContainer
 from launch_ros.descriptions import ComposableNode
 
+
 def generate_launch_description():
     launch_args = [
         DeclareLaunchArgument(
-            'video_device',
-            default_value='/dev/video0',
-            description='input video source'
-        ),
-        DeclareLaunchArgument(
-            'model_path',
-            default_value='./src/YOLOX-ROS/weights/onnx/yolox_tiny.onnx',
-            description='yolox model path.'
+            "video_device",
+            default_value="/dev/video0",
+            description="input video source",
         ),
         DeclareLaunchArgument(
-            'p6',
-            default_value='false',
-            description='with p6.'
+            "model_path",
+            default_value="./src/YOLOX-ROS/weights/onnx/yolox_tiny.onnx",
+            description="yolox model path.",
         ),
+        DeclareLaunchArgument("p6", default_value="false", description="with p6."),
         DeclareLaunchArgument(
-            'class_labels_path',
-            default_value='',
-            description='if use custom model, set class name labels. '
+            "class_labels_path",
+            default_value="",
+            description="if use custom model, set class name labels. ",
         ),
         DeclareLaunchArgument(
-            'num_classes',
-            default_value='80',
-            description='num classes.'
+            "num_classes", default_value="80", description="num classes."
         ),
         DeclareLaunchArgument(
-            'model_version',
-            default_value='0.1.1rc0',
-            description='yolox model version.'
+            "model_version",
+            default_value="0.1.1rc0",
+            description="yolox model version.",
         ),
         DeclareLaunchArgument(
-            'openvino_device',
-            default_value='CPU',
-            description='model device. CPU, GPU, MYRIAD, etc...'
+            "openvino_device",
+            default_value="CPU",
+            description="model device. CPU, GPU, MYRIAD, etc...",
         ),
         DeclareLaunchArgument(
-            'conf',
-            default_value='0.30',
-            description='yolox confidence threshold.'
+            "conf", default_value="0.30", description="yolox confidence threshold."
         ),
         DeclareLaunchArgument(
-            'nms',
-            default_value='0.45',
-            description='yolox nms threshold'
+            "nms", default_value="0.45", description="yolox nms threshold"
         ),
+        DeclareLaunchArgument("imshow_isshow", default_value="true", description=""),
         DeclareLaunchArgument(
-            'imshow_isshow',
-            default_value='true',
-            description=''
+            "src_image_topic_name",
+            default_value="/image_raw",
+            description="topic name for source image",
         ),
         DeclareLaunchArgument(
-            'src_image_topic_name',
-            default_value='/image_raw',
-            description='topic name for source image'
+            "publish_image_topic_name",
+            default_value="/yolox/image_raw",
+            description="topic name for publishing image with bounding box drawn",
         ),
         DeclareLaunchArgument(
-            'publish_image_topic_name',
-            default_value='/yolox/image_raw',
-            description='topic name for publishing image with bounding box drawn'
+            "publish_boundingbox_topic_name",
+            default_value="/yolox/bounding_boxes",
+            description="topic name for publishing bounding box message.",
         ),
         DeclareLaunchArgument(
-            'publish_boundingbox_topic_name',
-            default_value='/yolox/bounding_boxes',
-            description='topic name for publishing bounding box message.'
+            "use_bbox_ex_msgs",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
         DeclareLaunchArgument(
-            'use_bbox_ex_msgs',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
-        ),
-        DeclareLaunchArgument(
-            'publish_resized_image',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
+            "publish_resized_image",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
     ]
     container = ComposableNodeContainer(
-        name='yolox_container',
-        namespace='',
-        package='rclcpp_components',
-        executable='component_container',
+        name="yolox_container",
+        namespace="",
+        package="rclcpp_components",
+        executable="component_container",
         composable_node_descriptions=[
             ComposableNode(
-                package='usb_cam',
-                plugin='usb_cam::UsbCamNode',
-                name='usb_cam_node',
-                parameters=[{
-                    'video_device': LaunchConfiguration('video_device'),
-                    'brightness': 100
-                }]),
+                package="usb_cam",
+                plugin="usb_cam::UsbCamNode",
+                name="usb_cam_node",
+                parameters=[
+                    {
+                        "video_device": LaunchConfiguration("video_device"),
+                        "brightness": 100,
+                    }
+                ],
+            ),
             ComposableNode(
-                package='yolox_ros_cpp',
-                plugin='yolox_ros_cpp::YoloXNode',
-                name='yolox_ros_cpp',
-                parameters=[{
-                    'model_path': LaunchConfiguration('model_path'),
-                    'p6': LaunchConfiguration('p6'),
-                    'class_labels_path': LaunchConfiguration('class_labels_path'),
-                    'num_classes': LaunchConfiguration('num_classes'),
-                    'model_type': 'openvino',
-                    'model_version': LaunchConfiguration('model_version'),
-                    'openvino_device': LaunchConfiguration('openvino_device'),
-                    'conf': LaunchConfiguration('conf'),
-                    'nms': LaunchConfiguration('nms'),
-                    'imshow_isshow': LaunchConfiguration('imshow_isshow'),
-                    'src_image_topic_name': LaunchConfiguration('src_image_topic_name'),
-                    'publish_image_topic_name': LaunchConfiguration('publish_image_topic_name'),
-                    'publish_boundingbox_topic_name': LaunchConfiguration('publish_boundingbox_topic_name'),
-                    'publish_resized_image': LaunchConfiguration('publish_resized_image'),
-                    'use_bbox_ex_msgs': LaunchConfiguration('use_bbox_ex_msgs'),
-                }],
-                ),
+                package="yolox_ros_cpp",
+                plugin="yolox_ros_cpp::YoloXNode",
+                name="yolox_ros_cpp",
+                parameters=[
+                    {
+                        "model_path": LaunchConfiguration("model_path"),
+                        "p6": LaunchConfiguration("p6"),
+                        "class_labels_path": LaunchConfiguration("class_labels_path"),
+                        "num_classes": LaunchConfiguration("num_classes"),
+                        "model_type": "openvino",
+                        "model_version": LaunchConfiguration("model_version"),
+                        "openvino_device": LaunchConfiguration("openvino_device"),
+                        "conf": LaunchConfiguration("conf"),
+                        "nms": LaunchConfiguration("nms"),
+                        "imshow_isshow": LaunchConfiguration("imshow_isshow"),
+                        "src_image_topic_name": LaunchConfiguration(
+                            "src_image_topic_name"
+                        ),
+                        "publish_image_topic_name": LaunchConfiguration(
+                            "publish_image_topic_name"
+                        ),
+                        "publish_boundingbox_topic_name": LaunchConfiguration(
+                            "publish_boundingbox_topic_name"
+                        ),
+                        "publish_resized_image": LaunchConfiguration(
+                            "publish_resized_image"
+                        ),
+                        "use_bbox_ex_msgs": LaunchConfiguration("use_bbox_ex_msgs"),
+                    }
+                ],
+            ),
         ],
-        output='screen',
+        output="screen",
     )
 
-    return launch.LaunchDescription(
-        launch_args +
-        [
-            container
-        ]
-    )
+    return launch.LaunchDescription(launch_args + [container])
diff --git a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tensorrt.launch.py b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tensorrt.launch.py
index fd6eea8..7c63210 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tensorrt.launch.py
+++ b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tensorrt.launch.py
@@ -2,127 +2,115 @@
 from launch.actions import DeclareLaunchArgument
 from launch.substitutions import LaunchConfiguration
 from launch_ros.actions import ComposableNodeContainer
+from launch.actions import SetEnvironmentVariable
 from launch_ros.descriptions import ComposableNode
 
-def generate_launch_description():
 
+def generate_launch_description():
     launch_args = [
         DeclareLaunchArgument(
-            'video_device',
-            default_value='/dev/video0',
-            description='input video source'
-        ),
-        DeclareLaunchArgument(
-            'model_path',
-            default_value='./src/YOLOX-ROS/weights/tensorrt/yolox_tiny.trt',
-            description='yolox model path.'
+            "video_device",
+            default_value="/dev/video0",
+            description="input video source",
         ),
         DeclareLaunchArgument(
-            'p6',
-            default_value='false',
-            description='with p6.'
+            "model_path",
+            default_value="/home/triton/Documents/cv-dev/TR-Autonomy-2024-2025/src/TR-YOLOX-ROS/weights/tensorrt/modelInt8_clocks.engine",
+            description="yolox model path.",
         ),
+        DeclareLaunchArgument("p6", default_value="false", description="with p6."),
         DeclareLaunchArgument(
-            'class_labels_path',
-            default_value='',
-            description='if use custom model, set class name labels. '
+            "class_labels_path",
+            default_value="",
+            description="if use custom model, set class name labels. ",
         ),
         DeclareLaunchArgument(
-            'num_classes',
-            default_value='80',
-            description='num classes.'
+            "num_classes", default_value="80", description="num classes."
         ),
         DeclareLaunchArgument(
-            'tensorrt_device',
-            default_value='0',
-            description='GPU index. Set in string type. ex 0'
+            "tensorrt_device",
+            default_value="0",
+            description="GPU index. Set in string type. ex 0",
         ),
         DeclareLaunchArgument(
-            'model_version',
-            default_value='0.1.1rc0',
-            description='yolox model version.'
+            "model_version",
+            default_value="0.1.1rc0",
+            description="yolox model version.",
         ),
         DeclareLaunchArgument(
-            'conf',
-            default_value='0.30',
-            description='yolox confidence threshold.'
+            "conf", default_value="0.7", description="yolox confidence threshold."
         ),
         DeclareLaunchArgument(
-            'nms',
-            default_value='0.45',
-            description='yolox nms threshold'
+            "nms", default_value="0.45", description="yolox nms threshold"
         ),
+        DeclareLaunchArgument("imshow_isshow", default_value="false", description=""),
         DeclareLaunchArgument(
-            'imshow_isshow',
-            default_value='true',
-            description=''
+            "src_image_topic_name",
+            default_value="/camera/image",
+            description="topic name for source image",
         ),
         DeclareLaunchArgument(
-            'src_image_topic_name',
-            default_value='/image_raw',
-            description='topic name for source image'
+            "publish_image_topic_name",
+            default_value="/yolox/image_raw",
+            description="topic name for publishing image with bounding box drawn",
         ),
         DeclareLaunchArgument(
-            'publish_image_topic_name',
-            default_value='/yolox/image_raw',
-            description='topic name for publishing image with bounding box drawn'
+            "publish_boundingbox_topic_name",
+            default_value="/detections",
+            description="topic name for publishing bounding box message.",
         ),
         DeclareLaunchArgument(
-            'publish_boundingbox_topic_name',
-            default_value='/yolox/bounding_boxes',
-            description='topic name for publishing bounding box message.'
+            "use_bbox_ex_msgs",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
         DeclareLaunchArgument(
-            'use_bbox_ex_msgs',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
-        ),
-        DeclareLaunchArgument(
-            'publish_resized_image',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
+            "publish_resized_image",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
     ]
+    (SetEnvironmentVariable(name="RCLCPP_EXECUTOR_THREAD_COUNT", value="2"),)
     container = ComposableNodeContainer(
-        name='yolox_container',
-        namespace='',
-        package='rclcpp_components',
-        executable='component_container',
+        name="yolox_container",
+        namespace="",
+        package="rclcpp_components",
+        executable="component_container",
         composable_node_descriptions=[
             ComposableNode(
-                package='usb_cam',
-                plugin='usb_cam::UsbCamNode',
-                name='usb_cam_node',
-                parameters=[{
-                    'video_device': LaunchConfiguration('video_device'),
-                    'brightness': 100
-                }]),
-            ComposableNode(
-                package='yolox_ros_cpp',
-                plugin='yolox_ros_cpp::YoloXNode',
-                name='yolox_ros_cpp',
-                parameters=[{
-                    'model_path': LaunchConfiguration('model_path'),
-                    'p6': LaunchConfiguration('p6'),
-                    'class_labels_path': LaunchConfiguration('class_labels_path'),
-                    'num_classes': LaunchConfiguration('num_classes'),
-                    'model_type': 'tensorrt',
-                    'model_version': LaunchConfiguration('model_version'),
-                    'tensorrt_device': LaunchConfiguration('tensorrt_device'),
-                    'conf': LaunchConfiguration('conf'),
-                    'nms': LaunchConfiguration('nms'),
-                    'imshow_isshow': LaunchConfiguration('imshow_isshow'),
-                    'src_image_topic_name': LaunchConfiguration('src_image_topic_name'),
-                    'publish_image_topic_name': LaunchConfiguration('publish_image_topic_name'),
-                    'publish_boundingbox_topic_name': LaunchConfiguration('publish_boundingbox_topic_name'),
-                    'publish_resized_image': LaunchConfiguration('publish_resized_image'),
-                    'use_bbox_ex_msgs': LaunchConfiguration('use_bbox_ex_msgs'),
-                }],
+                package="yolox_ros_cpp",
+                plugin="yolox_ros_cpp::YoloXNode",
+                name="yolox_ros_cpp",
+                parameters=[
+                    {
+                        "model_path": LaunchConfiguration("model_path"),
+                        "p6": LaunchConfiguration("p6"),
+                        "class_labels_path": LaunchConfiguration("class_labels_path"),
+                        "num_classes": LaunchConfiguration("num_classes"),
+                        "model_type": "tensorrt",
+                        "model_version": LaunchConfiguration("model_version"),
+                        "tensorrt_device": LaunchConfiguration("tensorrt_device"),
+                        "conf": LaunchConfiguration("conf"),
+                        "nms": LaunchConfiguration("nms"),
+                        "imshow_isshow": LaunchConfiguration("imshow_isshow"),
+                        "src_image_topic_name": LaunchConfiguration(
+                            "src_image_topic_name"
+                        ),
+                        "publish_image_topic_name": LaunchConfiguration(
+                            "publish_image_topic_name"
+                        ),
+                        "publish_boundingbox_topic_name": LaunchConfiguration(
+                            "publish_boundingbox_topic_name"
+                        ),
+                        "publish_resized_image": LaunchConfiguration(
+                            "publish_resized_image"
+                        ),
+                        "use_bbox_ex_msgs": LaunchConfiguration("use_bbox_ex_msgs"),
+                    }
+                ],
             ),
         ],
-        output='screen',
+        output="screen",
     )
 
-    return launch.LaunchDescription(
-        launch_args + [container]
-    )
+    return launch.LaunchDescription(launch_args + [container])
diff --git a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tflite.launch.py b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tflite.launch.py
index 023f972..0fdfdf7 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tflite.launch.py
+++ b/yolox_ros_cpp/yolox_ros_cpp/launch/yolox_tflite.launch.py
@@ -8,127 +8,122 @@
 def generate_launch_description():
     launch_args = [
         DeclareLaunchArgument(
-            'video_device',
-            default_value='/dev/video0',
-            description='input video source'
+            "video_device",
+            default_value="/dev/video0",
+            description="input video source",
         ),
         DeclareLaunchArgument(
-            'model_path',
-            default_value='./src/YOLOX-ROS/weights/tflite/model.tflite',
-            description='yolox model path.'
+            "model_path",
+            default_value="./src/YOLOX-ROS/weights/tflite/model.tflite",
+            description="yolox model path.",
         ),
+        DeclareLaunchArgument("p6", default_value="false", description="with p6."),
         DeclareLaunchArgument(
-            'p6',
-            default_value='false',
-            description='with p6.'
+            "is_nchw",
+            default_value="true",
+            description="model input shape is NCHW or NWHC.",
         ),
         DeclareLaunchArgument(
-            'is_nchw',
-            default_value='true',
-            description='model input shape is NCHW or NWHC.'
+            "class_labels_path",
+            default_value="",
+            description="if use custom model, set class name labels. ",
         ),
         DeclareLaunchArgument(
-            'class_labels_path',
-            default_value='',
-            description='if use custom model, set class name labels. '
+            "num_classes", default_value="1", description="num classes."
         ),
         DeclareLaunchArgument(
-            'num_classes',
-            default_value='1',
-            description='num classes.'
+            "model_version",
+            default_value="0.1.1rc0",
+            description="yolox model version.",
         ),
         DeclareLaunchArgument(
-            'model_version',
-            default_value='0.1.1rc0',
-            description='yolox model version.'
+            "tflite_num_threads", default_value="1", description="tflite num_threads."
         ),
         DeclareLaunchArgument(
-            'tflite_num_threads',
-            default_value='1',
-            description='tflite num_threads.'
+            "conf", default_value="0.30", description="yolox confidence threshold."
         ),
         DeclareLaunchArgument(
-            'conf',
-            default_value='0.30',
-            description='yolox confidence threshold.'
+            "nms", default_value="0.45", description="yolox nms threshold"
         ),
+        DeclareLaunchArgument("imshow_isshow", default_value="true", description=""),
         DeclareLaunchArgument(
-            'nms',
-            default_value='0.45',
-            description='yolox nms threshold'
+            "src_image_topic_name",
+            default_value="/image_raw",
+            description="topic name for source image",
         ),
         DeclareLaunchArgument(
-            'imshow_isshow',
-            default_value='true',
-            description=''
+            "publish_image_topic_name",
+            default_value="/yolox/image_raw",
+            description="topic name for publishing image with bounding box drawn",
         ),
         DeclareLaunchArgument(
-            'src_image_topic_name',
-            default_value='/image_raw',
-            description='topic name for source image'
+            "publish_boundingbox_topic_name",
+            default_value="/yolox/bounding_boxes",
+            description="topic name for publishing bounding box message.",
         ),
         DeclareLaunchArgument(
-            'publish_image_topic_name',
-            default_value='/yolox/image_raw',
-            description='topic name for publishing image with bounding box drawn'
+            "use_bbox_ex_msgs",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
         DeclareLaunchArgument(
-            'publish_boundingbox_topic_name',
-            default_value='/yolox/bounding_boxes',
-            description='topic name for publishing bounding box message.'
-        ),
-        DeclareLaunchArgument(
-            'use_bbox_ex_msgs',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
-        ),
-        DeclareLaunchArgument(
-            'publish_resized_image',
-            default_value='false',
-            description='use BoundingBoxArray message type.'
+            "publish_resized_image",
+            default_value="false",
+            description="use BoundingBoxArray message type.",
         ),
     ]
     container = ComposableNodeContainer(
-        name='yolox_container',
-        namespace='',
-        package='rclcpp_components',
-        executable='component_container',
+        name="yolox_container",
+        namespace="",
+        package="rclcpp_components",
+        executable="component_container",
         composable_node_descriptions=[
             ComposableNode(
-                package='usb_cam',
-                plugin='usb_cam::UsbCamNode',
-                name='usb_cam_node',
-                parameters=[{
-                    'video_device': LaunchConfiguration('video_device'),
-                    'brightness': 100
-                }]),
+                package="usb_cam",
+                plugin="usb_cam::UsbCamNode",
+                name="usb_cam_node",
+                parameters=[
+                    {
+                        "video_device": LaunchConfiguration("video_device"),
+                        "brightness": 100,
+                    }
+                ],
+            ),
             ComposableNode(
-                package='yolox_ros_cpp',
-                plugin='yolox_ros_cpp::YoloXNode',
-                name='yolox_ros_cpp',
-                parameters=[{
-                    'model_path': LaunchConfiguration('model_path'),
-                    'p6': LaunchConfiguration('p6'),
-                    'class_labels_path': LaunchConfiguration('class_labels_path'),
-                    'num_classes': LaunchConfiguration('num_classes'),
-                    'is_nchw': LaunchConfiguration('is_nchw'),
-                    'model_type': 'tflite',
-                    'model_version': LaunchConfiguration('model_version'),
-                    'tflite_num_threads': LaunchConfiguration('tflite_num_threads'),
-                    'conf': LaunchConfiguration('conf'),
-                    'nms': LaunchConfiguration('nms'),
-                    'imshow_isshow': LaunchConfiguration('imshow_isshow'),
-                    'src_image_topic_name': LaunchConfiguration('src_image_topic_name'),
-                    'publish_image_topic_name': LaunchConfiguration('publish_image_topic_name'),
-                    'publish_boundingbox_topic_name': LaunchConfiguration('publish_boundingbox_topic_name'),
-                    'publish_resized_image': LaunchConfiguration('publish_resized_image'),
-                    'use_bbox_ex_msgs': LaunchConfiguration('use_bbox_ex_msgs'),
-                }],
+                package="yolox_ros_cpp",
+                plugin="yolox_ros_cpp::YoloXNode",
+                name="yolox_ros_cpp",
+                parameters=[
+                    {
+                        "model_path": LaunchConfiguration("model_path"),
+                        "p6": LaunchConfiguration("p6"),
+                        "class_labels_path": LaunchConfiguration("class_labels_path"),
+                        "num_classes": LaunchConfiguration("num_classes"),
+                        "is_nchw": LaunchConfiguration("is_nchw"),
+                        "model_type": "tflite",
+                        "model_version": LaunchConfiguration("model_version"),
+                        "tflite_num_threads": LaunchConfiguration("tflite_num_threads"),
+                        "conf": LaunchConfiguration("conf"),
+                        "nms": LaunchConfiguration("nms"),
+                        "imshow_isshow": LaunchConfiguration("imshow_isshow"),
+                        "src_image_topic_name": LaunchConfiguration(
+                            "src_image_topic_name"
+                        ),
+                        "publish_image_topic_name": LaunchConfiguration(
+                            "publish_image_topic_name"
+                        ),
+                        "publish_boundingbox_topic_name": LaunchConfiguration(
+                            "publish_boundingbox_topic_name"
+                        ),
+                        "publish_resized_image": LaunchConfiguration(
+                            "publish_resized_image"
+                        ),
+                        "use_bbox_ex_msgs": LaunchConfiguration("use_bbox_ex_msgs"),
+                    }
+                ],
             ),
         ],
-        output='screen',
+        output="screen",
     )
 
-    return launch.LaunchDescription(
-        launch_args + [container]
-    )
+    return launch.LaunchDescription(launch_args + [container])
diff --git a/yolox_ros_cpp/yolox_ros_cpp/package.xml b/yolox_ros_cpp/yolox_ros_cpp/package.xml
index 1f6e181..ec7ebe5 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/package.xml
+++ b/yolox_ros_cpp/yolox_ros_cpp/package.xml
@@ -1,33 +1,34 @@
 <?xml version="1.0"?>
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
-  <name>yolox_ros_cpp</name>
-  <version>0.4.1</version>
-  <description>The yolox_ros_cpp package</description>
-  <maintainer email="53618876+fateshelled@users.noreply.github.com">fateshelled</maintainer>
-  <license>Apache-2.0 License</license>
-  <author email="53618876+fateshelled@users.noreply.github.com">fateshelled</author>
+    <name>yolox_ros_cpp</name>
+    <version>0.4.1</version>
+    <description>The yolox_ros_cpp package</description>
+    <maintainer email="53618876+fateshelled@users.noreply.github.com">fateshelled</maintainer>
+    <license>Apache-2.0 License</license>
+    <author email="53618876+fateshelled@users.noreply.github.com">fateshelled</author>
 
-  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+    <buildtool_depend>ament_cmake_auto</buildtool_depend>
 
-  <depend>bboxes_ex_msgs</depend>
-  <depend>cv_bridge</depend>
-  <depend>image_transport</depend>
-  <depend>libopencv-dev</depend>
-  <depend>rclcpp</depend>
-  <depend>rclcpp_components</depend>
-  <depend>sensor_msgs</depend>
-  <depend>std_msgs</depend>
-  <depend>vision_msgs</depend>
-  <depend>yolox_cpp</depend>
-  <depend>yolox_param</depend>
+    <depend>bboxes_ex_msgs</depend>
+    <depend>cv_bridge</depend>
+    <depend>image_transport</depend>
+    <depend>libopencv-dev</depend>
+    <depend>rclcpp</depend>
+    <depend>rclcpp_components</depend>
+    <depend>sensor_msgs</depend>
+    <depend>std_msgs</depend>
+    <depend>vision_msgs</depend>
+    <depend>yolox_cpp</depend>
+    <depend>yolox_param</depend>
+    <depend>tr_messages</depend>
 
-  <exec_depend>usb_cam</exec_depend>
+    <exec_depend>usb_cam</exec_depend>
 
-  <test_depend>ament_lint_auto</test_depend>
-  <test_depend>ament_lint_common</test_depend>
+    <test_depend>ament_lint_auto</test_depend>
+    <test_depend>ament_lint_common</test_depend>
 
-  <export>
-    <build_type>ament_cmake</build_type>
-  </export>
-</package>
+    <export>
+        <build_type>ament_cmake</build_type>
+    </export>
+</package>
\ No newline at end of file
diff --git a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
index a5f5bcf..685fa5a 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
+++ b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
@@ -1,207 +1,232 @@
 #include "yolox_ros_cpp/yolox_ros_cpp.hpp"
 
-namespace yolox_ros_cpp
-{
-    YoloXNode::YoloXNode(const rclcpp::NodeOptions &options)
-        : Node("yolox_ros_cpp", options)
-    {
-        using namespace std::chrono_literals; // NOLINT
-        this->init_timer_ = this->create_wall_timer(
-            0s, std::bind(&YoloXNode::onInit, this));
-    }
-
-    void YoloXNode::onInit()
-    {
-        this->init_timer_->cancel();
-        this->param_listener_ = std::make_shared<yolox_parameters::ParamListener>(
-            this->get_node_parameters_interface());
-
-        this->params_ = this->param_listener_->get_params();
+namespace yolox_ros_cpp {
+YoloXNode::YoloXNode(const rclcpp::NodeOptions &options)
+    : Node("yolox_ros_cpp", options) {
+    using namespace std::chrono_literals; // NOLINT
+    this->init_timer_ =
+        this->create_wall_timer(0s, std::bind(&YoloXNode::onInit, this));
+}
 
-        if (this->params_.imshow_isshow)
-        {
-            cv::namedWindow("yolox", cv::WINDOW_AUTOSIZE);
-        }
+void YoloXNode::onInit() {
+    this->init = true;
+    this->d_image_ = nullptr;
+    this->d_output_ = nullptr;
+    cudaStreamCreate(&this->copy_stream_);
+    cudaStreamCreate(&this->resize_stream_);
+    rclcpp::on_shutdown([this]() {
+        cudaStreamDestroy(this->copy_stream_);
+        cudaStreamDestroy(this->resize_stream_);
+        cudaFree(this->d_image_);
+        cudaFree(this->d_output_);
+        this->copy_stream_ = nullptr;
+        this->resize_stream_ = nullptr;
+    });
+    this->init_timer_->cancel();
+    this->param_listener_ = std::make_shared<yolox_parameters::ParamListener>(
+        this->get_node_parameters_interface());
+    this->params_ = this->param_listener_->get_params();
+    this->callback_group_reentrant_ =
+        this->create_callback_group(rclcpp::CallbackGroupType::Reentrant);
+    this->sub_options_ = std::make_shared<rclcpp::SubscriptionOptions>();
+    this->sub_options_->callback_group = callback_group_reentrant_;
+    this->sub_options_->use_intra_process_comm =
+        rclcpp::IntraProcessSetting::Enable;
+    if (this->params_.imshow_isshow) {
+        cv::namedWindow("yolox", cv::WINDOW_AUTOSIZE);
+    }
 
-        if (this->params_.class_labels_path != "")
-        {
-            RCLCPP_INFO(this->get_logger(), "read class labels from '%s'", this->params_.class_labels_path.c_str());
-            this->class_names_ = yolox_cpp::utils::read_class_labels_file(this->params_.class_labels_path);
-        }
-        else
-        {
-            this->class_names_ = yolox_cpp::COCO_CLASSES;
-        }
+    if (this->params_.class_labels_path != "") {
+        RCLCPP_INFO(this->get_logger(), "read class labels from '%s'",
+                    this->params_.class_labels_path.c_str());
+        this->class_names_ = yolox_cpp::utils::read_class_labels_file(
+            this->params_.class_labels_path);
+    } else {
+        this->class_names_ = yolox_cpp::COCO_CLASSES;
+    }
 
-        if (this->params_.model_type == "tensorrt")
-        {
+    if (this->params_.model_type == "tensorrt") {
 #ifdef ENABLE_TENSORRT
-            RCLCPP_INFO(this->get_logger(), "Model Type is TensorRT");
-            this->yolox_ = std::make_unique<yolox_cpp::YoloXTensorRT>(
-                this->params_.model_path, this->params_.tensorrt_device,
-                this->params_.nms, this->params_.conf, this->params_.model_version,
-                this->params_.num_classes, this->params_.p6);
+        RCLCPP_INFO(this->get_logger(), "Model Type is TensorRT");
+        this->yolox_ = std::make_unique<yolox_cpp::YoloXTensorRT>(
+            this->params_.model_path, this->params_.tensorrt_device,
+            this->params_.nms, this->params_.conf, this->params_.model_version,
+            this->params_.num_classes, this->params_.p6);
 #else
-            RCLCPP_ERROR(this->get_logger(), "yolox_cpp is not built with TensorRT");
-            rclcpp::shutdown();
+        RCLCPP_ERROR(this->get_logger(),
+                     "yolox_cpp is not built with TensorRT");
+        rclcpp::shutdown();
 #endif
-        }
-        else if (this->params_.model_type == "openvino")
-        {
+    } else if (this->params_.model_type == "openvino") {
 #ifdef ENABLE_OPENVINO
-            RCLCPP_INFO(this->get_logger(), "Model Type is OpenVINO");
-            this->yolox_ = std::make_unique<yolox_cpp::YoloXOpenVINO>(
-                this->params_.model_path, this->params_.openvino_device,
-                this->params_.nms, this->params_.conf, this->params_.model_version,
-                this->params_.num_classes, this->params_.p6);
+        RCLCPP_INFO(this->get_logger(), "Model Type is OpenVINO");
+        this->yolox_ = std::make_unique<yolox_cpp::YoloXOpenVINO>(
+            this->params_.model_path, this->params_.openvino_device,
+            this->params_.nms, this->params_.conf, this->params_.model_version,
+            this->params_.num_classes, this->params_.p6);
 #else
-            RCLCPP_ERROR(this->get_logger(), "yolox_cpp is not built with OpenVINO");
-            rclcpp::shutdown();
+        RCLCPP_ERROR(this->get_logger(),
+                     "yolox_cpp is not built with OpenVINO");
+        rclcpp::shutdown();
 #endif
-        }
-        else if (this->params_.model_type == "onnxruntime")
-        {
+    } else if (this->params_.model_type == "onnxruntime") {
 #ifdef ENABLE_ONNXRUNTIME
-            RCLCPP_INFO(this->get_logger(), "Model Type is ONNXRuntime");
-            this->yolox_ = std::make_unique<yolox_cpp::YoloXONNXRuntime>(
-                this->params_.model_path,
-                this->params_.onnxruntime_intra_op_num_threads,
-                this->params_.onnxruntime_inter_op_num_threads,
-                this->params_.onnxruntime_use_cuda, this->params_.onnxruntime_device_id,
-                this->params_.onnxruntime_use_parallel,
-                this->params_.nms, this->params_.conf, this->params_.model_version,
-                this->params_.num_classes, this->params_.p6);
+        RCLCPP_INFO(this->get_logger(), "Model Type is ONNXRuntime");
+        this->yolox_ = std::make_unique<yolox_cpp::YoloXONNXRuntime>(
+            this->params_.model_path,
+            this->params_.onnxruntime_intra_op_num_threads,
+            this->params_.onnxruntime_inter_op_num_threads,
+            this->params_.onnxruntime_use_cuda,
+            this->params_.onnxruntime_device_id,
+            this->params_.onnxruntime_use_parallel, this->params_.nms,
+            this->params_.conf, this->params_.model_version,
+            this->params_.num_classes, this->params_.p6);
 #else
-            RCLCPP_ERROR(this->get_logger(), "yolox_cpp is not built with ONNXRuntime");
-            rclcpp::shutdown();
+        RCLCPP_ERROR(this->get_logger(),
+                     "yolox_cpp is not built with ONNXRuntime");
+        rclcpp::shutdown();
 #endif
-        }
-        else if (this->params_.model_type == "tflite")
-        {
+    } else if (this->params_.model_type == "tflite") {
 #ifdef ENABLE_TFLITE
-            RCLCPP_INFO(this->get_logger(), "Model Type is tflite");
-            this->yolox_ = std::make_unique<yolox_cpp::YoloXTflite>(
-                this->params_.model_path, this->params_.tflite_num_threads,
-                this->params_.nms, this->params_.conf, this->params_.model_version,
-                this->params_.num_classes, this->params_.p6, this->params_.is_nchw);
+        RCLCPP_INFO(this->get_logger(), "Model Type is tflite");
+        this->yolox_ = std::make_unique<yolox_cpp::YoloXTflite>(
+            this->params_.model_path, this->params_.tflite_num_threads,
+            this->params_.nms, this->params_.conf, this->params_.model_version,
+            this->params_.num_classes, this->params_.p6, this->params_.is_nchw);
 #else
-            RCLCPP_ERROR(this->get_logger(), "yolox_cpp is not built with tflite");
-            rclcpp::shutdown();
+        RCLCPP_ERROR(this->get_logger(), "yolox_cpp is not built with tflite");
+        rclcpp::shutdown();
 #endif
-        }
-        RCLCPP_INFO(this->get_logger(), "model loaded");
-
-        this->sub_image_ = image_transport::create_subscription(
-            this, this->params_.src_image_topic_name,
-            std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1),
-            "raw");
-
-        if (this->params_.use_bbox_ex_msgs) {
-            this->pub_bboxes_ = this->create_publisher<bboxes_ex_msgs::msg::BoundingBoxes>(
-                this->params_.publish_boundingbox_topic_name,
-                10);
-        } else {
-            this->pub_detection2d_ = this->create_publisher<vision_msgs::msg::Detection2DArray>(
-                this->params_.publish_boundingbox_topic_name,
-                10);
-        }
-
-        if (this->params_.publish_resized_image) {
-            this->pub_image_ = image_transport::create_publisher(this, this->params_.publish_image_topic_name);
-        }
+    }
+    RCLCPP_INFO(this->get_logger(), "model loaded");
+
+    rclcpp::QoS qos_profile(5); // Queue depth of 5 for multithreading
+    this->sub_image_ = image_transport::create_subscription(
+        this, this->params_.src_image_topic_name,
+        std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1),
+        "raw", qos_profile.get_rmw_qos_profile(), *this->sub_options_);
+
+    if (this->params_.use_bbox_ex_msgs) {
+        this->pub_bboxes_ =
+            this->create_publisher<bboxes_ex_msgs::msg::BoundingBoxes>(
+                this->params_.publish_boundingbox_topic_name, 10);
+    } else {
+        this->pub_detection2d_ =
+            this->create_publisher<tr_messages::msg::DetWithImg>(
+                this->params_.publish_boundingbox_topic_name, 10);
     }
 
-    void YoloXNode::colorImageCallback(const sensor_msgs::msg::Image::ConstSharedPtr &ptr)
-    {
-        auto img = cv_bridge::toCvCopy(ptr, "bgr8");
-        cv::Mat frame = img->image;
-
-        auto now = std::chrono::system_clock::now();
-        auto objects = this->yolox_->inference(frame);
-        auto end = std::chrono::system_clock::now();
-
-        auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - now);
-        RCLCPP_INFO(this->get_logger(), "Inference time: %5ld us", elapsed.count());
+    if (this->params_.publish_resized_image) {
+        this->pub_image_ = image_transport::create_publisher(
+            this, this->params_.publish_image_topic_name);
+    }
+}
 
-        yolox_cpp::utils::draw_objects(frame, objects, this->class_names_);
-        if (this->params_.imshow_isshow)
-        {
-            cv::imshow("yolox", frame);
-            auto key = cv::waitKey(1);
-            if (key == 27)
-            {
-                rclcpp::shutdown();
-            }
+void YoloXNode::colorImageCallback(
+    const sensor_msgs::msg::Image::ConstSharedPtr &ptr) {
+    auto now_noninf = std::chrono::system_clock::now();
+    auto img = cv_bridge::toCvShare(ptr, "bgr8");
+
+    auto now = std::chrono::system_clock::now();
+    // Initialization
+    if (this->init) {
+        size_t image_bytes = sizeof(uchar3) * img->image.cols * img->image.rows;
+        cudaMallocManaged(reinterpret_cast<void **>(&this->d_image_),
+                          image_bytes);
+        size_t output_bytes = sizeof(float) * 416 * 416 * 3;
+        cudaMallocManaged(reinterpret_cast<void **>(&this->d_output_),
+                          output_bytes);
+        this->init = false;
+    }
+    auto objects = this->yolox_->inference(
+        img->image, this->d_image_, this->d_output_, this->copy_stream_,
+        this->resize_stream_); // Use img->image
+    auto end = std::chrono::system_clock::now();
+    auto elapsed =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - now);
+
+    if (this->params_.imshow_isshow) {
+        yolox_cpp::utils::draw_objects(img->image, objects, this->class_names_);
+        cv::imshow("yolox", img->image);
+        if (cv::waitKey(1) == 27) {
+            rclcpp::shutdown();
         }
+    }
 
-        if (this->params_.use_bbox_ex_msgs)
-        {
-            if (this->pub_bboxes_ == nullptr)
-            {
-                RCLCPP_ERROR(this->get_logger(), "pub_bboxes_ is nullptr");
-                return;
-            }
-            auto boxes = objects_to_bboxes(frame, objects, img->header);
-            this->pub_bboxes_->publish(boxes);
+    if (this->params_.use_bbox_ex_msgs) {
+        if (!this->pub_bboxes_) {
+            RCLCPP_ERROR(this->get_logger(), "pub_bboxes_ is nullptr");
+            return;
         }
-        else
-        {
-            if (this->pub_detection2d_ == nullptr)
-            {
-                RCLCPP_ERROR(this->get_logger(), "pub_detection2d_ is nullptr");
-                return;
-            }
-            vision_msgs::msg::Detection2DArray detections = objects_to_detection2d(objects, img->header);
-            this->pub_detection2d_->publish(detections);
+        auto boxes =
+            objects_to_bboxes(img->image, objects, img->header); // Fix here
+        this->pub_bboxes_->publish(boxes);
+    } else {
+        if (!this->pub_detection2d_) {
+            RCLCPP_ERROR(this->get_logger(), "pub_detection2d_ is nullptr");
+            return;
         }
-
-        if (this->params_.publish_resized_image) {
-            sensor_msgs::msg::Image::SharedPtr pub_img =
-                cv_bridge::CvImage(img->header, "bgr8", frame).toImageMsg();
-            this->pub_image_.publish(pub_img);
+        vision_msgs::msg::Detection2DArray detections =
+            objects_to_detection2d(objects, img->header);
+        if (!detections.detections.empty()) {
+            tr_messages::msg::DetWithImg detwithimg;
+            // detwithimg.image = *ptr;  // Copy unavoidable due to const shared
+            // ptr
+            detwithimg.detection_info.detections = detections.detections;
+            this->pub_detection2d_->publish(detwithimg);
+        } else {
+            RCLCPP_INFO(this->get_logger(), "no detections so not publishing");
         }
     }
+    auto end_noninf = std::chrono::system_clock::now();
+    auto elapsed_noninf = std::chrono::duration_cast<std::chrono::microseconds>(
+        end_noninf - now_noninf);
+    RCLCPP_INFO(this->get_logger(),
+                "Inference time: %5ld us Non Inference time: %5ld",
+                elapsed.count(), elapsed_noninf.count() - elapsed.count());
+}
 
-    bboxes_ex_msgs::msg::BoundingBoxes YoloXNode::objects_to_bboxes(
-        const cv::Mat &frame, const std::vector<yolox_cpp::Object> &objects, const std_msgs::msg::Header &header)
-    {
-        bboxes_ex_msgs::msg::BoundingBoxes boxes;
-        boxes.header = header;
-        for (const auto &obj : objects)
-        {
-            bboxes_ex_msgs::msg::BoundingBox box;
-            box.probability = obj.prob;;
-            box.class_id = std::to_string(obj.label);
-            box.xmin = obj.rect.x;
-            box.ymin = obj.rect.y;
-            box.xmax = (obj.rect.x + obj.rect.width);
-            box.ymax = (obj.rect.y + obj.rect.height);
-            box.img_width = frame.cols;
-            box.img_height = frame.rows;
-            boxes.bounding_boxes.emplace_back(box);
-        }
-        return boxes;
+bboxes_ex_msgs::msg::BoundingBoxes
+YoloXNode::objects_to_bboxes(const cv::Mat &frame,
+                             const std::vector<yolox_cpp::Object> &objects,
+                             const std_msgs::msg::Header &header) {
+    bboxes_ex_msgs::msg::BoundingBoxes boxes;
+    boxes.header = header;
+    for (const auto &obj : objects) {
+        bboxes_ex_msgs::msg::BoundingBox box;
+        box.probability = obj.prob;
+        ;
+        box.class_id = std::to_string(obj.label);
+        box.xmin = obj.rect.x;
+        box.ymin = obj.rect.y;
+        box.xmax = (obj.rect.x + obj.rect.width);
+        box.ymax = (obj.rect.y + obj.rect.height);
+        box.img_width = frame.cols;
+        box.img_height = frame.rows;
+        boxes.bounding_boxes.emplace_back(box);
     }
+    return boxes;
+}
 
-    vision_msgs::msg::Detection2DArray YoloXNode::objects_to_detection2d(const std::vector<yolox_cpp::Object> &objects, const std_msgs::msg::Header &header)
-    {
-        vision_msgs::msg::Detection2DArray detection2d;
-        detection2d.header = header;
-        for (const auto &obj : objects)
-        {
-            vision_msgs::msg::Detection2D det;
-            det.bbox.center.position.x = obj.rect.x + obj.rect.width / 2;
-            det.bbox.center.position.y = obj.rect.y + obj.rect.height / 2;
-            det.bbox.size_x = obj.rect.width;
-            det.bbox.size_y = obj.rect.height;
-
-            det.results.resize(1);
-            det.results[0].hypothesis.class_id = std::to_string(obj.label);
-            det.results[0].hypothesis.score = obj.prob;
-            detection2d.detections.emplace_back(det);
-        }
-        return detection2d;
+vision_msgs::msg::Detection2DArray
+YoloXNode::objects_to_detection2d(const std::vector<yolox_cpp::Object> &objects,
+                                  const std_msgs::msg::Header &header) {
+    vision_msgs::msg::Detection2DArray detection2d;
+    detection2d.header = header;
+    for (const auto &obj : objects) {
+        vision_msgs::msg::Detection2D det;
+        det.bbox.center.position.x = obj.rect.x + obj.rect.width / 2;
+        det.bbox.center.position.y = obj.rect.y + obj.rect.height / 2;
+        det.bbox.size_x = obj.rect.width;
+        det.bbox.size_y = obj.rect.height;
+
+        det.results.resize(1);
+        det.results[0].hypothesis.class_id = std::to_string(obj.label);
+        det.results[0].hypothesis.score = obj.prob;
+        detection2d.detections.emplace_back(det);
     }
+    return detection2d;
 }
+} // namespace yolox_ros_cpp
 
 RCLCPP_COMPONENTS_REGISTER_NODE(yolox_ros_cpp::YoloXNode)