1+ /* ML YOLOv5 Model
2+ *
3+ * From: https://github.com/PokemonAutomation/
4+ *
5+ * Run YOLOv5 model for object detection
6+ */
7+
8+
9+ #include < string>
10+ #include < iostream>
11+ #include < opencv2/imgproc.hpp>
12+ #include < opencv2/dnn.hpp>
13+ #include " ML/Models/ML_ONNXRuntimeHelpers.h"
14+ #include " ML_YOLOv5Model.h"
15+
16+ namespace PokemonAutomation {
17+ namespace ML {
18+
19+ std::tuple<int , int , double , double > resize_image_with_border (
20+ const cv::Mat& input_image,
21+ cv::Mat& output_image,
22+ int target_width, int target_height,
23+ cv::Scalar border_color = cv::Scalar(0 ,0 ,0 )
24+ ){
25+ int original_width = input_image.cols ;
26+ int original_height = input_image.rows ;
27+
28+ double scale_x = static_cast <double >(target_width) / original_width;
29+ double scale_y = static_cast <double >(target_height) / original_height;
30+ double scale = std::min (scale_x, scale_y);
31+
32+ int new_width = static_cast <int >(original_width * scale);
33+ int new_height = static_cast <int >(original_height * scale);
34+ new_width = std::min (new_width, target_width);
35+ new_height = std::min (new_height, target_height);
36+
37+ if (new_width == 0 || new_height == 0 ){
38+ throw std::runtime_error (" Input Image too small: " + std::to_string (original_width) + " x " + std::to_string (original_height));
39+ }
40+
41+ cv::Mat resized_image;
42+ cv::resize (input_image, resized_image, cv::Size (new_width, new_height), 0 , 0 , cv::INTER_LINEAR); // INTER_AREA for shrinking
43+
44+ int border_top = (target_height - new_height) / 2 ;
45+ int border_bottom = target_height - new_height - border_top;
46+ int border_left = (target_width - new_width) / 2 ;
47+ int border_right = target_width - new_width - border_left;
48+
49+ cv::copyMakeBorder (resized_image, output_image, border_top, border_bottom, border_left, border_right, cv::BORDER_CONSTANT, border_color);
50+
51+ return std::make_tuple (
52+ border_left, border_top,
53+ 1.0 / new_width, 1.0 / new_height
54+ );
55+ }
56+
57+
58+ YOLOv5Session::YOLOv5Session (const std::string& model_path, std::vector<std::string> label_names)
59+ : m_label_names(std::move(label_names))
60+ , m_session_options(create_session_option())
61+ , m_session{m_env, model_path.c_str (), m_session_options}
62+ , m_memory_info{Ort::MemoryInfo::CreateCpu (OrtDeviceAllocator, OrtMemTypeCPU)}
63+ , m_input_names{m_session.GetInputNames ()}
64+ , m_output_names{m_session.GetOutputNames ()}
65+ , m_model_input(3 *YOLO5_INPUT_IMAGE_SIZE*YOLO5_INPUT_IMAGE_SIZE)
66+ {
67+ if (m_session.GetOutputCount () != 1 ){
68+ throw std::runtime_error (" YOLOv5 model does not have the correct output count, found count " + std::to_string (m_session.GetOutputCount ()));
69+ }
70+
71+ std::vector<int64_t > output_dims = m_session.GetOutputTypeInfo (0 ).GetTensorTypeAndShapeInfo ().GetShape ();
72+ if (output_dims.size () != 3 || output_dims[2 ] <= 5 ){
73+ throw std::runtime_error (" YOLOv5 model does not have the correct output dimension, found shape " + to_string (output_dims));
74+ }
75+ m_output_shape[2 ] = output_dims[2 ];
76+ if (output_dims[2 ] - 5 != static_cast <int >(m_label_names.size ())){
77+ throw std::runtime_error (
78+ " YOLOv5 model has " + std::to_string (output_dims[2 ]-5 ) +
79+ " output labels but YOLOv5Session was initialized with " + std::to_string (label_names.size ()) + " labels"
80+ );
81+ }
82+ m_model_output.resize (YOLO5_NUM_CANDIDATES * m_output_shape[2 ]);
83+ }
84+
85+ // input: rgb color order
86+ void YOLOv5Session::run (const cv::Mat& input_image, std::vector<YOLOv5Session::DetectionBox>& output_boxes){
87+ CV_Assert (input_image.depth () == CV_8U);
88+ CV_Assert (input_image.channels () == 3 );
89+
90+ cv::Mat image_resized;
91+
92+ int x_shift = 0 , y_shift = 0 ;
93+ double x_scale = 1.0 , y_scale = 1.0 ;
94+ std::tie (x_shift, y_shift, x_scale, y_scale) = resize_image_with_border (input_image, image_resized,
95+ YOLO5_INPUT_IMAGE_SIZE, YOLO5_INPUT_IMAGE_SIZE, cv::Scalar (114 , 114 , 114 ));
96+
97+ // Declare a destination Mat for float32
98+ cv::Mat image_float;
99+
100+ // Convert the uint8_image to image_float
101+ // The third argument (alpha) is a scaling factor.
102+ // For normalization to [0.0, 1.0], use 1.0 / 255.0.
103+ // For retaining original values (0-255), use 1.0.
104+ image_resized.convertTo (image_float, CV_32F, 1.0 / 255.0 );
105+
106+ for (int c = 0 , i = 0 ; c < 3 ; c++) {
107+ for (int row = 0 ; row < image_float.rows ; row++) {
108+ for (int col = 0 ; col < image_float.cols ; col++) {
109+ float pixel_value = image_float.at <cv::Vec3f>(row, col)[c];
110+ m_model_input[i++] = pixel_value;
111+ }
112+ }
113+ }
114+
115+ auto input_tensor = create_tensor<float >(m_memory_info, m_model_input, m_input_shape);
116+ auto output_tensor = create_tensor<float >(m_memory_info, m_model_output, m_output_shape);
117+
118+ const char * input_name_c = m_input_names[0 ].data ();
119+ const char * output_name_c = m_output_names[0 ].data ();
120+ // auto start = std::chrono::steady_clock::now();
121+ m_session.Run (m_run_options, &input_name_c, &input_tensor, 1 , &output_name_c, &output_tensor, 1 );
122+ // auto end = std::chrono::steady_clock::now();
123+ // auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
124+ // std::cout << "Yolov5 inference time: " << milliseconds << " ms" << std::endl;
125+
126+ const size_t cand_size = m_label_names.size () + 5 ;
127+
128+ std::vector<cv::Rect> pixel_boxes;
129+ std::vector<int > indices;
130+ std::vector<float > scores;
131+ std::vector<size_t > labels;
132+
133+ for (int i = 0 ; i < YOLO5_NUM_CANDIDATES; i++){
134+ float cx = m_model_output[cand_size*i];
135+ float cy = m_model_output[cand_size*i+1 ];
136+ float w = m_model_output[cand_size*i+2 ];
137+ float h = m_model_output[cand_size*i+3 ];
138+ float sc = m_model_output[cand_size*i+4 ];
139+
140+ float max_score = 0.0 ;
141+ size_t pred_label = 0 ; // predicted label
142+ for (size_t j_label = 0 ; j_label < m_label_names.size (); j_label++){
143+ float score = m_model_output[cand_size*i+5 +j_label];
144+ if (score > max_score){
145+ max_score = score;
146+ pred_label = j_label;
147+ }
148+ }
149+ scores.push_back (max_score * sc); // sc is like a global confidence scale?
150+ pixel_boxes.emplace_back ((int )(cx - w / 2 + 0.5 ), (int )(cy - h / 2 + 0.5 ), int (w + 0.5 ), int (h + 0.5 ));
151+ indices.push_back (i);
152+ labels.push_back (pred_label);
153+ }
154+
155+ cv::dnn::NMSBoxes (pixel_boxes, scores, 0 .2f , 0 .45f , indices);
156+
157+ // std::cout << "num found pixel_boxes " << indices.size() << std::endl;
158+ // return;
159+
160+ for (int index : indices)
161+ {
162+ // Note the model predicts on (640x640) images, we need to convert the detected pixel_boxes back to
163+ // the full frame dimension.
164+ double x = (pixel_boxes[index].x - x_shift) * x_scale;
165+ double y = (pixel_boxes[index].y - y_shift) * y_scale;
166+ double w = pixel_boxes[index].width * x_scale;
167+ double h = pixel_boxes[index].height * y_scale;
168+ // std::cout << scores[index] << " " << x << " " << y << " " << w << " " << h << std::endl;
169+
170+ YOLOv5Session::DetectionBox b;
171+ b.box = ImageFloatBox (x, y, w, h);
172+ b.score = scores[index];
173+ b.label_idx = labels[index];
174+ output_boxes.push_back (b);
175+ }
176+ }
177+
178+
179+
180+ }
181+ }
0 commit comments