Skip to content

Commit a33eb99

Browse files
author
Gin
committed
add YOLO model
1 parent 3d3588c commit a33eb99

File tree

11 files changed

+487
-20
lines changed

11 files changed

+487
-20
lines changed

SerialPrograms/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -903,12 +903,18 @@ file(GLOB MAIN_SOURCES
903903
Source/ML/DataLabeling/ML_SegmentAnythingModelConstants.h
904904
Source/ML/ML_Panels.cpp
905905
Source/ML/ML_Panels.h
906+
Source/ML/Models/ML_ONNXRuntimeHelpers.cpp
907+
Source/ML/Models/ML_ONNXRuntimeHelpers.h
908+
Source/ML/Models/ML_YOLOv5Model.cpp
909+
Source/ML/Models/ML_YOLOv5Model.h
906910
Source/ML/Programs/ML_LabelImages.cpp
907911
Source/ML/Programs/ML_LabelImages.h
908912
Source/ML/Programs/ML_LabelImagesOverlayManager.cpp
909913
Source/ML/Programs/ML_LabelImagesOverlayManager.h
910914
Source/ML/Programs/ML_LabelImagesWidget.cpp
911915
Source/ML/Programs/ML_LabelImagesWidget.h
916+
Source/ML/Programs/ML_RunYOLO.cpp
917+
Source/ML/Programs/ML_RunYOLO.h
912918
Source/ML/UI/ML_ImageAnnotationCommandRow.cpp
913919
Source/ML/UI/ML_ImageAnnotationCommandRow.h
914920
Source/ML/UI/ML_ImageAnnotationDisplayOption.cpp

SerialPrograms/Source/CommonFramework/ImageTypes/ImageViewRGB32.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class ImageViewRGB32 : public ImageViewPlanar32{
5757
QImage to_QImage_owning() const; // Return a copy that owns its own buffer. (slow)
5858
QImage scaled_to_QImage(size_t width, size_t height) const;
5959

60+
// convert to cv::Mat with BGRA color channel order
6061
cv::Mat to_opencv_Mat() const;
6162

6263
private:

SerialPrograms/Source/ML/DataLabeling/ML_SegmentAnythingModel.cpp

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,14 @@
1414
#include <opencv2/imgcodecs.hpp>
1515
#include <opencv2/imgproc.hpp>
1616
#include "3rdParty/ONNX/OnnxToolsPA.h"
17+
#include "ML/Models/ML_ONNXRuntimeHelpers.h"
1718
#include "ML_SegmentAnythingModelConstants.h"
1819
#include "ML_SegmentAnythingModel.h"
1920
#include "ML_AnnotationIO.h"
2021

2122
namespace PokemonAutomation{
2223
namespace ML{
2324

24-
Ort::SessionOptions create_session_option(){
25-
return Ort::SessionOptions{};
26-
27-
// create session using Apple ML
28-
29-
// Ort::SessionOptions so;
30-
// std::unordered_map<std::string, std::string> provider_options;
31-
// provider_options["ModelFormat"] = "NeuralNetwork";
32-
// so.AppendExecutionProvider("CoreML", provider_options);
33-
// return so;
34-
}
35-
36-
37-
template<typename T, class Buffer, class Shape> Ort::Value create_tensor(const OrtMemoryInfo* memory_info, Buffer& buffer, const Shape& shape){
38-
return Ort::Value::CreateTensor<T>(memory_info, buffer.data(), buffer.size(),
39-
shape.data(), shape.size());
40-
}
41-
4225

4326
SAMEmbedderSession::SAMEmbedderSession(const std::string& model_path)
4427
: session_options(create_session_option())

SerialPrograms/Source/ML/ML_Panels.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
#include "CommonFramework/Panels/PanelTools.h"
99
#include "Pokemon/Pokemon_Strings.h"
1010
#include "Programs/ML_LabelImages.h"
11-
11+
#include "Programs/ML_RunYOLO.h"
12+
#include "NintendoSwitch/NintendoSwitch_SingleSwitchProgram.h"
1213
#include "ML_Panels.h"
1314

1415

@@ -26,6 +27,8 @@ std::vector<PanelEntry> PanelListFactory::make_panels() const{
2627
if (PreloadSettings::instance().DEVELOPER_MODE){
2728
ret.emplace_back("---- Developer Tools ----");
2829
ret.emplace_back(make_panel<LabelImages_Descriptor, LabelImages>());
30+
// ret.emplace_back(make_panel<RunYOLO_Descriptor, RunYOLO>());
31+
ret.emplace_back(NintendoSwitch::make_single_switch_program<RunYOLO_Descriptor, RunYOLO>());
2932
// ret.emplace_back(make_single_switch_program<ThreeSegmentDudunsparceFinder_Descriptor, ThreeSegmentDudunsparceFinder>());
3033
}
3134

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/* ML ONNX Runtime Helpers
2+
*
3+
* From: https://github.com/PokemonAutomation/
4+
*
5+
* Helper functions to work with ONNX Runtime library
6+
*/
7+
8+
#include <iostream>
9+
#include <onnxruntime_cxx_api.h>
10+
#include "ML_ONNXRuntimeHelpers.h"
11+
12+
namespace PokemonAutomation{
13+
namespace ML{
14+
15+
// Build the session options used when constructing ONNX Runtime sessions.
// The returned object is default-constructed; no execution provider is appended.
Ort::SessionOptions create_session_option(){
    Ort::SessionOptions default_options;
    return default_options;

    // Sketch: create the session using Apple ML (CoreML) instead.
    //
    // Ort::SessionOptions so;
    // std::unordered_map<std::string, std::string> provider_options;
    // provider_options["ModelFormat"] = "NeuralNetwork";
    // so.AppendExecutionProvider("CoreML", provider_options);
    // return so;
}
26+
27+
28+
// Print the index, name, element type and shape of every model input and output to std::cout.
// One line is written per tensor, e.g. `Input 0: <name> Type <element type enum value> Shape: [d0, d1, ...]`.
// Intended for debugging a freshly loaded model.
void print_model_input_output_info(const Ort::Session& session){
    const auto input_names = session.GetInputNames();
    const auto output_names = session.GetOutputNames();

    // Shared formatter so inputs and outputs cannot drift apart (the output loop used to print the
    // *input* name at the same index, which is wrong and reads out of range when a model has more
    // outputs than inputs).
    const auto print_entry = [](const char* kind, size_t index, const auto& name, Ort::TypeInfo type_info){
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        std::vector<int64_t> dims = tensor_info.GetShape();

        std::cout << kind << " " << index << ": " << name << " Type " << tensor_info.GetElementType() << " Shape: [";
        for (size_t j = 0; j < dims.size(); ++j){
            if (j > 0){
                std::cout << ", ";
            }
            std::cout << dims[j];
        }
        std::cout << "]" << std::endl;
    };

    for (size_t i = 0; i < input_names.size(); ++i){
        print_entry("Input", i, input_names[i], session.GetInputTypeInfo(i));
    }

    for (size_t i = 0; i < output_names.size(); ++i){
        print_entry("Output", i, output_names[i], session.GetOutputTypeInfo(i));
    }
}
62+
63+
}
64+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/* ML ONNX Runtime Helpers
2+
*
3+
* From: https://github.com/PokemonAutomation/
4+
*
5+
* Helper functions to work with ONNX Runtime library
6+
*/
7+
8+
#ifndef PokemonAutomation_ML_ONNXRuntimeHelpers_H
9+
#define PokemonAutomation_ML_ONNXRuntimeHelpers_H
10+
11+
12+
#include <vector>
13+
#include <sstream>
14+
#include <onnxruntime_cxx_api.h>
15+
16+
namespace PokemonAutomation{
17+
namespace ML{
18+
19+
20+
// Create an ONNX Runtime session options object.
21+
// For now it only sets the session to be CPU. In future we can create options for GPU or macOS MPS
22+
Ort::SessionOptions create_session_option();
23+
24+
// Handy function to create an ONNX Runtime tensor view class from a vector-like `buffer` object holding
25+
// the tensor data and an array-like `shape` object that represents the dimension of the tensor.
26+
template<typename T, class Buffer, class Shape>
27+
Ort::Value create_tensor(const OrtMemoryInfo* memory_info, Buffer& buffer, const Shape& shape){
28+
return Ort::Value::CreateTensor<T>(memory_info, buffer.data(), buffer.size(),
29+
shape.data(), shape.size());
30+
}
31+
32+
33+
// Format a vector as a std::string of the form "[a, b, c]" ("[]" when empty).
// Useful for printing debugging info on tensor shapes.
// Takes the vector by const reference so const vectors and temporaries are accepted, and writes
// the separator only between elements so there is no trailing ", " before the closing bracket.
template<typename T>
std::string to_string(const std::vector<T>& vec){
    std::ostringstream os;
    os << "[";
    const char* separator = "";
    for (const T& element : vec){
        os << separator << element;
        separator = ", ";
    }
    os << "]";
    return os.str();
}
42+
43+
// Print model input and output types and shapes to cout. Useful for debugging.
44+
void print_model_input_output_info(const Ort::Session& session);
45+
46+
47+
}
48+
}
49+
#endif
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/* ML YOLOv5 Model
2+
*
3+
* From: https://github.com/PokemonAutomation/
4+
*
5+
* Run YOLOv5 model for object detection
6+
*/
7+
8+
9+
#include <string>
10+
#include <iostream>
11+
#include <opencv2/imgproc.hpp>
12+
#include <opencv2/dnn.hpp>
13+
#include "ML/Models/ML_ONNXRuntimeHelpers.h"
14+
#include "ML_YOLOv5Model.h"
15+
16+
namespace PokemonAutomation{
17+
namespace ML{
18+
19+
std::tuple<int, int, double, double> resize_image_with_border(
20+
const cv::Mat& input_image,
21+
cv::Mat& output_image,
22+
int target_width, int target_height,
23+
cv::Scalar border_color = cv::Scalar(0,0,0)
24+
){
25+
int original_width = input_image.cols;
26+
int original_height = input_image.rows;
27+
28+
double scale_x = static_cast<double>(target_width) / original_width;
29+
double scale_y = static_cast<double>(target_height) / original_height;
30+
double scale = std::min(scale_x, scale_y);
31+
32+
int new_width = static_cast<int>(original_width * scale);
33+
int new_height = static_cast<int>(original_height * scale);
34+
new_width = std::min(new_width, target_width);
35+
new_height = std::min(new_height, target_height);
36+
37+
if (new_width == 0 || new_height == 0){
38+
throw std::runtime_error("Input Image too small: " + std::to_string(original_width) + " x " + std::to_string(original_height));
39+
}
40+
41+
cv::Mat resized_image;
42+
cv::resize(input_image, resized_image, cv::Size(new_width, new_height), 0, 0, cv::INTER_LINEAR); // INTER_AREA for shrinking
43+
44+
int border_top = (target_height - new_height) / 2;
45+
int border_bottom = target_height - new_height - border_top;
46+
int border_left = (target_width - new_width) / 2;
47+
int border_right = target_width - new_width - border_left;
48+
49+
cv::copyMakeBorder(resized_image, output_image, border_top, border_bottom, border_left, border_right, cv::BORDER_CONSTANT, border_color);
50+
51+
return std::make_tuple(
52+
border_left, border_top,
53+
1.0 / new_width, 1.0 / new_height
54+
);
55+
}
56+
57+
58+
YOLOv5Session::YOLOv5Session(const std::string& model_path, std::vector<std::string> label_names)
59+
: m_label_names(std::move(label_names))
60+
, m_session_options(create_session_option())
61+
, m_session{m_env, model_path.c_str(), m_session_options}
62+
, m_memory_info{Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU)}
63+
, m_input_names{m_session.GetInputNames()}
64+
, m_output_names{m_session.GetOutputNames()}
65+
, m_model_input(3*YOLO5_INPUT_IMAGE_SIZE*YOLO5_INPUT_IMAGE_SIZE)
66+
{
67+
if (m_session.GetOutputCount() != 1){
68+
throw std::runtime_error("YOLOv5 model does not have the correct output count, found count " + std::to_string(m_session.GetOutputCount()));
69+
}
70+
71+
std::vector<int64_t> output_dims = m_session.GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
72+
if (output_dims.size() != 3 || output_dims[2] <= 5){
73+
throw std::runtime_error("YOLOv5 model does not have the correct output dimension, found shape " + to_string(output_dims));
74+
}
75+
m_output_shape[2] = output_dims[2];
76+
if (output_dims[2] - 5 != static_cast<int>(m_label_names.size())){
77+
throw std::runtime_error(
78+
"YOLOv5 model has " + std::to_string(output_dims[2]-5) +
79+
" output labels but YOLOv5Session was initialized with " + std::to_string(label_names.size()) + " labels"
80+
);
81+
}
82+
m_model_output.resize(YOLO5_NUM_CANDIDATES * m_output_shape[2]);
83+
}
84+
85+
// input: rgb color order
86+
void YOLOv5Session::run(const cv::Mat& input_image, std::vector<YOLOv5Session::DetectionBox>& output_boxes){
87+
CV_Assert(input_image.depth() == CV_8U);
88+
CV_Assert(input_image.channels() == 3);
89+
90+
cv::Mat image_resized;
91+
92+
int x_shift = 0, y_shift = 0;
93+
double x_scale = 1.0, y_scale = 1.0;
94+
std::tie(x_shift, y_shift, x_scale, y_scale) = resize_image_with_border(input_image, image_resized,
95+
YOLO5_INPUT_IMAGE_SIZE, YOLO5_INPUT_IMAGE_SIZE, cv::Scalar(114, 114, 114));
96+
97+
// Declare a destination Mat for float32
98+
cv::Mat image_float;
99+
100+
// Convert the uint8_image to image_float
101+
// The third argument (alpha) is a scaling factor.
102+
// For normalization to [0.0, 1.0], use 1.0 / 255.0.
103+
// For retaining original values (0-255), use 1.0.
104+
image_resized.convertTo(image_float, CV_32F, 1.0 / 255.0);
105+
106+
for (int c = 0, i = 0; c < 3; c++) {
107+
for (int row = 0; row < image_float.rows; row++) {
108+
for (int col = 0; col < image_float.cols; col++) {
109+
float pixel_value = image_float.at<cv::Vec3f>(row, col)[c];
110+
m_model_input[i++] = pixel_value;
111+
}
112+
}
113+
}
114+
115+
auto input_tensor = create_tensor<float>(m_memory_info, m_model_input, m_input_shape);
116+
auto output_tensor = create_tensor<float>(m_memory_info, m_model_output, m_output_shape);
117+
118+
const char* input_name_c = m_input_names[0].data();
119+
const char* output_name_c = m_output_names[0].data();
120+
// auto start = std::chrono::steady_clock::now();
121+
m_session.Run(m_run_options, &input_name_c, &input_tensor, 1, &output_name_c, &output_tensor, 1);
122+
// auto end = std::chrono::steady_clock::now();
123+
// auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
124+
// std::cout << "Yolov5 inference time: " << milliseconds << " ms" << std::endl;
125+
126+
const size_t cand_size = m_label_names.size() + 5;
127+
128+
std::vector<cv::Rect> pixel_boxes;
129+
std::vector<int> indices;
130+
std::vector<float> scores;
131+
std::vector<size_t> labels;
132+
133+
for(int i = 0; i < YOLO5_NUM_CANDIDATES; i++){
134+
float cx = m_model_output[cand_size*i];
135+
float cy = m_model_output[cand_size*i+1];
136+
float w = m_model_output[cand_size*i+2];
137+
float h = m_model_output[cand_size*i+3];
138+
float sc = m_model_output[cand_size*i+4];
139+
140+
float max_score = 0.0;
141+
size_t pred_label = 0; // predicted label
142+
for(size_t j_label = 0; j_label < m_label_names.size(); j_label++){
143+
float score = m_model_output[cand_size*i+5+j_label];
144+
if (score > max_score){
145+
max_score = score;
146+
pred_label = j_label;
147+
}
148+
}
149+
scores.push_back(max_score * sc); // sc is like a global confidence scale?
150+
pixel_boxes.emplace_back((int)(cx - w / 2 + 0.5), (int)(cy - h / 2 + 0.5), int(w + 0.5), int(h + 0.5));
151+
indices.push_back(i);
152+
labels.push_back(pred_label);
153+
}
154+
155+
cv::dnn::NMSBoxes(pixel_boxes, scores, 0.2f, 0.45f, indices);
156+
157+
// std::cout << "num found pixel_boxes " << indices.size() << std::endl;
158+
// return;
159+
160+
for (int index : indices)
161+
{
162+
// Note the model predicts on (640x640) images, we need to convert the detected pixel_boxes back to
163+
// the full frame dimension.
164+
double x = (pixel_boxes[index].x - x_shift) * x_scale;
165+
double y = (pixel_boxes[index].y - y_shift) * y_scale;
166+
double w = pixel_boxes[index].width * x_scale;
167+
double h = pixel_boxes[index].height * y_scale;
168+
// std::cout << scores[index] << " " << x << " " << y << " " << w << " " << h << std::endl;
169+
170+
YOLOv5Session::DetectionBox b;
171+
b.box = ImageFloatBox(x, y, w, h);
172+
b.score = scores[index];
173+
b.label_idx = labels[index];
174+
output_boxes.push_back(b);
175+
}
176+
}
177+
178+
179+
180+
}
181+
}

0 commit comments

Comments
 (0)