Skip to content

Commit cfded68

Browse files
author
Gin
committed
Fix partial pixel counts in kernel for NEON
1 parent bd8de5c commit cfded68

File tree

5 files changed

+59
-8
lines changed

5 files changed

+59
-8
lines changed

SerialPrograms/Source/CommonFramework/Logging/FileWindowLogger.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,8 @@ FileWindowLoggerWindow::FileWindowLoggerWindow(FileWindowLogger& logger, QWidget
241241
log("<b>Window Startup...</b>");
242242
log("Current path: " + QDir::currentPath());
243243
log("Executable path: " + qApp->applicationDirPath());
244+
log(QString::fromStdString("Program setting folder: " + SETTINGS_PATH()));
245+
log(QString::fromStdString("Program resources folder: " + RESOURCE_PATH()));
244246
add_window(*this);
245247
}
246248
FileWindowLoggerWindow::~FileWindowLoggerWindow(){

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_ARM64_NEON.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "Kernels/Kernels_arm64_NEON.h"
1010
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1111
#include "Kernels_ImageFilter_RGB32_Euclidean.h"
12+
#include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
1213

1314
namespace PokemonAutomation{
1415
namespace Kernels{
@@ -36,7 +37,12 @@ class ImageFilter_RgbEuclidean_arm64_NEON{
3637
return sum_u64[0] + sum_u64[1];
3738
}
3839

39-
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
40+
// Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
41+
// The counts are stored in m_count_u32.
42+
// If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
43+
// It also changes pixels in or out of the range to have the new color m_replacement_color_u32.
44+
// The resulting pixels are saved in out[4]
45+
PA_FORCE_INLINE void process_full(uint32_t out[4], const uint32_t in[4], const uint32x4_t* cmp_mask_u32 = nullptr){
4046
uint32x4_t in_u32 = vld1q_u32(in);
4147
// subtract the expected values
4248
uint32x4_t in_dif_u32 = vreinterpretq_u32_u8(vabdq_u8(vreinterpretq_u8_u32(in_u32), m_expected_color_rgb_u8));
@@ -58,6 +64,9 @@ class ImageFilter_RgbEuclidean_arm64_NEON{
5864

5965
// cmp_u32: if each pixel is within the range, its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
6066
uint32x4_t cmp_u32 = vcleq_u32(sum_sqr_u32, m_distance_squared_u32);
67+
if (cmp_mask_u32) {
68+
cmp_u32 = vandq_u32(cmp_u32, *cmp_mask_u32);
69+
}
6170
// Increase count for each pixel in range. Each uint32 lane is counted separately.
6271
// We achieve +=1 by substracting 0xFFFFFFFF
6372
m_count_u32 = vsubq_u32(m_count_u32, cmp_u32);
@@ -71,10 +80,12 @@ class ImageFilter_RgbEuclidean_arm64_NEON{
7180
}
7281
vst1q_u32(out, out_u32);
7382
}
83+
// Same as `process_full()` but only process `left` (< 4) pixels
7484
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
85+
uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8(PartialWordAccess_arm64_NEON::create_front_mask(left * 4));
7586
uint32_t buffer_in[4], buffer_out[4];
7687
memcpy(buffer_in, in, sizeof(uint32_t) * left);
77-
process_full(buffer_out, buffer_in);
88+
process_full(buffer_out, buffer_in, &cmp_mask_u32);
7889
memcpy(out, buffer_out, sizeof(uint32_t) * left);
7990
}
8091

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_ARM64_NEON.cpp

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1111
#include "Kernels_ImageFilter_RGB32_Range_Routines.h"
1212
#include "Kernels_ImageFilter_RGB32_Range.h"
13+
#include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
14+
15+
1316

1417
namespace PokemonAutomation{
1518
namespace Kernels{
@@ -37,11 +40,17 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
3740
{}
3841

3942
PA_FORCE_INLINE size_t count() const{
43+
// long pairwise add
4044
uint64x2_t sum_u64 = vpaddlq_u32(m_count_u32);
4145
return sum_u64[0] + sum_u64[1];
4246
}
4347

44-
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
48+
// Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
49+
// The counts are stored in m_count_u32.
50+
// If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
51+
// It also changes pixels in or out of the range to have the new color m_replacement_color_u32.
52+
// The resulting pixels are saved in out[4]
53+
PA_FORCE_INLINE void process_full(uint32_t out[4], const uint32_t in[4], const uint32x4_t* cmp_mask_u32 = nullptr){
4554
uint8x16_t in_u8 = vreinterpretq_u8_u32(vld1q_u32(in));
4655

4756
// Check if mins > pixel per color channel
@@ -50,9 +59,13 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
5059
uint8x16_t cmp1 = vcgtq_u8(in_u8, m_maxs_u8);
5160
// cmp: if mins > pixel or pixel > maxs per color channel
5261
uint8x16_t cmp_u8 = vorrq_u8(cmp0, cmp1);
62+
// vceqq_u32: compare bitwise equal
5363
// cmp_u32: if each pixel is within the range
5464
// If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
55-
uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), m_zeros_u8);
65+
uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), vreinterpretq_u32_u8(m_zeros_u8));
66+
if (cmp_mask_u32) {
67+
cmp_u32 = vandq_u32(cmp_u32, *cmp_mask_u32);
68+
}
5669
// Increase count for each pixel in range. Each uint32 lane is counted separately.
5770
// We achieve +=1 by substracting 0xFFFFFFFF
5871
m_count_u32 = vsubq_u32(m_count_u32, cmp_u32);
@@ -66,10 +79,12 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
6679
}
6780
vst1q_u32(out, out_u32);
6881
}
82+
// Same as `process_full()` but only process `left` (< 4) pixels
6983
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
84+
uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8(PartialWordAccess_arm64_NEON::create_front_mask(left * 4));
7085
uint32_t buffer_in[4], buffer_out[4];
7186
memcpy(buffer_in, in, sizeof(uint32_t) * left);
72-
process_full(buffer_out, buffer_in);
87+
process_full(buffer_out, buffer_in, &cmp_mask_u32);
7388
memcpy(out, buffer_out, sizeof(uint32_t) * left);
7489
}
7590

@@ -135,7 +150,12 @@ class ToBlackWhite_RgbRange_arm64_NEON{
135150
return sum_u64[0] + sum_u64[1];
136151
}
137152

138-
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
153+
// Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
154+
// The counts are stored in m_count_u32.
155+
// If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
156+
// It also changes pixels into black or white depending on whether they are in range.
157+
// The resulting pixels are saved in out[4]
158+
PA_FORCE_INLINE void process_full(uint32_t out[4], const uint32_t in[4], const uint32x4_t* cmp_mask_u32 = nullptr){
139159
uint8x16_t in_u8 = vreinterpretq_u8_u32(vld1q_u32(in));
140160

141161
// Check if mins > pixel per color channel
@@ -144,9 +164,13 @@ class ToBlackWhite_RgbRange_arm64_NEON{
144164
uint8x16_t cmp1 = vcgtq_u8(in_u8, m_maxs_u8);
145165
// cmp: if mins > pixel or pixel > maxs per color channel
146166
uint8x16_t cmp_u8 = vorrq_u8(cmp0, cmp1);
167+
// vceqq_u32: compare bitwise equal
147168
// cmp_u32: if each pixel is within the range
148169
// If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
149-
uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), m_zeros_u8);
170+
uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), vreinterpretq_u32_u8(m_zeros_u8));
171+
if (cmp_mask_u32) {
172+
cmp_u32 = vandq_u32(cmp_u32, *cmp_mask_u32);
173+
}
150174
// Increase count for each pixel in range. Each uint32 lane is counted separately.
151175
// We achieve +=1 by substracting 0xFFFFFFFF
152176
m_count_u32 = vsubq_u32(m_count_u32, cmp_u32);
@@ -157,10 +181,12 @@ class ToBlackWhite_RgbRange_arm64_NEON{
157181

158182
vst1q_u32(out, out_u32);
159183
}
184+
// Same as `process_full()` but only process `left` (< 4) pixels
160185
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
186+
uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8(PartialWordAccess_arm64_NEON::create_front_mask(left * 4));
161187
uint32_t buffer_in[4], buffer_out[4];
162188
memcpy(buffer_in, in, sizeof(uint32_t) * left);
163-
process_full(buffer_out, buffer_in);
189+
process_full(buffer_out, buffer_in, &cmp_mask_u32);
164190
memcpy(out, buffer_out, sizeof(uint32_t) * left);
165191
}
166192

SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ namespace Kernels{
2222
//
2323
class PartialWordAccess_arm64_NEON{
2424
public:
25+
// create a mask with first `bytes` low bytes are all 1s
26+
// If `bytes` is 3, then the returned vector is from low bytes to high bytes: [0xFF, 0xFF, 0xFF, 0, 0, 0, ..., 0]
27+
PA_FORCE_INLINE static uint8x16_t create_front_mask(size_t bytes) {
28+
PA_ALIGN_STRUCT(16) uint8_t bytes_values[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
29+
const uint8x16_t seq_u8x16 = vld1q_u8(bytes_values);
30+
return vcgtq_u8(vdupq_n_u8((uint8_t)bytes), seq_u8x16);
31+
}
32+
2533
// How many bytes of data to load. Allow at most 16 bytes.
2634
PA_FORCE_INLINE PartialWordAccess_arm64_NEON(size_t bytes)
2735
: m_shift(16 - bytes)

SerialPrograms/Source/Tests/Kernels_Tests.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,12 @@ int test_kernels_ToBlackWhiteRGB32Range(const ImageViewRGB32& image){
358358
cout << "Testing to_black_white_rgb32_range(), image size " << width << " x " << height << endl;
359359

360360
Color min_color(0, 0, 0);
361+
// Color min_color(0);
362+
361363
Color max_color(63, 63, 63);
364+
// Color max_color(255, 255, 255);
362365
// Color max_color(238, 24, 42);
366+
cout << "min color: " << min_color.to_string() << " max color: " << max_color.to_string() << endl;
363367

364368
const uint32_t mins = uint32_t(min_color);
365369
const uint32_t maxs = uint32_t(max_color);

0 commit comments

Comments
 (0)