1010#include " Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1111#include " Kernels_ImageFilter_RGB32_Range_Routines.h"
1212#include " Kernels_ImageFilter_RGB32_Range.h"
13+ #include " Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
14+
15+
1316
1417namespace PokemonAutomation {
1518namespace Kernels {
@@ -37,11 +40,17 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
3740 {}
3841
3942 PA_FORCE_INLINE size_t count () const {
43+ // long pairwise add
4044 uint64x2_t sum_u64 = vpaddlq_u32 (m_count_u32);
4145 return sum_u64[0 ] + sum_u64[1 ];
4246 }
4347
44- PA_FORCE_INLINE void process_full (uint32_t * out, const uint32_t * in){
48+ // Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
49+ // The counts are stored in m_count_u32.
50+ // If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
51+ // It also changes pixels in or out of the range to have the new color m_replacement_color_u32.
52+ // The resulting pixels are saved in out[4]
53+ PA_FORCE_INLINE void process_full (uint32_t out[4 ], const uint32_t in[4 ], const uint32x4_t * cmp_mask_u32 = nullptr ){
4554 uint8x16_t in_u8 = vreinterpretq_u8_u32 (vld1q_u32 (in));
4655
4756 // Check if mins > pixel per color channel
@@ -50,9 +59,13 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
5059 uint8x16_t cmp1 = vcgtq_u8 (in_u8, m_maxs_u8);
5160 // cmp: if mins > pixel or pixel > maxs per color channel
5261 uint8x16_t cmp_u8 = vorrq_u8 (cmp0, cmp1);
62+ // vceqq_u32: compare bitwise equal
5363 // cmp_u32: if each pixel is within the range
5464 // If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
55- uint32x4_t cmp_u32 = vceqq_u32 (vreinterpretq_u32_u8 (cmp_u8), m_zeros_u8);
65+ uint32x4_t cmp_u32 = vceqq_u32 (vreinterpretq_u32_u8 (cmp_u8), vreinterpretq_u32_u8 (m_zeros_u8));
66+ if (cmp_mask_u32) {
67+ cmp_u32 = vandq_u32 (cmp_u32, *cmp_mask_u32);
68+ }
5669 // Increase count for each pixel in range. Each uint32 lane is counted separately.
5770 // We achieve +=1 by substracting 0xFFFFFFFF
5871 m_count_u32 = vsubq_u32 (m_count_u32, cmp_u32);
@@ -66,10 +79,12 @@ class ImageFilterRunner_Rgb32Range_ARM64_NEON{
6679 }
6780 vst1q_u32 (out, out_u32);
6881 }
82+ // Same as `process_full()` but only process `left` (< 4) pixels
6983 PA_FORCE_INLINE void process_partial (uint32_t * out, const uint32_t * in, size_t left){
84+ uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8 (PartialWordAccess_arm64_NEON::create_front_mask (left * 4 ));
7085 uint32_t buffer_in[4 ], buffer_out[4 ];
7186 memcpy (buffer_in, in, sizeof (uint32_t ) * left);
72- process_full (buffer_out, buffer_in);
87+ process_full (buffer_out, buffer_in, &cmp_mask_u32 );
7388 memcpy (out, buffer_out, sizeof (uint32_t ) * left);
7489 }
7590
@@ -135,7 +150,12 @@ class ToBlackWhite_RgbRange_arm64_NEON{
135150 return sum_u64[0 ] + sum_u64[1 ];
136151 }
137152
138- PA_FORCE_INLINE void process_full (uint32_t * out, const uint32_t * in){
153+ // Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
154+ // The counts are stored in m_count_u32.
155+ // If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
156+ // It also changes pixels into black or white depending on whether they are in range.
157+ // The resulting pixels are saved in out[4]
158+ PA_FORCE_INLINE void process_full (uint32_t out[4 ], const uint32_t in[4 ], const uint32x4_t * cmp_mask_u32 = nullptr ){
139159 uint8x16_t in_u8 = vreinterpretq_u8_u32 (vld1q_u32 (in));
140160
141161 // Check if mins > pixel per color channel
@@ -144,9 +164,13 @@ class ToBlackWhite_RgbRange_arm64_NEON{
144164 uint8x16_t cmp1 = vcgtq_u8 (in_u8, m_maxs_u8);
145165 // cmp: if mins > pixel or pixel > maxs per color channel
146166 uint8x16_t cmp_u8 = vorrq_u8 (cmp0, cmp1);
167+ // vceqq_u32: compare bitwise equal
147168 // cmp_u32: if each pixel is within the range
148169 // If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
149- uint32x4_t cmp_u32 = vceqq_u32 (vreinterpretq_u32_u8 (cmp_u8), m_zeros_u8);
170+ uint32x4_t cmp_u32 = vceqq_u32 (vreinterpretq_u32_u8 (cmp_u8), vreinterpretq_u32_u8 (m_zeros_u8));
171+ if (cmp_mask_u32) {
172+ cmp_u32 = vandq_u32 (cmp_u32, *cmp_mask_u32);
173+ }
150174 // Increase count for each pixel in range. Each uint32 lane is counted separately.
151175 // We achieve +=1 by substracting 0xFFFFFFFF
152176 m_count_u32 = vsubq_u32 (m_count_u32, cmp_u32);
@@ -157,10 +181,12 @@ class ToBlackWhite_RgbRange_arm64_NEON{
157181
158182 vst1q_u32 (out, out_u32);
159183 }
184+ // Same as `process_full()` but only process `left` (< 4) pixels
160185 PA_FORCE_INLINE void process_partial (uint32_t * out, const uint32_t * in, size_t left){
186+ uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8 (PartialWordAccess_arm64_NEON::create_front_mask (left * 4 ));
161187 uint32_t buffer_in[4 ], buffer_out[4 ];
162188 memcpy (buffer_in, in, sizeof (uint32_t ) * left);
163- process_full (buffer_out, buffer_in);
189+ process_full (buffer_out, buffer_in, &cmp_mask_u32 );
164190 memcpy (out, buffer_out, sizeof (uint32_t ) * left);
165191 }
166192
0 commit comments