Skip to content

Commit a559722

Browse files
committed
Refactor ImageFilter Euclidean to use template routines.
1 parent 6c45bfd commit a559722

8 files changed

+225
-289
lines changed

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ size_t filter_rgb32_euclidean_x64_AVX512(
3636
uint32_t replacement, bool replace_color_within_range,
3737
uint32_t expected, double max_euclidean_distance
3838
);
39-
size_t filter_rgb32_euclidean_arm64_NEON(
39+
size_t filter_rgb32_euclidean_ARM64_NEON(
4040
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
4141
uint32_t* out, size_t out_bytes_per_row,
4242
uint32_t replacement, bool replace_color_within_range,
@@ -83,7 +83,7 @@ size_t filter_rgb32_euclidean(
8383
#endif
8484
#ifdef PA_AutoDispatch_arm64_20_M1
8585
if (CPU_CAPABILITY_CURRENT.OK_M1){
86-
return filter_rgb32_euclidean_arm64_NEON(
86+
return filter_rgb32_euclidean_ARM64_NEON(
8787
in, in_bytes_per_row, width, height,
8888
out, out_bytes_per_row,
8989
replacement, replace_color_within_range,

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_ARM64_NEON.cpp

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,87 @@ namespace PokemonAutomation{
1515
namespace Kernels{
1616

1717

18+
19+
class PixelTest_Rgb32Euclidean_ARM64_NEON{
20+
public:
21+
static const size_t VECTOR_SIZE = 4;
22+
using Mask = size_t;
23+
24+
public:
25+
PA_FORCE_INLINE PixelTest_Rgb32Euclidean_ARM64_NEON(
26+
uint32_t expected, double max_euclidean_distance
27+
)
28+
: m_expected_color_rgb_u8(vreinterpretq_u8_u32(vdupq_n_u32(expected_color & 0x00ffffff)))
29+
, m_distance_squared_u32(vdupq_n_u32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
30+
{}
31+
32+
// Return a mask indicating which lanes are in range.
33+
PA_FORCE_INLINE uint32x4_t test_word(uint32x4_t& pixel) const{
34+
uint32x4_t in_u32 = vld1q_u32(in);
35+
// subtract the expected values
36+
uint32x4_t in_dif_u32 = vreinterpretq_u32_u8(vabdq_u8(vreinterpretq_u8_u32(in_u32), m_expected_color_rgb_u8));
37+
38+
// Get green channel
39+
uint32x4_t in_g_u32 = vandq_u32(in_dif_u32, vdupq_n_u32(0x0000ff00));
40+
// Move green channel to the lower end of the 16-bit regions
41+
uint16x8_t in_g_u16 = vshrq_n_u16(vreinterpretq_u16_u32(in_g_u32), 8);
42+
// in_rb_u16 contains the red and blue channels. Each channel occupies a 16-bit region
43+
uint16x8_t in_rb_u16 = vandq_u16(vreinterpretq_u16_u32(in_dif_u32), vdupq_n_u16(0x00ff));
44+
45+
// Square operation
46+
uint16x8_t in_g2_u16 = vmulq_u16(in_g_u16, in_g_u16);
47+
uint16x8_t in_r2b2_u16 = vmulq_u16(in_rb_u16, in_rb_u16);
48+
49+
uint32x4_t in_g2_u32 = vreinterpretq_u32_u16(in_g2_u16);
50+
// Use pairwise addition and accumulate to add r2, g2, and b2 together
51+
uint32x4_t sum_sqr_u32 = vpadalq_u16(in_g2_u32, in_r2b2_u16);
52+
53+
// cmp_u32: if each pixel is within the range, its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
54+
return vcleq_u32(sum_sqr_u32, m_distance_squared_u32);
55+
}
56+
57+
private:
58+
uint8x16_t m_expected_color_rgb_u8;
59+
uint32x4_t m_distance_squared_u32;
60+
};
61+
62+
63+
64+
size_t filter_rgb32_euclidean_ARM64_NEON(
65+
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
66+
uint32_t* out, size_t out_bytes_per_row,
67+
uint32_t replacement, bool replace_color_within_range,
68+
uint32_t expected, double max_euclidean_distance
69+
){
70+
PixelTest_Rgb32Euclidean_ARM64_NEON tester(
71+
expected, max_euclidean_distance
72+
);
73+
FilterImage_Rgb32_ARM64_NEON<PixelTest_Rgb32Euclidean_ARM64_NEON> filter(
74+
tester, replacement, replace_color_within_range
75+
);
76+
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
77+
return filter.count();
78+
}
79+
size_t to_blackwhite_rgb32_euclidean_ARM64_NEON(
80+
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
81+
uint32_t* out, size_t out_bytes_per_row,
82+
bool in_range_black,
83+
uint32_t expected, double max_euclidean_distance
84+
){
85+
PixelTest_Rgb32Euclidean_ARM64_NEON tester(
86+
expected, max_euclidean_distance
87+
);
88+
ToBlackWhite_Rgb32_ARM64_NEON<PixelTest_Rgb32Euclidean_ARM64_NEON> filter(
89+
tester, in_range_black
90+
);
91+
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
92+
return filter.count();
93+
}
94+
95+
96+
97+
#if 0
98+
1899
class ImageFilter_RgbEuclidean_arm64_NEON{
19100
public:
20101
static const size_t VECTOR_SIZE = 4;
@@ -112,7 +193,7 @@ size_t filter_rgb32_euclidean_arm64_NEON(
112193
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
113194
return filter.count();
114195
}
115-
196+
#endif
116197

117198

118199

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_Default.cpp

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,38 +5,32 @@
55
*/
66

77
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
8+
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_Default.h"
89
#include "Kernels_ImageFilter_RGB32_Euclidean.h"
910

1011
namespace PokemonAutomation{
1112
namespace Kernels{
1213

1314

1415

15-
class ImageFilter_RgbEuclidean_Default{
16+
17+
class PixelTest_Rgb32Euclidean_Default{
1618
public:
1719
static const size_t VECTOR_SIZE = 1;
1820
using Mask = size_t;
1921

2022
public:
21-
ImageFilter_RgbEuclidean_Default(
22-
uint32_t replacement_color, bool replace_color_within_range,
23-
uint32_t expected_color, double max_euclidean_distance
23+
PA_FORCE_INLINE PixelTest_Rgb32Euclidean_Default(
24+
uint32_t expected, double max_euclidean_distance
2425
)
25-
: m_replacement_color(replacement_color)
26-
, m_replace_color_within_range(replace_color_within_range ? 1 : 0)
27-
, m_expected_r((expected_color & 0x00ff0000) >> 16)
28-
, m_expected_g((expected_color & 0x0000ff00) >> 8)
29-
, m_expected_b(expected_color & 0x000000ff)
26+
: m_expected_r((expected & 0x00ff0000) >> 16)
27+
, m_expected_g((expected & 0x0000ff00) >> 8)
28+
, m_expected_b(expected & 0x000000ff)
3029
, m_max_distance_squared((uint32_t)(max_euclidean_distance * max_euclidean_distance))
31-
, m_count(0)
3230
{}
3331

34-
PA_FORCE_INLINE size_t count() const{
35-
return m_count;
36-
}
37-
38-
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
39-
uint32_t pixel = in[0];
32+
// Return a mask indicating which lanes are in range.
33+
PA_FORCE_INLINE bool test_word(uint32_t pixel) const{
4034
uint32_t sum_sqr = 0;
4135
{
4236
uint32_t p = (pixel & 0x00ff0000) >> 16;
@@ -53,37 +47,51 @@ class ImageFilter_RgbEuclidean_Default{
5347
p -= m_expected_b;
5448
sum_sqr += p * p;
5549
}
56-
uint32_t ret = sum_sqr <= m_max_distance_squared;
57-
m_count += ret;
58-
ret ^= m_replace_color_within_range;
59-
out[0] = ret ? pixel : m_replacement_color;
60-
}
61-
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
62-
process_full(out, in);
50+
return sum_sqr <= m_max_distance_squared;
6351
}
6452

6553
private:
66-
const uint32_t m_replacement_color;
67-
const uint32_t m_replace_color_within_range;
6854
uint32_t m_expected_r;
6955
uint32_t m_expected_g;
7056
uint32_t m_expected_b;
7157
uint32_t m_max_distance_squared;
72-
size_t m_count;
7358
};
59+
60+
61+
7462
size_t filter_rgb32_euclidean_Default(
7563
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
7664
uint32_t* out, size_t out_bytes_per_row,
7765
uint32_t replacement, bool replace_color_within_range,
7866
uint32_t expected, double max_euclidean_distance
7967
){
80-
ImageFilter_RgbEuclidean_Default filter(
81-
replacement, replace_color_within_range,
68+
PixelTest_Rgb32Euclidean_Default tester(
8269
expected, max_euclidean_distance
8370
);
71+
FilterImage_Rgb32_Default<PixelTest_Rgb32Euclidean_Default> filter(
72+
tester, replacement, replace_color_within_range
73+
);
8474
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
8575
return filter.count();
8676
}
77+
size_t to_blackwhite_rgb32_euclidean_Default(
78+
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
79+
uint32_t* out, size_t out_bytes_per_row,
80+
bool in_range_black,
81+
uint32_t expected, double max_euclidean_distance
82+
){
83+
PixelTest_Rgb32Euclidean_Default tester(
84+
expected, max_euclidean_distance
85+
);
86+
ToBlackWhite_Rgb32_Default<PixelTest_Rgb32Euclidean_Default> filter(
87+
tester, in_range_black
88+
);
89+
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
90+
return filter.count();
91+
}
92+
93+
94+
8795

8896

8997

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_x64_AVX2.cpp

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,55 +7,32 @@
77
#ifdef PA_AutoDispatch_x64_13_Haswell
88

99
#include <immintrin.h>
10-
#include "Kernels/Kernels_x64_AVX2.h"
1110
#include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_x64_AVX2.h"
1211
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
12+
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_x64_AVX2.h"
1313
#include "Kernels_ImageFilter_RGB32_Euclidean.h"
1414

1515
namespace PokemonAutomation{
1616
namespace Kernels{
1717

1818

19-
class ImageFilter_RgbEuclidean_x64_AVX2{
19+
20+
class PixelTest_Rgb32Euclidean_x64_AVX2{
2021
public:
2122
static const size_t VECTOR_SIZE = 8;
2223
using Mask = PartialWordAccess32_x64_AVX2;
2324

2425
public:
25-
ImageFilter_RgbEuclidean_x64_AVX2(
26-
uint32_t replacement, bool replace_color_within_range,
26+
PA_FORCE_INLINE PixelTest_Rgb32Euclidean_x64_AVX2(
2727
uint32_t expected, double max_euclidean_distance
2828
)
29-
: m_replacement(_mm256_set1_epi32(replacement))
30-
, m_invert(replace_color_within_range ? _mm256_set1_epi32(-1) : _mm256_setzero_si256())
31-
, m_expected_ag(_mm256_set1_epi32((expected >> 8) & 0x000000ff))
29+
: m_expected_ag(_mm256_set1_epi32((expected >> 8) & 0x000000ff))
3230
, m_expected_rb(_mm256_set1_epi32(expected & 0x00ff00ff))
3331
, m_distance_squared(_mm256_set1_epi32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
34-
, m_count(_mm256_setzero_si256())
3532
{}
3633

37-
PA_FORCE_INLINE size_t count() const{
38-
return reduce_add32_x64_AVX2(m_count);
39-
}
40-
41-
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
42-
__m256i pixel = _mm256_loadu_si256((const __m256i*)in);
43-
__m256i in_range_pixels = process_word(pixel);
44-
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
45-
_mm256_storeu_si256((__m256i*)out, pixel);
46-
}
47-
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
48-
__m256i pixel = mask.load_i32(in);
49-
__m256i in_range_pixels = process_word(pixel);
50-
in_range_pixels = _mm256_and_si256(in_range_pixels, mask.mask());
51-
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
52-
mask.store(out, pixel);
53-
}
54-
55-
private:
56-
// Process the pixel in-place.
5734
// Return a mask indicating which lanes are in range.
58-
PA_FORCE_INLINE __m256i process_word(__m256i& pixel) const{
35+
PA_FORCE_INLINE __m256i test_word(__m256i pixel) const{
5936
__m256i ag = _mm256_and_si256(_mm256_srli_epi16(pixel, 8), _mm256_set1_epi32(0x000000ff));
6037
__m256i rb = _mm256_and_si256(pixel, _mm256_set1_epi32(0x00ff00ff));
6138

@@ -70,30 +47,44 @@ class ImageFilter_RgbEuclidean_x64_AVX2{
7047
__m256i sum_sqr = _mm256_add_epi32(r, g);
7148
sum_sqr = _mm256_add_epi32(sum_sqr, b);
7249

73-
__m256i cmp = _mm256_cmpgt_epi32(m_distance_squared, sum_sqr);
74-
pixel = _mm256_blendv_epi8(m_replacement, pixel, _mm256_xor_si256(cmp, m_invert));
75-
76-
return cmp;
50+
return _mm256_cmpgt_epi32(m_distance_squared, sum_sqr);
7751
}
7852

7953
private:
80-
const __m256i m_replacement;
81-
const __m256i m_invert;
8254
const __m256i m_expected_ag;
8355
const __m256i m_expected_rb;
8456
const __m256i m_distance_squared;
85-
__m256i m_count;
8657
};
58+
59+
60+
8761
size_t filter_rgb32_euclidean_x64_AVX2(
8862
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
8963
uint32_t* out, size_t out_bytes_per_row,
9064
uint32_t replacement, bool replace_color_within_range,
9165
uint32_t expected, double max_euclidean_distance
9266
){
93-
ImageFilter_RgbEuclidean_x64_AVX2 filter(
94-
replacement, replace_color_within_range,
67+
PixelTest_Rgb32Euclidean_x64_AVX2 tester(
9568
expected, max_euclidean_distance
9669
);
70+
FilterImage_Rgb32_x64_AVX2<PixelTest_Rgb32Euclidean_x64_AVX2> filter(
71+
tester, replacement, replace_color_within_range
72+
);
73+
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
74+
return filter.count();
75+
}
76+
size_t to_blackwhite_rgb32_euclidean_x64_AVX2(
77+
const uint32_t* in, size_t in_bytes_per_row, size_t width, size_t height,
78+
uint32_t* out, size_t out_bytes_per_row,
79+
bool in_range_black,
80+
uint32_t expected, double max_euclidean_distance
81+
){
82+
PixelTest_Rgb32Euclidean_x64_AVX2 tester(
83+
expected, max_euclidean_distance
84+
);
85+
ToBlackWhite_Rgb32_x64_AVX2<PixelTest_Rgb32Euclidean_x64_AVX2> filter(
86+
tester, in_range_black
87+
);
9788
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
9889
return filter.count();
9990
}

0 commit comments

Comments
 (0)