Skip to content

Commit 5bb2f45

Browse files
committed
Refactor the image filter replacements to template function.
1 parent e593110 commit 5bb2f45

12 files changed

+394
-743
lines changed

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ PA_FORCE_INLINE void filter_per_pixel(
8888
}
8989

9090

91+
#if 0
92+
9193
template <typename Runner, typename Filter>
9294
PA_FORCE_INLINE void filter_per_pixel(
9395
const uint32_t* image, size_t bytes_per_row, size_t width, size_t height,
@@ -182,7 +184,7 @@ PA_FORCE_INLINE void filter_per_pixel(
182184
}
183185
}
184186

185-
187+
#endif
186188

187189

188190

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_ARM64_NEON.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,76 @@ namespace Kernels{
1414

1515

1616

17+
18+
template <typename PixelTester>
19+
class ToBlackWhite_Rgb32_ARM64_NEON{
20+
public:
21+
static const size_t VECTOR_SIZE = 4;
22+
using Mask = size_t;
23+
24+
public:
25+
ToBlackWhite_Rgb32_ARM64_NEON(
26+
const PixelTester& tester,
27+
uint32_t replacement_color, bool replace_color_within_range
28+
)
29+
: m_tester(tester)
30+
, m_replacement_color_u32(vdupq_n_u32(replacement_color))
31+
, m_replace_color_within_range(replace_color_within_range)
32+
, m_count_u32(vdupq_n_u32(0))
33+
{}
34+
35+
PA_FORCE_INLINE size_t count() const{
36+
// long pairwise add
37+
uint64x2_t sum_u64 = vpaddlq_u32(m_count_u32);
38+
return sum_u64[0] + sum_u64[1];
39+
}
40+
41+
// Given 4 pixels from in[4], apply color range comparison and count the pixels that are in range.
42+
// The counts are stored in m_count_u32.
43+
// If a per-pixel mask, cmp_mask_u32 is not nullptr, it only counts the pixels covered by the mask.
44+
// It also changes pixels in or out of the range to have the new color m_replacement_color_u32.
45+
// The resulting pixels are saved in out[4]
46+
PA_FORCE_INLINE void process_full(uint32_t out[4], const uint32_t in[4], const uint32x4_t* cmp_mask_u32 = nullptr){
47+
uint32x4_t pixel = vld1q_u32(in);
48+
uint32x4_t cmp_u32 = process_word(pixel);
49+
if (cmp_mask_u32) {
50+
cmp_u32 = vandq_u32(cmp_u32, *cmp_mask_u32);
51+
}
52+
// Increase count for each pixel in range. Each uint32 lane is counted separately.
53+
// We achieve +=1 by substracting 0xFFFFFFFF
54+
m_count_u32 = vsubq_u32(m_count_u32, cmp_u32);
55+
// select replacement color or in_u8 based on cmp_u32:
56+
uint32x4_t out_u32;
57+
if (m_replace_color_within_range){
58+
// vbslq_u32(a, b, c) for 1 bits in a, choose b; for 0 bits in a, choose c
59+
out_u32 = vbslq_u32(cmp_u32, m_replacement_color_u32, pixel);
60+
}else{
61+
out_u32 = vbslq_u32(cmp_u32, pixel, m_replacement_color_u32);
62+
}
63+
vst1q_u32(out, out_u32);
64+
}
65+
// Same as `process_full()` but only process `left` (< 4) pixels
66+
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
67+
uint32x4_t cmp_mask_u32 = vreinterpretq_u32_u8(PartialWordAccess_arm64_NEON::create_front_mask(left * 4));
68+
uint32_t buffer_in[4], buffer_out[4];
69+
memcpy(buffer_in, in, sizeof(uint32_t) * left);
70+
process_full(buffer_out, buffer_in, &cmp_mask_u32);
71+
memcpy(out, buffer_out, sizeof(uint32_t) * left);
72+
}
73+
74+
private:
75+
const PixelTester m_tester;
76+
uint32x4_t m_replacement_color_u32;
77+
bool m_replace_color_within_range;
78+
uint32x4_t m_count_u32;
79+
};
80+
81+
82+
83+
84+
85+
86+
1787
template <typename PixelTester>
1888
class ToBlackWhite_Rgb32_ARM64_NEON{
1989
public:

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_Default.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,52 @@ namespace Kernels{
1212

1313

1414

15+
16+
17+
template <typename PixelTester>
18+
class FilterImage_Rgb32_Default{
19+
public:
20+
static const size_t VECTOR_SIZE = 1;
21+
using Mask = size_t;
22+
23+
public:
24+
FilterImage_Rgb32_Default(
25+
const PixelTester& tester,
26+
uint32_t replacement_color, bool replace_color_within_range
27+
)
28+
: m_tester(tester)
29+
, m_replacement_color(replacement_color)
30+
, m_replace_color_within_range(replace_color_within_range)
31+
, m_count(0)
32+
{}
33+
34+
PA_FORCE_INLINE size_t count() const{
35+
return m_count;
36+
}
37+
38+
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
39+
uint32_t pixel = in[0];
40+
bool passed = m_tester.test_word(pixel);
41+
m_count += passed;
42+
passed ^= m_replace_color_within_range;
43+
out[0] = passed ? pixel : m_replacement_color;
44+
}
45+
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
46+
process_full(out, in);
47+
}
48+
49+
private:
50+
const PixelTester m_tester;
51+
const uint32_t m_replacement_color;
52+
const bool m_replace_color_within_range;
53+
size_t m_count;
54+
};
55+
56+
57+
58+
59+
60+
1561
template <typename PixelTester>
1662
class ToBlackWhite_Rgb32_Default{
1763
public:

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_x64_AVX2.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,63 @@ namespace Kernels{
1515

1616

1717

18+
template <typename PixelTester>
19+
class FilterImage_Rgb32_x64_AVX2{
20+
public:
21+
static const size_t VECTOR_SIZE = 8;
22+
using Mask = PartialWordAccess32_x64_AVX2;
23+
24+
public:
25+
FilterImage_Rgb32_x64_AVX2(
26+
const PixelTester& tester,
27+
uint32_t replacement, bool replace_color_within_range
28+
)
29+
: m_tester(tester)
30+
, m_replacement(_mm256_set1_epi32(replacement))
31+
, m_invert(replace_color_within_range ? _mm256_set1_epi32(-1) : _mm256_setzero_si256())
32+
, m_count(_mm256_setzero_si256())
33+
{}
34+
35+
PA_FORCE_INLINE size_t count() const{
36+
return reduce_add32_x64_AVX2(m_count);
37+
}
38+
39+
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
40+
__m256i pixel = _mm256_loadu_si256((const __m256i*)in);
41+
__m256i in_range_pixels = process_word(pixel);
42+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
43+
_mm256_storeu_si256((__m256i*)out, pixel);
44+
}
45+
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
46+
__m256i pixel = mask.load_i32(in);
47+
__m256i in_range_pixels = process_word(pixel);
48+
in_range_pixels = _mm256_and_si256(in_range_pixels, mask.mask());
49+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
50+
mask.store(out, pixel);
51+
}
52+
53+
private:
54+
// Process the pixel in-place.
55+
// Return a mask indicating which lanes are in range.
56+
PA_FORCE_INLINE __m256i process_word(__m256i& pixel) const{
57+
__m256i mask = m_tester.test_word(pixel);
58+
pixel = _mm256_blendv_epi8(
59+
m_replacement,
60+
pixel,
61+
_mm256_xor_si256(mask, m_invert)
62+
);
63+
return mask;
64+
}
65+
66+
private:
67+
const PixelTester m_tester;
68+
const __m256i m_replacement;
69+
const __m256i m_invert;
70+
__m256i m_count;
71+
};
72+
73+
74+
1875

1976
template <typename PixelTester>
2077
class ToBlackWhite_Rgb32_x64_AVX2{

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_x64_AVX512.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <immintrin.h>
99
#include "Common/Compiler.h"
1010

11+
1112
namespace PokemonAutomation{
1213
namespace Kernels{
1314

@@ -24,6 +25,65 @@ struct PartialWordMask{
2425

2526

2627

28+
template <typename PixelTester>
29+
class FilterImage_Rgb32_x64_AVX512{
30+
public:
31+
static const size_t VECTOR_SIZE = 16;
32+
using Mask = PartialWordMask;
33+
34+
public:
35+
FilterImage_Rgb32_x64_AVX512(
36+
const PixelTester& tester,
37+
uint32_t replacement, bool replace_color_within_range
38+
)
39+
: m_tester(tester)
40+
, m_replacement(_mm512_set1_epi32(replacement))
41+
, m_invert(replace_color_within_range ? 0xffff : 0)
42+
, m_count(_mm512_setzero_si512())
43+
{}
44+
45+
PA_FORCE_INLINE size_t count() const{
46+
return _mm512_reduce_add_epi32(m_count);
47+
}
48+
49+
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
50+
__m512i pixel = _mm512_loadu_si512((const __m512i*)in);
51+
__mmask16 in_range_pixels = process_word(pixel);
52+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
53+
_mm512_storeu_si512((__m512i*)out, pixel);
54+
}
55+
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
56+
__m512i pixel = _mm512_maskz_loadu_epi32(mask.m, in);
57+
__mmask16 in_range_pixels = process_word(pixel);
58+
in_range_pixels &= mask.m;
59+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
60+
_mm512_mask_storeu_epi32(out, mask.m, pixel);
61+
}
62+
63+
private:
64+
// Process the pixel in-place.
65+
// Return a mask indicating which lanes are in range.
66+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel) const{
67+
__mmask16 mask = m_tester.test_word(pixel);
68+
pixel = _mm512_mask_blend_epi32(
69+
mask ^ m_invert,
70+
m_replacement,
71+
pixel
72+
);
73+
return mask;
74+
}
75+
76+
private:
77+
const PixelTester m_tester;
78+
const __m512i m_replacement;
79+
const __mmask16 m_invert;
80+
__m512i m_count;
81+
};
82+
83+
84+
85+
86+
2787
template <typename PixelTester>
2888
class ToBlackWhite_Rgb32_x64_AVX512{
2989
public:

SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines_x64_SSE42.h

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,78 @@ struct PartialWordMask{
2828

2929

3030

31+
32+
template <typename PixelTester>
33+
class FilterImage_Rgb32_x64_SSE42{
34+
public:
35+
static const size_t VECTOR_SIZE = 4;
36+
using Mask = PartialWordMask;
37+
38+
public:
39+
FilterImage_Rgb32_x64_SSE42(
40+
const PixelTester& tester,
41+
uint32_t replacement, bool replace_color_within_range
42+
)
43+
: m_tester(tester)
44+
, m_replacement(_mm_set1_epi32(replacement))
45+
, m_invert(replace_color_within_range ? _mm_set1_epi32(-1) : _mm_setzero_si128())
46+
, m_count(_mm_setzero_si128())
47+
{}
48+
49+
PA_FORCE_INLINE size_t count() const{
50+
return reduce32_x64_SSE41(m_count);
51+
}
52+
53+
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
54+
__m128i pixel = _mm_loadu_si128((const __m128i*)in);
55+
__m128i in_range_pixels = process_word(pixel);
56+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
57+
_mm_storeu_si128((__m128i*)out, pixel);
58+
}
59+
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
60+
__m128i vmask = _mm_cmpgt_epi32(
61+
_mm_set1_epi32((uint32_t)mask.left),
62+
_mm_setr_epi32(0, 1, 2, 3)
63+
);
64+
65+
__m128i pixel = mask.loader.load(in);
66+
__m128i in_range_pixels = process_word(pixel);
67+
in_range_pixels = _mm_and_si128(in_range_pixels, vmask);
68+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
69+
size_t left = mask.left;
70+
do{
71+
out[0] = _mm_cvtsi128_si32(pixel);
72+
pixel = _mm_srli_si128(pixel, 4);
73+
out++;
74+
}while(--left);
75+
}
76+
77+
private:
78+
// Process the pixel in-place.
79+
// Return a mask indicating which lanes are in range.
80+
PA_FORCE_INLINE __m128i process_word(__m128i& pixel) const{
81+
__m128i mask = m_tester.test_word(pixel);
82+
pixel = _mm_blendv_epi8(
83+
m_replacement,
84+
pixel,
85+
_mm_xor_si128(mask, m_invert)
86+
);
87+
return mask;
88+
}
89+
90+
private:
91+
const PixelTester m_tester;
92+
const __m128i m_replacement;
93+
const __m128i m_invert;
94+
__m128i m_count;
95+
};
96+
97+
98+
99+
100+
101+
102+
31103
template <typename PixelTester>
32104
class ToBlackWhite_Rgb32_x64_SSE42{
33105
public:

0 commit comments

Comments
 (0)