Skip to content

Commit 82d2398

Browse files
committed
Fix rest of RGB32 range filter counting padding pixels.
1 parent 8424793 commit 82d2398

7 files changed

+93
-40
lines changed

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Brightness/Kernels_ImageFilter_RGB32_Brightness_x64_AVX512-VNNI.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,9 @@ class ToBlackWhite_RgbBrightness_x64_AVX512{
6868
}
6969

7070
private:
71-
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel){
72-
// cout << "asdf" << endl;
71+
// Process the pixel in-place.
72+
// Return a mask indicating which lanes are in range.
73+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel) const{
7374
// __m512i r = _mm512_and_si512(_mm512_srli_epi32(pixel, 16), _mm512_set1_epi32(0x000000ff));
7475
// __m512i g = _mm512_and_si512(_mm512_srli_epi32(pixel, 8), _mm512_set1_epi32(0x000000ff));
7576
// __m512i b = _mm512_and_si512(pixel, _mm512_set1_epi32(0x000000ff));
@@ -85,7 +86,11 @@ class ToBlackWhite_RgbBrightness_x64_AVX512{
8586
__mmask16 cmp16 = _mm512_cmple_epu32_mask(pixel, m_threshold);
8687

8788
// Set to black or white.
88-
pixel = _mm512_mask_blend_epi32(cmp16 ^ m_in_range_black, _mm512_set1_epi32(0xff000000), _mm512_set1_epi32(-1));
89+
pixel = _mm512_mask_blend_epi32(
90+
cmp16 ^ m_in_range_black,
91+
_mm512_set1_epi32(0xff000000),
92+
_mm512_set1_epi32(-1)
93+
);
8994

9095
return cmp16;
9196
}

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_x64_AVX2.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,22 @@ class ImageFilter_RgbEuclidean_x64_AVX2{
4040

4141
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
4242
__m256i pixel = _mm256_loadu_si256((const __m256i*)in);
43-
pixel = process_word(pixel);
43+
__m256i in_range_pixels = process_word(pixel);
44+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
4445
_mm256_storeu_si256((__m256i*)out, pixel);
4546
}
4647
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
4748
__m256i pixel = mask.load_i32(in);
48-
pixel = process_word(pixel);
49+
__m256i in_range_pixels = process_word(pixel);
50+
in_range_pixels = _mm256_and_si256(in_range_pixels, mask.mask());
51+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
4952
mask.store(out, pixel);
5053
}
5154

5255
private:
53-
PA_FORCE_INLINE __m256i process_word(__m256i pixel){
56+
// Process the pixel in-place.
57+
// Return a mask indicating which lanes are in range.
58+
PA_FORCE_INLINE __m256i process_word(__m256i& pixel) const{
5459
__m256i ag = _mm256_and_si256(_mm256_srli_epi16(pixel, 8), _mm256_set1_epi32(0x000000ff));
5560
__m256i rb = _mm256_and_si256(pixel, _mm256_set1_epi32(0x00ff00ff));
5661

@@ -66,10 +71,9 @@ class ImageFilter_RgbEuclidean_x64_AVX2{
6671
sum_sqr = _mm256_add_epi32(sum_sqr, b);
6772

6873
__m256i cmp = _mm256_cmpgt_epi32(m_distance_squared, sum_sqr);
74+
pixel = _mm256_blendv_epi8(m_replacement, pixel, _mm256_xor_si256(cmp, m_invert));
6975

70-
m_count = _mm256_sub_epi32(m_count, cmp);
71-
cmp = _mm256_xor_si256(cmp, m_invert);
72-
return _mm256_blendv_epi8(m_replacement, pixel, cmp);
76+
return cmp;
7377
}
7478

7579
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_x64_AVX512.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,22 @@ class ImageFilterRunner_Rgb32Euclidean_x64_AVX512{
5252

5353
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
5454
__m512i pixel = _mm512_loadu_si512((const __m512i*)in);
55-
pixel = process_word(pixel);
55+
__mmask16 in_range_pixels = process_word(pixel);
56+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
5657
_mm512_storeu_si512((__m512i*)out, pixel);
5758
}
5859
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
5960
__m512i pixel = _mm512_maskz_loadu_epi32(mask.m, in);
60-
pixel = process_word(pixel);
61+
__mmask16 in_range_pixels = process_word(pixel);
62+
in_range_pixels &= mask.m;
63+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
6164
_mm512_mask_storeu_epi32(out, mask.m, pixel);
6265
}
6366

6467
private:
65-
PA_FORCE_INLINE __m512i process_word(__m512i pixel){
68+
// Process the pixel in-place.
69+
// Return a mask indicating which lanes are in range.
70+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel) const{
6671
__m512i ag = _mm512_and_si512(_mm512_srli_epi16(pixel, 8), _mm512_set1_epi32(0x000000ff));
6772
__m512i rb = _mm512_and_si512(pixel, _mm512_set1_epi32(0x00ff00ff));
6873

@@ -78,10 +83,9 @@ class ImageFilterRunner_Rgb32Euclidean_x64_AVX512{
7883
sum_sqr = _mm512_add_epi32(sum_sqr, b);
7984

8085
__mmask16 cmp = _mm512_cmpgt_epi32_mask(m_distance_squared, sum_sqr);
86+
pixel = _mm512_mask_blend_epi32(cmp ^ m_invert, m_replacement, pixel);
8187

82-
m_count = _mm512_mask_sub_epi32(m_count, cmp, m_count, _mm512_set1_epi32(-1));
83-
cmp ^= m_invert;
84-
return _mm512_mask_blend_epi32(cmp, m_replacement, pixel);
88+
return cmp;
8589
}
8690

8791
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_EuclideanDistance/Kernels_ImageFilter_RGB32_Euclidean_x64_SSE42.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,20 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
5858

5959
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
6060
__m128i pixel = _mm_loadu_si128((const __m128i*)in);
61-
pixel = process_word(pixel);
61+
__m128i in_range_pixels = process_word(pixel);
62+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
6263
_mm_storeu_si128((__m128i*)out, pixel);
6364
}
6465
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
66+
__m128i vmask = _mm_cmpgt_epi32(
67+
_mm_set1_epi32((uint32_t)mask.left),
68+
_mm_setr_epi32(0, 1, 2, 3)
69+
);
70+
6571
__m128i pixel = mask.loader.load(in);
66-
pixel = process_word(pixel);
72+
__m128i in_range_pixels = process_word(pixel);
73+
in_range_pixels = _mm_and_si128(in_range_pixels, vmask);
74+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
6775
size_t left = mask.left;
6876
do{
6977
out[0] = _mm_cvtsi128_si32(pixel);
@@ -73,7 +81,9 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
7381
}
7482

7583
private:
76-
PA_FORCE_INLINE __m128i process_word(__m128i pixel){
84+
// Process the pixel in-place.
85+
// Return a mask indicating which lanes are in range.
86+
PA_FORCE_INLINE __m128i process_word(__m128i& pixel) const{
7787
// _mm_srli_epi16: Shift 16-bit integers in pixels right by 8 while shifting in zeros,
7888
// ng: green channels of each pixel, but shifted right by 8 bits
7989
__m128i ng = _mm_and_si128(_mm_srli_epi16(pixel, 8), _mm_set1_epi32(0x000000ff));
@@ -98,10 +108,9 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
98108
sum_sqr = _mm_add_epi32(sum_sqr, b);
99109

100110
__m128i cmp = _mm_cmpgt_epi32(m_distance_squared, sum_sqr);
111+
pixel = _mm_blendv_epi8(m_replacement, pixel, _mm_xor_si128(cmp, m_invert));
101112

102-
m_count = _mm_sub_epi32(m_count, cmp);
103-
cmp = _mm_xor_si128(cmp, m_invert);
104-
return _mm_blendv_epi8(m_replacement, pixel, cmp);
113+
return cmp;
105114
}
106115

107116
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_AVX2.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ class ImageFilterRunner_Rgb32Range_x64_AVX2{
5959
}
6060

6161
private:
62-
PA_FORCE_INLINE __m256i process_word(__m256i& pixel){
62+
// Process the pixel in-place.
63+
// Return a mask indicating which lanes are in range.
64+
PA_FORCE_INLINE __m256i process_word(__m256i& pixel) const{
6365
__m256i adj = _mm256_xor_si256(pixel, _mm256_set1_epi8((uint8_t)0x80));
6466
__m256i cmp0 = _mm256_cmpgt_epi8(m_mins, adj);
6567
__m256i cmp1 = _mm256_cmpgt_epi8(adj, m_maxs);
@@ -129,25 +131,31 @@ class ToBlackWhite_RgbRange_x64_AVX2{
129131

130132
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
131133
__m256i pixel = _mm256_loadu_si256((const __m256i*)in);
132-
pixel = process_word(pixel);
134+
__m256i in_range_pixels = process_word(pixel);
135+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
133136
_mm256_storeu_si256((__m256i*)out, pixel);
134137
}
135138
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
136139
__m256i pixel = mask.load_i32(in);
137-
pixel = process_word(pixel);
140+
__m256i in_range_pixels = process_word(pixel);
141+
in_range_pixels = _mm256_and_si256(in_range_pixels, mask.mask());
142+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
138143
mask.store(out, pixel);
139144
}
140145

141146
private:
142-
PA_FORCE_INLINE __m256i process_word(__m256i pixel){
147+
// Process the pixel in-place.
148+
// Return a mask indicating which lanes are in range.
149+
PA_FORCE_INLINE __m256i process_word(__m256i& pixel) const{
143150
__m256i adj = _mm256_xor_si256(pixel, _mm256_set1_epi8((uint8_t)0x80));
144151
__m256i cmp0 = _mm256_cmpgt_epi8(m_mins, adj);
145152
__m256i cmp1 = _mm256_cmpgt_epi8(adj, m_maxs);
146153
cmp0 = _mm256_or_si256(cmp0, cmp1);
147154
cmp0 = _mm256_cmpeq_epi32(cmp0, _mm256_setzero_si256());
148-
m_count = _mm256_sub_epi32(m_count, cmp0);
155+
__m256i ret = cmp0;
149156
cmp0 = _mm256_xor_si256(cmp0, m_in_range_black);
150-
return _mm256_or_si256(cmp0, _mm256_set1_epi32(0xff000000));
157+
pixel = _mm256_or_si256(cmp0, _mm256_set1_epi32(0xff000000));
158+
return ret;
151159
}
152160

153161
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_AVX512.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,9 @@ class ImageFilterRunner_Rgb32Range_x64_AVX512{
7272
}
7373

7474
private:
75-
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel){
75+
// Process the pixel in-place.
76+
// Return a mask indicating which lanes are in range.
77+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel) const{
7678
#if 0
7779
__mmask64 cmp64A = _mm512_cmpgt_epu8_mask(m_mins, pixel);
7880
__mmask64 cmp64B = _mm512_cmpgt_epu8_mask(pixel, m_maxs);
@@ -146,17 +148,22 @@ class ToBlackWhite_RgbRange_x64_AVX512{
146148

147149
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
148150
__m512i pixel = _mm512_loadu_si512((const __m512i*)in);
149-
pixel = process_word(pixel);
151+
__mmask16 in_range_pixels = process_word(pixel);
152+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
150153
_mm512_storeu_si512((__m512i*)out, pixel);
151154
}
152155
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
153156
__m512i pixel = _mm512_maskz_loadu_epi32(mask.m, in);
154-
pixel = process_word(pixel);
157+
__mmask16 in_range_pixels = process_word(pixel);
158+
in_range_pixels &= mask.m;
159+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
155160
_mm512_mask_storeu_epi32(out, mask.m, pixel);
156161
}
157162

158163
private:
159-
PA_FORCE_INLINE __m512i process_word(__m512i pixel){
164+
// Process the pixel in-place.
165+
// Return a mask indicating which lanes are in range.
166+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel) const{
160167
#if 0
161168
__mmask64 cmp64A = _mm512_cmpgt_epu8_mask(m_mins, pixel);
162169
__mmask64 cmp64B = _mm512_cmpgt_epu8_mask(pixel, m_maxs);
@@ -168,9 +175,12 @@ class ToBlackWhite_RgbRange_x64_AVX512{
168175
__m512i mask = _mm512_movm_epi8(cmp64B);
169176
__mmask16 cmp16 = _mm512_cmpeq_epi32_mask(mask, _mm512_set1_epi32(-1));
170177
#endif
171-
m_count = _mm512_mask_sub_epi32(m_count, cmp16, m_count, _mm512_set1_epi32(-1));
172-
cmp16 ^= m_in_range_black;
173-
return _mm512_mask_blend_epi32(cmp16, _mm512_set1_epi32(0xff000000), _mm512_set1_epi32(-1));
178+
pixel = _mm512_mask_blend_epi32(
179+
cmp16 ^ m_in_range_black,
180+
_mm512_set1_epi32(0xff000000),
181+
_mm512_set1_epi32(-1)
182+
);
183+
return cmp16;
174184
}
175185

176186
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_SSE42.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ class ImageFilterRunner_Rgb32Range_x64_SSE42{
8686
}
8787

8888
private:
89-
PA_FORCE_INLINE __m128i process_word(__m128i& pixel){
89+
// Process the pixel in-place.
90+
// Return a mask indicating which lanes are in range.
91+
PA_FORCE_INLINE __m128i process_word(__m128i& pixel) const{
9092
__m128i adj = _mm_xor_si128(pixel, _mm_set1_epi8((uint8_t)0x80));
9193
__m128i cmp0 = _mm_cmpgt_epi8(m_mins, adj);
9294
__m128i cmp1 = _mm_cmpgt_epi8(adj, m_maxs);
@@ -155,12 +157,20 @@ class ToBlackWhite_RgbRange_x64_SSE42{
155157

156158
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
157159
__m128i pixel = _mm_loadu_si128((const __m128i*)in);
158-
pixel = process_word(pixel);
160+
__m128i in_range_pixels = process_word(pixel);
161+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
159162
_mm_storeu_si128((__m128i*)out, pixel);
160163
}
161164
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
165+
__m128i vmask = _mm_cmpgt_epi32(
166+
_mm_set1_epi32((uint32_t)mask.left),
167+
_mm_setr_epi32(0, 1, 2, 3)
168+
);
169+
162170
__m128i pixel = mask.loader.load(in);
163-
pixel = process_word(pixel);
171+
__m128i in_range_pixels = process_word(pixel);
172+
in_range_pixels = _mm_and_si128(in_range_pixels, vmask);
173+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
164174
size_t left = mask.left;
165175
do{
166176
out[0] = _mm_cvtsi128_si32(pixel);
@@ -170,15 +180,18 @@ class ToBlackWhite_RgbRange_x64_SSE42{
170180
}
171181

172182
private:
173-
PA_FORCE_INLINE __m128i process_word(__m128i pixel){
183+
// Process the pixel in-place.
184+
// Return a mask indicating which lanes are in range.
185+
PA_FORCE_INLINE __m128i process_word(__m128i& pixel) const{
174186
__m128i adj = _mm_xor_si128(pixel, _mm_set1_epi8((uint8_t)0x80));
175187
__m128i cmp0 = _mm_cmpgt_epi8(m_mins, adj);
176188
__m128i cmp1 = _mm_cmpgt_epi8(adj, m_maxs);
177189
cmp0 = _mm_or_si128(cmp0, cmp1);
178190
cmp0 = _mm_cmpeq_epi32(cmp0, _mm_setzero_si128());
179-
m_count = _mm_sub_epi32(m_count, cmp0);
191+
__m128i ret = cmp0;
180192
cmp0 = _mm_xor_si128(cmp0, m_in_range_black);
181-
return _mm_or_si128(cmp0, _mm_set1_epi32(0xff000000));
193+
pixel = _mm_or_si128(cmp0, _mm_set1_epi32(0xff000000));
194+
return ret;
182195
}
183196

184197
private:

0 commit comments

Comments
 (0)