Skip to content

Commit 8424793

Browse files
committed
Partial fix for padding pixels being counted toward in-range pixels.
1 parent f540a60 commit 8424793

File tree

4 files changed

+45
-18
lines changed

4 files changed

+45
-18
lines changed

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_AVX2.cpp

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,10 @@
1212
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1313
#include "Kernels_ImageFilter_RGB32_Range.h"
1414

15+
//#include <iostream>
16+
//using std::cout;
17+
//using std::endl;
18+
1519
namespace PokemonAutomation{
1620
namespace Kernels{
1721

@@ -42,25 +46,27 @@ class ImageFilterRunner_Rgb32Range_x64_AVX2{
4246

4347
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
4448
__m256i pixel = _mm256_loadu_si256((const __m256i*)in);
45-
pixel = process_word(pixel);
49+
__m256i in_range_pixels = process_word(pixel);
50+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
4651
_mm256_storeu_si256((__m256i*)out, pixel);
4752
}
4853
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
4954
__m256i pixel = mask.load_i32(in);
50-
pixel = process_word(pixel);
55+
__m256i in_range_pixels = process_word(pixel);
56+
in_range_pixels = _mm256_and_si256(in_range_pixels, mask.mask());
57+
m_count = _mm256_sub_epi32(m_count, in_range_pixels);
5158
mask.store(out, pixel);
5259
}
5360

5461
private:
55-
PA_FORCE_INLINE __m256i process_word(__m256i pixel){
62+
PA_FORCE_INLINE __m256i process_word(__m256i& pixel){
5663
__m256i adj = _mm256_xor_si256(pixel, _mm256_set1_epi8((uint8_t)0x80));
5764
__m256i cmp0 = _mm256_cmpgt_epi8(m_mins, adj);
5865
__m256i cmp1 = _mm256_cmpgt_epi8(adj, m_maxs);
5966
cmp0 = _mm256_or_si256(cmp0, cmp1);
6067
cmp0 = _mm256_cmpeq_epi32(cmp0, _mm256_setzero_si256());
61-
m_count = _mm256_sub_epi32(m_count, cmp0);
62-
cmp0 = _mm256_xor_si256(cmp0, m_invert);
63-
return _mm256_blendv_epi8(m_replacement, pixel, cmp0);
68+
pixel = _mm256_blendv_epi8(m_replacement, pixel, _mm256_xor_si256(cmp0, m_invert));
69+
return cmp0;
6470
}
6571

6672
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_AVX512.cpp

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,10 @@
1010
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1111
#include "Kernels_ImageFilter_RGB32_Range.h"
1212

13+
//#include <iostream>
14+
//using std::cout;
15+
//using std::endl;
16+
1317
namespace PokemonAutomation{
1418
namespace Kernels{
1519

@@ -55,17 +59,20 @@ class ImageFilterRunner_Rgb32Range_x64_AVX512{
5559

5660
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
5761
__m512i pixel = _mm512_loadu_si512((const __m512i*)in);
58-
pixel = process_word(pixel);
62+
__mmask16 in_range_pixels = process_word(pixel);
63+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
5964
_mm512_storeu_si512((__m512i*)out, pixel);
6065
}
6166
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
6267
__m512i pixel = _mm512_maskz_loadu_epi32(mask.m, in);
63-
pixel = process_word(pixel);
68+
__mmask16 in_range_pixels = process_word(pixel);
69+
in_range_pixels &= mask.m;
70+
m_count = _mm512_mask_sub_epi32(m_count, in_range_pixels, m_count, _mm512_set1_epi32(-1));
6471
_mm512_mask_storeu_epi32(out, mask.m, pixel);
6572
}
6673

6774
private:
68-
PA_FORCE_INLINE __m512i process_word(__m512i pixel){
75+
PA_FORCE_INLINE __mmask16 process_word(__m512i& pixel){
6976
#if 0
7077
__mmask64 cmp64A = _mm512_cmpgt_epu8_mask(m_mins, pixel);
7178
__mmask64 cmp64B = _mm512_cmpgt_epu8_mask(pixel, m_maxs);
@@ -77,9 +84,9 @@ class ImageFilterRunner_Rgb32Range_x64_AVX512{
7784
__m512i mask = _mm512_movm_epi8(cmp64B);
7885
__mmask16 cmp16 = _mm512_cmpeq_epi32_mask(mask, _mm512_set1_epi32(-1));
7986
#endif
80-
m_count = _mm512_mask_sub_epi32(m_count, cmp16, m_count, _mm512_set1_epi32(-1));
81-
cmp16 ^= m_invert;
82-
return _mm512_mask_blend_epi32(cmp16, m_replacement, pixel);
87+
pixel = _mm512_mask_blend_epi32(cmp16 ^ m_invert, m_replacement, pixel);
88+
89+
return cmp16;
8390
}
8491

8592
private:

SerialPrograms/Source/Kernels/ImageFilters/RGB32_Range/Kernels_ImageFilter_RGB32_Range_x64_SSE42.cpp

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,10 @@
1212
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic_Routines.h"
1313
#include "Kernels_ImageFilter_RGB32_Range.h"
1414

15+
//#include <iostream>
16+
//using std::cout;
17+
//using std::endl;
18+
1519
namespace PokemonAutomation{
1620
namespace Kernels{
1721

@@ -59,12 +63,20 @@ class ImageFilterRunner_Rgb32Range_x64_SSE42{
5963

6064
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
6165
__m128i pixel = _mm_loadu_si128((const __m128i*)in);
62-
pixel = process_word(pixel);
66+
__m128i in_range_pixels = process_word(pixel);
67+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
6368
_mm_storeu_si128((__m128i*)out, pixel);
6469
}
6570
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, const Mask& mask){
71+
__m128i vmask = _mm_cmpgt_epi32(
72+
_mm_set1_epi32((uint32_t)mask.left),
73+
_mm_setr_epi32(0, 1, 2, 3)
74+
);
75+
6676
__m128i pixel = mask.loader.load(in);
67-
pixel = process_word(pixel);
77+
__m128i in_range_pixels = process_word(pixel);
78+
in_range_pixels = _mm_and_si128(in_range_pixels, vmask);
79+
m_count = _mm_sub_epi32(m_count, in_range_pixels);
6880
size_t left = mask.left;
6981
do{
7082
out[0] = _mm_cvtsi128_si32(pixel);
@@ -74,15 +86,14 @@ class ImageFilterRunner_Rgb32Range_x64_SSE42{
7486
}
7587

7688
private:
77-
PA_FORCE_INLINE __m128i process_word(__m128i pixel){
89+
PA_FORCE_INLINE __m128i process_word(__m128i& pixel){
7890
__m128i adj = _mm_xor_si128(pixel, _mm_set1_epi8((uint8_t)0x80));
7991
__m128i cmp0 = _mm_cmpgt_epi8(m_mins, adj);
8092
__m128i cmp1 = _mm_cmpgt_epi8(adj, m_maxs);
8193
cmp0 = _mm_or_si128(cmp0, cmp1);
8294
cmp0 = _mm_cmpeq_epi32(cmp0, _mm_setzero_si128());
83-
m_count = _mm_sub_epi32(m_count, cmp0);
84-
cmp0 = _mm_xor_si128(cmp0, m_invert);
85-
return _mm_blendv_epi8(m_replacement, pixel, cmp0);
95+
pixel = _mm_blendv_epi8(m_replacement, pixel, _mm_xor_si128(cmp0, m_invert));
96+
return cmp0;
8697
}
8798

8899
private:

SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_x64_AVX2.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,9 @@ class PartialWordAccess32_x64_AVX2{
2525
);
2626
}
2727

28+
PA_FORCE_INLINE __m256i mask() const{
29+
return m_mask;
30+
}
2831
PA_FORCE_INLINE __m256i load_i32(const void* ptr) const{
2932
return _mm256_maskload_epi32((const int*)ptr, m_mask);
3033
}

0 commit comments

Comments
 (0)