Add shift by multiple constant#1220
Conversation
AntoinePrv
left a comment
There was a problem hiding this comment.
Also we need to add all the proper forward to dynamic for all other architectures to avoid ending up with a common implementation (unless that's all there is).
There was a problem hiding this comment.
I named this util as common seems to be more used for implementing the common architecture.
include/xsimd/arch/utils/shifts.hpp
Outdated
|
|
||
| template <class T, class T2, class A, class R, T... Vs> | ||
| XSIMD_INLINE batch<T, A> bitwise_lshift_as_twice_larger( | ||
| batch<T, A> const& self, batch_constant<T, A, Vs...>, R req) noexcept |
There was a problem hiding this comment.
Probably should remove R req here
|
@AntoinePrv unless there's is a huge need on your side, I won't schedule that one for the release. Let's just add masked load / store support for other architecture so that we can release, then begin a new commit wave :-) |
|
@serge-sans-paille I had hoped I could do it quickly before the weekend but it seems it will take longer. |
de97676 to
1cd12f9
Compare
This comment was marked as resolved.
This comment was marked as resolved.
c85add7 to
fb76fd7
Compare
caaab2a to
adead38
Compare
| namespace detail | ||
| { | ||
| // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift`` | ||
| // directly (or in a parent register function). | ||
| // The ``batch_constant`` overload is a rare but useful optimization. | ||
| // Running the detection here is less error prone than to add a fallback to all | ||
| // architectures. | ||
|
|
||
| template <class A, class B, class C, class = void> | ||
| struct has_bitwise_lshift_batch_const : std::false_type | ||
| { | ||
| }; | ||
|
|
||
| template <class A, class B, class C> | ||
| struct has_bitwise_lshift_batch_const<A, B, C, | ||
| void_t<decltype(kernel::bitwise_lshift<A>(std::declval<B>(), std::declval<C>(), A {}))>> | ||
| : std::true_type | ||
| { | ||
| }; |
There was a problem hiding this comment.
We run the detection of an optimization in the xsimd_api to avoid adding many error-prone overloads.
|
I've made a fresh take, and added the fallback to runtime API in This seems much improved over the runtime version. E.g. with GCC 15, SSE2, and uin8_t Compile time (new) .cfi_startproc
movdqa (%rdi), %xmm1
movdqa .LC0(%rip), %xmm0
pcmpeqd %xmm2, %xmm2
movdqa %xmm2, %xmm3
psllw $8, %xmm2
pmullw %xmm1, %xmm0
psrlw $8, %xmm3
pand %xmm2, %xmm1
pmullw .LC3(%rip), %xmm1
pand %xmm3, %xmm0
por %xmm1, %xmm0
ret
.cfi_endproc
.LFE6862:
.size _Z1fRKN5xsimd5batchIhNS_4sse2EEE, .-_Z1fRKN5xsimd5batchIhNS_4sse2EEE
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LC0:
.value 2
.value 8
.value 16
.value 4
.value 2
.value 2
.value 2
.value 2
.align 16
.LC3:
.value 4
.value 16
.value 8
.value 2
.value 2
.value 2
.value 2
.value 2Run time (before) .cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
movzbl 15(%rdi), %eax
movzbl 13(%rdi), %edx
movzbl 12(%rdi), %ecx
movzbl 8(%rdi), %r15d
movzbl 7(%rdi), %r12d
movzbl 11(%rdi), %esi
movzbl 10(%rdi), %r8d
movb %al, -12(%rsp)
movzbl 9(%rdi), %r13d
movzbl 14(%rdi), %eax
movb %dl, -10(%rsp)
xorl %edx, %edx
movzbl 6(%rdi), %r9d
movzbl 3(%rdi), %ebx
movb %cl, -9(%rsp)
xorl %ecx, %ecx
movzbl 5(%rdi), %r10d
movzbl 4(%rdi), %r11d
addl %r13d, %r13d
addl %r8d, %r8d
movzbl 2(%rdi), %ebp
movzbl 1(%rdi), %r14d
movb %al, -11(%rsp)
sall $4, %ebx
movzbl (%rdi), %edi
movzbl %bl, %ebx
sall $4, %r11d
movzbl %r8b, %r8d
sall $2, %r14d
sall $3, %ebp
movzbl %r11b, %r11d
salq $24, %rbx
movb %dil, %dl
movzbl %bpl, %ebp
salq $32, %r11
salq $16, %rbp
addb %dl, %dl
sall $3, %r10d
movq %rdx, -40(%rsp)
movq %r14, %rdx
movzbl %r10b, %r10d
sall $2, %r9d
movq %rcx, -32(%rsp)
salq $40, %r10
movzbl %r9b, %r9d
salq $16, %r8
movq -40(%rsp), %rcx
salq $48, %r9
movb %dl, %ch
movq -32(%rsp), %rdx
movq %rcx, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq $-16711681, %rdi
orq %rbp, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-4278190081, %rbp
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %rbp, %rdi
orq %rbx, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-1095216660481, %rbx
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %rbx, %rdi
orq %r11, %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
movabsq $-280375465082881, %r11
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
movq %rdx, -32(%rsp)
movq -32(%rsp), %rcx
andq %r11, %rdi
orq %r10, %rdi
movb %r15b, %cl
movabsq $-71776119061217281, %r10
movq %rdi, -40(%rsp)
movq -40(%rsp), %rdi
addb %cl, %cl
andq %r10, %rdi
orq %r9, %rdi
leal (%r12,%r12), %r9d
movq %rdi, -40(%rsp)
movq -40(%rsp), %r12
salq $56, %r9
movabsq $72057594037927935, %rdi
andq %rdi, %r12
orq %r9, %r12
movq %r12, -40(%rsp)
movq -40(%rsp), %rdx
movq %rdx, -40(%rsp)
movq %r13, %rdx
movq %rcx, -32(%rsp)
movq -32(%rsp), %rcx
movzbl -11(%rsp), %eax
movb %dl, %ch
movzbl -10(%rsp), %edx
movq %rcx, -32(%rsp)
movq -32(%rsp), %r9
movzbl -9(%rsp), %ecx
andq $-16711681, %r9
orq %r9, %r8
addl %esi, %esi
addl %ecx, %ecx
addl %edx, %edx
movq %r8, -32(%rsp)
movq -32(%rsp), %r8
movzbl %sil, %esi
addl %eax, %eax
salq $24, %rsi
movzbl %cl, %ecx
movzbl %dl, %edx
movzbl %al, %eax
salq $32, %rcx
andq %rbp, %r8
salq $40, %rdx
orq %r8, %rsi
salq $48, %rax
movq %rsi, -32(%rsp)
movq -32(%rsp), %rsi
andq %rbx, %rsi
orq %rsi, %rcx
movq %rcx, -32(%rsp)
movq -32(%rsp), %rcx
andq %r11, %rcx
orq %rcx, %rdx
movq %rdx, -32(%rsp)
movq -32(%rsp), %rdx
andq %r10, %rdx
orq %rdx, %rax
movq %rax, -32(%rsp)
movzbl -12(%rsp), %eax
leal (%rax,%rax), %edx
movq -32(%rsp), %rax
salq $56, %rdx
andq %rdi, %rax
orq %rdx, %rax
movq %rax, -32(%rsp)
movdqa -40(%rsp), %xmm0
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc |
0c02310 to
845b7e1
Compare
845b7e1 to
323353b
Compare
@serge-sans-paille @JohanMabille this ideas works, but I cannot figure out how to refactor
bitwise_lshift_as_twice_largerinto a separate header.The issue:
xsimd_sse2.hppneedsutils/shits.hppforbitwise_lshift_as_twice_largerbitwise_lshift_as_twice_largerneeds definitions fromxsimd_sse2.hpp.avx2Forward declaring all the needed functions overloads from
sse2andavx2and the arch types inutils/shits.hpp.Perhaps it should use the functions from
xsimd_apiinstead? (which would also be better if there is a better implementation further down in the inheritance tree).Close #1218