Skip to content

Commit 1612570

Browse files
authored
Update optimizer.c
Use a SIMD-optimized implementation of the count_exits() function in cpython/Python/optimizer.c.
1 parent 149fbb0 commit 1612570

File tree

1 file changed

+41
-1
lines changed

1 file changed

+41
-1
lines changed

Python/optimizer.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
#include <stdint.h>
1818
#include <stddef.h>
1919

20+
#ifdef USE_SIMD // HACL_CAN_COMPILE_VEC256
21+
#include <immintrin.h>
22+
#endif
23+
2024
#define NEED_OPCODE_METADATA
2125
#include "pycore_uop_metadata.h" // Uop tables
2226
#undef NEED_OPCODE_METADATA
@@ -307,7 +311,7 @@ static int
307311
executor_traverse(PyObject *o, visitproc visit, void *arg)
308312
{
309313
_PyExecutorObject *executor = (_PyExecutorObject *)o;
310-
for (uint32_t i = 0; i < executor->exit_count; i++) {
314+
for (uint32_t i = 0; i < executor->; i++) {
311315
Py_VISIT(executor->exits[i].executor);
312316
}
313317
return 0;
@@ -923,17 +927,53 @@ translate_bytecode_to_trace(
923927
/* Evaluates to nonzero when bit number `bit` is set in `array`, which is
 * indexed as a bitmap of 32-bit words (word = bit / 32, position = bit % 32).
 * The mask is built from an unsigned 1 so that selecting position 31 does not
 * shift into the sign bit of a signed int, which is undefined behaviour. */
#define BIT_IS_SET(array, bit) ((array)[(bit)>>5] & (1U<<((bit)&31)))
924928

925929
/* Count the number of unused uops and exits
930+
* An optimized version of SIMD is used.
926931
*/
927932
static int
928933
count_exits(_PyUOpInstruction *buffer, int length)
929934
{
930935
int exit_count = 0;
936+
#if defined(USE_SIMD)
937+
// Use SIMD instructions for optimization of counting
938+
// Assume that _PyUOpInstruction contains only opcode
939+
// and its size is a multiple of the SIMD register size
940+
941+
// For AVX2 (256-bit registers)
942+
const __m256i exit_code = _mm256_set1_epi32(_EXIT_TRACE);
943+
const __m256i zero = _mm256_setzero_si256();
944+
945+
int i;
946+
947+
// Process data in blocks of 8 elements (256 bits / 32 bits)
948+
for (i = 0; i < length - 7; i += 8) {
949+
__m256i vec = _mm256_load_si256((const __m256i*)(buffer + i));
950+
__m256i cmp = _mm256_cmpeq_epi32(vec, exit_code);
951+
__m256i sum = _mm256_add_epi32(cmp, zero);
952+
953+
// Sum horizontal values
954+
__m128i lo = _mm256_extracti128_si256(sum, 0);
955+
__m128i hi = _mm256_extracti128_si256(sum, 1);
956+
__m128i total = _mm_add_epi32(lo, hi);
957+
958+
int result;
959+
_mm_store_si128((__m128i*)&result, total);
960+
exit_count += _mm_extract_epi32(total, 0) + _mm_extract_epi32(total, 1);
961+
}
962+
963+
// Process remaining elements
964+
for (; i < length; i++) {
965+
if (buffer[i].opcode == _EXIT_TRACE) {
966+
exit_count++;
967+
}
968+
}
969+
#else
931970
for (int i = 0; i < length; i++) {
932971
int opcode = buffer[i].opcode;
933972
if (opcode == _EXIT_TRACE) {
934973
exit_count++;
935974
}
936975
}
976+
#endif
937977
return exit_count;
938978
}
939979

0 commit comments

Comments
 (0)