|
17 | 17 | #include <stdint.h> |
18 | 18 | #include <stddef.h> |
19 | 19 |
|
| 20 | +#ifdef USE_SIMD // HACL_CAN_COMPILE_VEC256 |
| 21 | +#include <immintrin.h> |
| 22 | +#endif |
| 23 | + |
20 | 24 | #define NEED_OPCODE_METADATA |
21 | 25 | #include "pycore_uop_metadata.h" // Uop tables |
22 | 26 | #undef NEED_OPCODE_METADATA |
@@ -307,7 +311,7 @@ static int |
307 | 311 | executor_traverse(PyObject *o, visitproc visit, void *arg) |
308 | 312 | { |
309 | 313 | _PyExecutorObject *executor = (_PyExecutorObject *)o; |
310 | | - for (uint32_t i = 0; i < executor->exit_count; i++) { |
| 314 | + for (uint32_t i = 0; i < executor->; i++) { |
311 | 315 | Py_VISIT(executor->exits[i].executor); |
312 | 316 | } |
313 | 317 | return 0; |
@@ -923,17 +927,53 @@ translate_bytecode_to_trace( |
/* Nonzero iff bit number `bit` is set in `array`, which is indexed as a
 * bitmap of 32-bit words: word (bit >> 5), position (bit & 31) in that word.
 * The mask is built from an unsigned 1 so that position 31 does not shift
 * into the sign bit of a signed int (undefined behaviour in C). */
#define BIT_IS_SET(array, bit) ((array)[(bit) >> 5] & (1U << ((bit) & 31)))
924 | 928 |
|
925 | 929 | /* Count the number of unused uops and exits |
| 930 | +* An optimized version of SIMD is used. |
926 | 931 | */ |
927 | 932 | static int |
928 | 933 | count_exits(_PyUOpInstruction *buffer, int length) |
929 | 934 | { |
930 | 935 | int exit_count = 0; |
| 936 | +#if defined(USE_SIMD) |
| 937 | + // Use SIMD instructions for optimization of counting |
| 938 | + // Assume that _PyUOpInstruction contains only opcode |
| 939 | + // and its size is a multiple of the SIMD register size |
| 940 | + |
| 941 | + // For AVX2 (256-bit registers) |
| 942 | + const __m256i exit_code = _mm256_set1_epi32(_EXIT_TRACE); |
| 943 | + const __m256i zero = _mm256_setzero_si256(); |
| 944 | + |
| 945 | + int i; |
| 946 | + |
| 947 | + // Process data in blocks of 8 elements (256 bits / 32 bits) |
| 948 | + for (i = 0; i < length - 7; i += 8) { |
| 949 | + __m256i vec = _mm256_load_si256((const __m256i*)(buffer + i)); |
| 950 | + __m256i cmp = _mm256_cmpeq_epi32(vec, exit_code); |
| 951 | + __m256i sum = _mm256_add_epi32(cmp, zero); |
| 952 | + |
| 953 | + // Sum horizontal values |
| 954 | + __m128i lo = _mm256_extracti128_si256(sum, 0); |
| 955 | + __m128i hi = _mm256_extracti128_si256(sum, 1); |
| 956 | + __m128i total = _mm_add_epi32(lo, hi); |
| 957 | + |
| 958 | + int result; |
| 959 | + _mm_store_si128((__m128i*)&result, total); |
| 960 | + exit_count += _mm_extract_epi32(total, 0) + _mm_extract_epi32(total, 1); |
| 961 | + } |
| 962 | + |
| 963 | + // Process remaining elements |
| 964 | + for (; i < length; i++) { |
| 965 | + if (buffer[i].opcode == _EXIT_TRACE) { |
| 966 | + exit_count++; |
| 967 | + } |
| 968 | + } |
| 969 | +#else |
931 | 970 | for (int i = 0; i < length; i++) { |
932 | 971 | int opcode = buffer[i].opcode; |
933 | 972 | if (opcode == _EXIT_TRACE) { |
934 | 973 | exit_count++; |
935 | 974 | } |
936 | 975 | } |
| 976 | +#endif |
937 | 977 | return exit_count; |
938 | 978 | } |
939 | 979 |
|
|
0 commit comments