From 2d2be302a54131fe6ad9c02eb91be020e7ca1c42 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sun, 28 Dec 2025 22:05:41 +0000 Subject: [PATCH 1/9] Add base64 benchmark tool and optimize encoding/decoding Add Tools/binasciibench/binasciibench.py benchmark for measuring base64 encoding/decoding throughput. Optimize base64 encoding/decoding by eliminating loop-carried dependencies. Key changes: - Add base64_encode_trio() and base64_decode_quad() helper functions that process complete groups independently - Add base64_encode_fast() and base64_decode_fast() wrappers - Update b2a_base64 and a2b_base64 to use fast path for complete groups Performance gains (encode/decode speedup vs main, PGO builds): 64 bytes 64K 1M Zen2: 1.1x/1.6x 1.6x/2.4x 1.4x/2.4x Zen4: 1.2x/1.7x 1.6x/3.0x 1.5x/3.0x M4: 1.3x/1.9x 2.3x/2.8x 2.4x/2.9x RPi5-32: 1.4x/1.4x 2.4x/2.0x 2.0x/1.9x Additional SIMD implementations (NEON, AVX-512 VBMI) can achieve +50% to +1500% further gains and are planned for follow-on work. Co-authored-by: Claude Opus 4.5 --- Modules/binascii.c | 155 ++++++++++++++++++--- Tools/binasciibench/binasciibench.py | 193 +++++++++++++++++++++++++++ 2 files changed, 328 insertions(+), 20 deletions(-) create mode 100644 Tools/binasciibench/binasciibench.py diff --git a/Modules/binascii.c b/Modules/binascii.c index 13e4bc5be03ebd..aa73528bc6c5d5 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -101,6 +101,103 @@ static const unsigned char table_a2b_base64[] = { /* Max binary chunk size; limited only by available memory */ #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) +/* + * Base64 encoding/decoding helpers optimized for throughput. + * + * Key optimization: Process complete groups (3 bytes -> 4 chars for encode, + * 4 chars -> 3 bytes for decode) without loop-carried dependencies. + * This allows the compiler to better optimize the hot loops. + */ + +/* Forward declaration for table defined after the inline functions */ +static const unsigned char table_b2a_base64[]; + +/* Encode 3 bytes into 4 base64 characters. */ +static inline void +base64_encode_trio(const unsigned char *in, unsigned char *out, + const unsigned char *table) +{ + unsigned int combined = ((unsigned int)in[0] << 16) | + ((unsigned int)in[1] << 8) | + (unsigned int)in[2]; + out[0] = table[(combined >> 18) & 0x3f]; + out[1] = table[(combined >> 12) & 0x3f]; + out[2] = table[(combined >> 6) & 0x3f]; + out[3] = table[combined & 0x3f]; +} + +/* Encode multiple complete 3-byte groups. + * Returns the number of input bytes processed (always a multiple of 3). + */ +static inline Py_ssize_t +base64_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char *table) +{ + Py_ssize_t n_trios = in_len / 3; + Py_ssize_t i; + + /* Process complete 3-byte groups. Each iteration is independent. */ + for (i = 0; i < n_trios; i++) { + base64_encode_trio(in + i * 3, out + i * 4, table); + } + + return n_trios * 3; +} + +/* Decode 4 base64 characters into 3 bytes. + * Returns 1 on success, 0 if any character is invalid. 
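+ *
+ * Worked example (editor's illustration): decoding "TWFu" looks up
+ * v0..v3 = 19, 22, 5, 46 and reassembles the bytes of "Man":
+ *   out[0] = (19 << 2) | (22 >> 4)         = 0x4D 'M'
+ *   out[1] = ((22 << 4) | (5 >> 2)) & 0xff = 0x61 'a'
+ *   out[2] = ((5 << 6) | 46) & 0xff        = 0x6E 'n'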
+ */
+static inline int
+base64_decode_quad(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned char v0 = table[in[0]];
+    unsigned char v1 = table[in[1]];
+    unsigned char v2 = table[in[2]];
+    unsigned char v3 = table[in[3]];
+
+    if ((v0 | v1 | v2 | v3) & 0xc0) {
+        return 0;
+    }
+
+    out[0] = (v0 << 2) | (v1 >> 4);
+    out[1] = (v1 << 4) | (v2 >> 2);
+    out[2] = (v2 << 6) | v3;
+    return 1;
+}
+
+/* Decode multiple complete 4-character groups (no padding allowed).
+ * Returns the number of input characters processed.
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_quads = in_len / 4;
+    Py_ssize_t i;
+
+    /* Process complete 4-character groups. Each iteration is mostly independent. */
+    for (i = 0; i < n_quads; i++) {
+        const unsigned char *inp = in + i * 4;
+
+        /* Check for padding - exit fast path to handle it properly.
+         * Four independent comparisons let the compiler choose the optimal
+         * approach; on modern pipelined CPUs this is faster than bitmask tricks
+         * like XOR+SUB+AND for zero-detection, which have data dependencies.
+         */
+        if (inp[0] == '=' || inp[1] == '=' || inp[2] == '=' || inp[3] == '=') {
+            break;
+        }
+
+        if (!base64_decode_quad(inp, out + i * 3, table)) {
+            break;
+        }
+    }
+
+    return i * 4;
+}
+
 static const unsigned char table_b2a_base64[] =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
@@ -403,10 +500,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
         goto error_end;
     }
 
+    size_t i = 0; /* Current position in input */
+
+    /* Fast path: use optimized decoder for complete quads.
+     * This works for both strict and non-strict mode for valid input.
+     * The fast path stops at padding, invalid chars, or incomplete groups.
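+     *
+     * Illustration (editor's example): for input b"TWFuTWFu" the fast
+     * path consumes all 8 characters; for b"TWFu=AAA" it stops after
+     * the first quad (i == 4) and the loop below handles the padding
+     * and anything that follows.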
+ */ + if (ascii_len >= 4) { + Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, + bin_data, table_a2b_base64); + if (fast_chars > 0) { + i = (size_t)fast_chars; + bin_data += (fast_chars / 4) * 3; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, partial groups) */ int quad_pos = 0; unsigned char leftchar = 0; int pads = 0; - for (size_t i = 0; i < ascii_len; i++) { + for (; i < ascii_len; i++) { unsigned char this_ch = ascii_data[i]; /* Check for pad sequences and ignore @@ -533,9 +646,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/ { const unsigned char *bin_data; - int leftbits = 0; - unsigned char this_ch; - unsigned int leftchar = 0; Py_ssize_t bin_len; binascii_state *state; @@ -566,26 +676,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) } unsigned char *ascii_data = PyBytesWriter_GetData(writer); - for( ; bin_len > 0 ; bin_len--, bin_data++ ) { - /* Shift the data into our buffer */ - leftchar = (leftchar << 8) | *bin_data; - leftbits += 8; - - /* See if there are 6-bit groups ready */ - while ( leftbits >= 6 ) { - this_ch = (leftchar >> (leftbits-6)) & 0x3f; - leftbits -= 6; - *ascii_data++ = table_b2a_base64[this_ch]; - } - } - if ( leftbits == 2 ) { - *ascii_data++ = table_b2a_base64[(leftchar&3) << 4]; + /* Use the optimized fast path for complete 3-byte groups */ + Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data, + table_b2a_base64); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 3) * 4; + bin_len -= fast_bytes; + + /* Handle remaining 0-2 bytes */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 base64 chars + 2 padding */ + unsigned int val = bin_data[0]; + *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f]; *ascii_data++ = BASE64_PAD; *ascii_data++ = BASE64_PAD; - } else if ( leftbits == 4 ) { - *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2]; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 3 base64 chars + 1 padding */ + unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f]; *ascii_data++ = BASE64_PAD; } + if (newline) *ascii_data++ = '\n'; /* Append a courtesy newline */ diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py new file mode 100644 index 00000000000000..fc67489fa2b20e --- /dev/null +++ b/Tools/binasciibench/binasciibench.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +"""Benchmark for binascii base64 encoding and decoding performance. + +This benchmark measures the throughput of base64 encoding and decoding +operations using the binascii module's C implementation. + +Usage: + python Tools/binasciibench/binasciibench.py [--sizes S1,S2,...] + +Each benchmark runs for ~1.5 seconds to ensure accurate measurements. 
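+
+Example (editor's illustration; the --sizes flag is defined in main()):
+
+    python Tools/binasciibench/binasciibench.py --sizes 64,65536
+
+Each output row reports the mean time per operation, its variance as a
+coefficient of variation, and the resulting throughput.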
+""" + +import argparse +import binascii +import os +import statistics +import sys +import time + +# Default test parameters +DEFAULT_SIZES = [64, 1024, 65536, 1048576] + +# Timing targets +TARGET_TOTAL_TIME_S = 1.5 # Target ~1.5 seconds total per benchmark +MIN_ITERATIONS = 5 # Minimum iterations for statistical significance +MIN_OPS_PER_ITER = 10 # Minimum operations per iteration + + +def generate_test_data(size): + """Generate random binary data of the specified size.""" + return os.urandom(size) + + +def generate_base64_data(size): + """Generate valid base64-encoded data of approximately the specified decoded size.""" + binary = os.urandom(size) + return binascii.b2a_base64(binary, newline=False) + + +def benchmark_encode(data, num_ops): + """Benchmark base64 encoding.""" + b2a = binascii.b2a_base64 + start = time.perf_counter_ns() + for _ in range(num_ops): + b2a(data, newline=False) + end = time.perf_counter_ns() + return end - start + + +def benchmark_decode(data, num_ops): + """Benchmark base64 decoding.""" + a2b = binascii.a2b_base64 + start = time.perf_counter_ns() + for _ in range(num_ops): + a2b(data) + end = time.perf_counter_ns() + return end - start + + +def calibrate_and_run(bench_func, data, target_total_s): + """Calibrate and run benchmark to achieve target total time. + + Returns (times_ns, num_ops) where times_ns is a list of per-iteration + timings and num_ops is the number of operations per iteration. + """ + # Quick calibration: measure time for a small batch + num_ops = MIN_OPS_PER_ITER + elapsed_ns = bench_func(data, num_ops) + time_per_op_ns = elapsed_ns / num_ops + + # Calculate ops and iterations to hit target total time + # We want: iterations * num_ops * time_per_op = target_total + # With constraint: iterations >= MIN_ITERATIONS + target_ns = target_total_s * 1_000_000_000 + + # Start with minimum iterations, calculate required ops + iterations = MIN_ITERATIONS + total_ops_needed = int(target_ns / time_per_op_ns) + num_ops = max(MIN_OPS_PER_ITER, total_ops_needed // iterations) + + # If num_ops would be huge, increase iterations instead + max_ops_per_iter = 1_000_000 + if num_ops > max_ops_per_iter: + num_ops = max_ops_per_iter + iterations = max(MIN_ITERATIONS, total_ops_needed // num_ops) + + # Warmup + bench_func(data, num_ops) + + # Timed runs + times_ns = [] + for _ in range(iterations): + elapsed_ns = bench_func(data, num_ops) + times_ns.append(elapsed_ns) + + return times_ns, num_ops + + +def format_throughput(bytes_per_second): + """Format throughput in human-readable units.""" + if bytes_per_second >= 1_000_000_000: + return f"{bytes_per_second / 1_000_000_000:.2f} GB/s" + elif bytes_per_second >= 1_000_000: + return f"{bytes_per_second / 1_000_000:.2f} MB/s" + elif bytes_per_second >= 1_000: + return f"{bytes_per_second / 1_000:.2f} KB/s" + else: + return f"{bytes_per_second:.2f} B/s" + + +def format_size(size): + """Format size in human-readable units.""" + if size >= 1_048_576: + return f"{size // 1_048_576}M" + elif size >= 1024: + return f"{size // 1024}K" + else: + return str(size) + + +def print_results(name, size, times_ns, num_ops, data_size): + """Print benchmark results.""" + # Calculate statistics + times_per_op_ns = [t / num_ops for t in times_ns] + mean_ns = statistics.mean(times_per_op_ns) + stdev_ns = statistics.stdev(times_per_op_ns) if len(times_per_op_ns) > 1 else 0 + + # Calculate throughput + bytes_per_ns = data_size / mean_ns + bytes_per_second = bytes_per_ns * 1_000_000_000 + throughput = format_throughput(bytes_per_second) + 
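+    # Editor's note: quick sanity check of the arithmetic above with
+    # round numbers: 1_048_576 bytes at a mean of 1_000_000 ns/op is
+    # ~1.05e9 B/s, which format_throughput() renders as "1.05 GB/s".
+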
+ # Calculate coefficient of variation + cv = (stdev_ns / mean_ns * 100) if mean_ns > 0 else 0 + + size_str = format_size(size) + print(f"{name:<20} {size_str:>8} {mean_ns:>12.1f} ns " + f"(+/- {cv:>5.1f}%) {throughput:>12}") + + +def run_all_benchmarks(sizes): + """Run all benchmark variants for all sizes.""" + print(f"binascii base64 benchmark") + print(f"Python: {sys.version}") + print(f"Target time per benchmark: {TARGET_TOTAL_TIME_S}s") + print() + print(f"{'Benchmark':<20} {'Size':>8} {'Time/op':>15} " + f"{'Variance':>10} {'Throughput':>12}") + print("-" * 75) + + for size in sizes: + # Generate test data + binary_data = generate_test_data(size) + base64_data = generate_base64_data(size) + + # Benchmark encode + times, num_ops = calibrate_and_run(benchmark_encode, binary_data, + TARGET_TOTAL_TIME_S) + print_results("b2a_base64", size, times, num_ops, size) + + # Benchmark decode + times, num_ops = calibrate_and_run(benchmark_decode, base64_data, + TARGET_TOTAL_TIME_S) + print_results("a2b_base64", size, times, num_ops, size) + + print() + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark binascii base64 encoding and decoding", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument( + "-s", "--sizes", + type=str, + default=None, + help="Comma-separated list of sizes to test (e.g., '64,256,1024')" + ) + + args = parser.parse_args() + + if args.sizes: + sizes = [int(s.strip()) for s in args.sizes.split(",")] + else: + sizes = DEFAULT_SIZES + + run_all_benchmarks(sizes) + + +if __name__ == "__main__": + main() From 573eaf3fb7d95b44a95225483eab26c09f757b67 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Dec 2025 00:30:15 +0000 Subject: [PATCH 2/9] Fix MSVC build: move table_b2a_base64 before inline functions MSVC doesn't support forward declarations of arrays without explicit size. Move the table definition before the inline functions that use it, eliminating the need for a forward declaration. Co-authored-by: Claude Opus 4.5 --- Modules/binascii.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Modules/binascii.c b/Modules/binascii.c index aa73528bc6c5d5..e2727b638c0eed 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -109,8 +109,8 @@ static const unsigned char table_a2b_base64[] = { * This allows the compiler to better optimize the hot loops. */ -/* Forward declaration for table defined after the inline functions */ -static const unsigned char table_b2a_base64[]; +static const unsigned char table_b2a_base64[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; /* Encode 3 bytes into 4 base64 characters. */ static inline void @@ -198,9 +198,6 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, return i * 4; } -static const unsigned char table_b2a_base64[] = -"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, From eaac67169aeb0894cdc8686dd9ee50e7b7aae6a3 Mon Sep 17 00:00:00 2001 From: "Gregory P. 
Smith"
Date: Mon, 29 Dec 2025 00:42:35 +0000
Subject: [PATCH 3/9] NEWS entry

---
 .../Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst

diff --git a/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
new file mode 100644
index 00000000000000..10c7f8632d736b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
@@ -0,0 +1,3 @@
+The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
+related codecs has been optimized for modern pipelined CPU architectures and
+now performs roughly 2-3x faster on the platforms measured.

From 060dbf5ee1ef38214b76c4d42807e1b29dd42c94 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 00:46:11 +0000
Subject: [PATCH 4/9] Add a whatsnew entry

---
 Doc/whatsnew/3.15.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 11f08031ec54f2..e8ab7c5b38f227 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -428,6 +428,12 @@ argparse
   inline code when color output is enabled.
   (Contributed by Savannah Ostrowski in :gh:`142390`.)
 
+base64 & binascii
+-----------------
+
+* CPython's underlying base64 implementation now encodes roughly 2x faster and
+  decodes roughly 3x faster thanks to simple CPU pipelining optimizations.
+
 calendar
 --------
 

From 1e12273abe55e73038b283430c38b8832fbcd5c1 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith" <68491+gpshead@users.noreply.github.com>
Date: Sun, 28 Dec 2025 17:07:24 -0800
Subject: [PATCH 5/9] Expose the benchmark defaults in the help text.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
---
 Tools/binasciibench/binasciibench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py
index fc67489fa2b20e..ec98bdee445e2c 100644
--- a/Tools/binasciibench/binasciibench.py
+++ b/Tools/binasciibench/binasciibench.py
@@ -175,7 +175,7 @@ def main():
     parser.add_argument(
         "-s", "--sizes",
         type=str,
-        default=None,
+        default=",".join(map(str, DEFAULT_SIZES)),
         help="Comma-separated list of sizes to test (e.g., '64,256,1024')"
     )

From ef38895ff3208ed439ba1c63e7a26ea1c77a72ca Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 01:37:58 +0000
Subject: [PATCH 6/9] Align base64 tables to 64-byte cache line boundaries

Add Py_ALIGNED(64) to both lookup tables so that each starts on a cache
line boundary and spans the fewest possible L1 cache lines, reducing
potential cache misses during encoding/decoding loops.
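
(Editor's note: Py_ALIGNED(64) is CPython's portable alignment macro from
Include/pyport.h; its expansion is compiler-specific, roughly:

    /* GCC/Clang */
    static const unsigned char table[256] __attribute__((aligned(64)));
    /* MSVC */
    __declspec(align(64)) static const unsigned char table[256];

With 64-byte alignment, the 256-entry decode table occupies exactly four
cache lines instead of potentially five, and the 64-entry encode table
starts on a line boundary.)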
Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index e2727b638c0eed..4a968d7fa8b90c 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
 }
 
 
-static const unsigned char table_a2b_base64[] = {
+/* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
+static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@@ -109,7 +110,8 @@ static const unsigned char table_a2b_base64[] = {
  * This allows the compiler to better optimize the hot loops.
  */
 
-static const unsigned char table_b2a_base64[] =
+/* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
+static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

From 7458c99764b5d590282f126d868c25e11ec0cada Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 01:57:27 +0000
Subject: [PATCH 7/9] Use BASE64_PAD macro instead of literal '=' in fast path

Replace hardcoded '=' characters with the BASE64_PAD macro for
consistency with the rest of the codebase.

Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index 4a968d7fa8b90c..861d6c92f10d6d 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -188,7 +188,8 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
          * approach; on modern pipelined CPUs this is faster than bitmask tricks
          * like XOR+SUB+AND for zero-detection, which have data dependencies.
          */
-        if (inp[0] == '=' || inp[1] == '=' || inp[2] == '=' || inp[3] == '=') {
+        if (inp[0] == BASE64_PAD || inp[1] == BASE64_PAD ||
+            inp[2] == BASE64_PAD || inp[3] == BASE64_PAD) {
             break;
         }

From 1f8ff742672fda7cb6b88689f4be74f985ec6679 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 02:07:14 +0000
Subject: [PATCH 8/9] Simplify block comment for base64 helpers

Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index 861d6c92f10d6d..c947bf1e6b0203 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -103,11 +103,9 @@ static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
 #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
 
 /*
- * Base64 encoding/decoding helpers optimized for throughput.
+ * Fast base64 encoding/decoding helpers.
  *
- * Key optimization: Process complete groups (3 bytes -> 4 chars for encode,
- * 4 chars -> 3 bytes for decode) without loop-carried dependencies.
- * This allows the compiler to better optimize the hot loops.
+ * Process complete groups without loop-carried dependencies.
  */
 
 /* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
@@ -138,7 +136,6 @@ base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
     Py_ssize_t n_trios = in_len / 3;
     Py_ssize_t i;
 
-    /* Process complete 3-byte groups. Each iteration is independent.
*/ for (i = 0; i < n_trios; i++) { base64_encode_trio(in + i * 3, out + i * 4, table); } @@ -179,7 +176,6 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, Py_ssize_t n_quads = in_len / 4; Py_ssize_t i; - /* Process complete 4-character groups. Each iteration is mostly independent. */ for (i = 0; i < n_quads; i++) { const unsigned char *inp = in + i * 4; From 4b1245be20dcbad0158f2a70fbda7d01c6a59701 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Dec 2025 03:20:54 +0000 Subject: [PATCH 9/9] Remove binasciibench in favor of pyperformance bm_base64 addition PR. --- Tools/binasciibench/binasciibench.py | 193 --------------------------- 1 file changed, 193 deletions(-) delete mode 100644 Tools/binasciibench/binasciibench.py diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py deleted file mode 100644 index ec98bdee445e2c..00000000000000 --- a/Tools/binasciibench/binasciibench.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark for binascii base64 encoding and decoding performance. - -This benchmark measures the throughput of base64 encoding and decoding -operations using the binascii module's C implementation. - -Usage: - python Tools/binasciibench/binasciibench.py [--sizes S1,S2,...] - -Each benchmark runs for ~1.5 seconds to ensure accurate measurements. -""" - -import argparse -import binascii -import os -import statistics -import sys -import time - -# Default test parameters -DEFAULT_SIZES = [64, 1024, 65536, 1048576] - -# Timing targets -TARGET_TOTAL_TIME_S = 1.5 # Target ~1.5 seconds total per benchmark -MIN_ITERATIONS = 5 # Minimum iterations for statistical significance -MIN_OPS_PER_ITER = 10 # Minimum operations per iteration - - -def generate_test_data(size): - """Generate random binary data of the specified size.""" - return os.urandom(size) - - -def generate_base64_data(size): - """Generate valid base64-encoded data of approximately the specified decoded size.""" - binary = os.urandom(size) - return binascii.b2a_base64(binary, newline=False) - - -def benchmark_encode(data, num_ops): - """Benchmark base64 encoding.""" - b2a = binascii.b2a_base64 - start = time.perf_counter_ns() - for _ in range(num_ops): - b2a(data, newline=False) - end = time.perf_counter_ns() - return end - start - - -def benchmark_decode(data, num_ops): - """Benchmark base64 decoding.""" - a2b = binascii.a2b_base64 - start = time.perf_counter_ns() - for _ in range(num_ops): - a2b(data) - end = time.perf_counter_ns() - return end - start - - -def calibrate_and_run(bench_func, data, target_total_s): - """Calibrate and run benchmark to achieve target total time. - - Returns (times_ns, num_ops) where times_ns is a list of per-iteration - timings and num_ops is the number of operations per iteration. 
- """ - # Quick calibration: measure time for a small batch - num_ops = MIN_OPS_PER_ITER - elapsed_ns = bench_func(data, num_ops) - time_per_op_ns = elapsed_ns / num_ops - - # Calculate ops and iterations to hit target total time - # We want: iterations * num_ops * time_per_op = target_total - # With constraint: iterations >= MIN_ITERATIONS - target_ns = target_total_s * 1_000_000_000 - - # Start with minimum iterations, calculate required ops - iterations = MIN_ITERATIONS - total_ops_needed = int(target_ns / time_per_op_ns) - num_ops = max(MIN_OPS_PER_ITER, total_ops_needed // iterations) - - # If num_ops would be huge, increase iterations instead - max_ops_per_iter = 1_000_000 - if num_ops > max_ops_per_iter: - num_ops = max_ops_per_iter - iterations = max(MIN_ITERATIONS, total_ops_needed // num_ops) - - # Warmup - bench_func(data, num_ops) - - # Timed runs - times_ns = [] - for _ in range(iterations): - elapsed_ns = bench_func(data, num_ops) - times_ns.append(elapsed_ns) - - return times_ns, num_ops - - -def format_throughput(bytes_per_second): - """Format throughput in human-readable units.""" - if bytes_per_second >= 1_000_000_000: - return f"{bytes_per_second / 1_000_000_000:.2f} GB/s" - elif bytes_per_second >= 1_000_000: - return f"{bytes_per_second / 1_000_000:.2f} MB/s" - elif bytes_per_second >= 1_000: - return f"{bytes_per_second / 1_000:.2f} KB/s" - else: - return f"{bytes_per_second:.2f} B/s" - - -def format_size(size): - """Format size in human-readable units.""" - if size >= 1_048_576: - return f"{size // 1_048_576}M" - elif size >= 1024: - return f"{size // 1024}K" - else: - return str(size) - - -def print_results(name, size, times_ns, num_ops, data_size): - """Print benchmark results.""" - # Calculate statistics - times_per_op_ns = [t / num_ops for t in times_ns] - mean_ns = statistics.mean(times_per_op_ns) - stdev_ns = statistics.stdev(times_per_op_ns) if len(times_per_op_ns) > 1 else 0 - - # Calculate throughput - bytes_per_ns = data_size / mean_ns - bytes_per_second = bytes_per_ns * 1_000_000_000 - throughput = format_throughput(bytes_per_second) - - # Calculate coefficient of variation - cv = (stdev_ns / mean_ns * 100) if mean_ns > 0 else 0 - - size_str = format_size(size) - print(f"{name:<20} {size_str:>8} {mean_ns:>12.1f} ns " - f"(+/- {cv:>5.1f}%) {throughput:>12}") - - -def run_all_benchmarks(sizes): - """Run all benchmark variants for all sizes.""" - print(f"binascii base64 benchmark") - print(f"Python: {sys.version}") - print(f"Target time per benchmark: {TARGET_TOTAL_TIME_S}s") - print() - print(f"{'Benchmark':<20} {'Size':>8} {'Time/op':>15} " - f"{'Variance':>10} {'Throughput':>12}") - print("-" * 75) - - for size in sizes: - # Generate test data - binary_data = generate_test_data(size) - base64_data = generate_base64_data(size) - - # Benchmark encode - times, num_ops = calibrate_and_run(benchmark_encode, binary_data, - TARGET_TOTAL_TIME_S) - print_results("b2a_base64", size, times, num_ops, size) - - # Benchmark decode - times, num_ops = calibrate_and_run(benchmark_decode, base64_data, - TARGET_TOTAL_TIME_S) - print_results("a2b_base64", size, times, num_ops, size) - - print() - - -def main(): - parser = argparse.ArgumentParser( - description="Benchmark binascii base64 encoding and decoding", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ - ) - parser.add_argument( - "-s", "--sizes", - type=str, - default=",".join(map(str, DEFAULT_SIZES)), - help="Comma-separated list of sizes to test (e.g., '64,256,1024')" - ) - - args = 
parser.parse_args() - - if args.sizes: - sizes = [int(s.strip()) for s in args.sizes.split(",")] - else: - sizes = DEFAULT_SIZES - - run_all_benchmarks(sizes) - - -if __name__ == "__main__": - main()
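
Editor's appendix: a minimal standalone sketch of the dependency-free
encode loop described in PATCH 1/9, with the same 0-2 byte tail handling.
This is not the CPython source; the names (encode_trio, b64_encode) and
the demo in main() are illustrative only, and error handling is omitted.

#include <stdio.h>

static const char TABLE[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Encode one complete 3-byte group. No state is carried between calls,
 * so consecutive iterations of the caller's loop are independent. */
static void
encode_trio(const unsigned char *in, char *out)
{
    unsigned int combined = ((unsigned int)in[0] << 16) |
                            ((unsigned int)in[1] << 8) |
                            (unsigned int)in[2];
    out[0] = TABLE[(combined >> 18) & 0x3f];
    out[1] = TABLE[(combined >> 12) & 0x3f];
    out[2] = TABLE[(combined >> 6) & 0x3f];
    out[3] = TABLE[combined & 0x3f];
}

/* Encode len bytes; returns the number of output characters written.
 * The hot loop has no loop-carried dependency; leftover bytes are
 * padded afterwards, mirroring the tail logic in b2a_base64. */
static size_t
b64_encode(const unsigned char *in, size_t len, char *out)
{
    size_t trios = len / 3;
    size_t o = trios * 4;
    for (size_t i = 0; i < trios; i++) {
        encode_trio(in + i * 3, out + i * 4);
    }
    in += trios * 3;
    switch (len % 3) {
    case 1:                       /* 2 chars + 2 pads */
        out[o++] = TABLE[(in[0] >> 2) & 0x3f];
        out[o++] = TABLE[(in[0] << 4) & 0x3f];
        out[o++] = '=';
        out[o++] = '=';
        break;
    case 2: {                     /* 3 chars + 1 pad */
        unsigned int val = ((unsigned int)in[0] << 8) | in[1];
        out[o++] = TABLE[(val >> 10) & 0x3f];
        out[o++] = TABLE[(val >> 4) & 0x3f];
        out[o++] = TABLE[(val << 2) & 0x3f];
        out[o++] = '=';
        break;
    }
    }
    return o;
}

int
main(void)
{
    char out[16];
    size_t n = b64_encode((const unsigned char *)"Many", 4, out);
    printf("%.*s\n", (int)n, out);  /* prints TWFueQ== */
    return 0;
}

Compiled as plain C99 this prints TWFueQ==, which matches
base64.b64encode(b"Many") in Python.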