From 2d2be302a54131fe6ad9c02eb91be020e7ca1c42 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sun, 28 Dec 2025 22:05:41 +0000 Subject: [PATCH 1/9] Add base64 benchmark tool and optimize encoding/decoding Add Tools/binasciibench/binasciibench.py benchmark for measuring base64 encoding/decoding throughput. Optimize base64 encoding/decoding by eliminating loop-carried dependencies. Key changes: - Add base64_encode_trio() and base64_decode_quad() helper functions that process complete groups independently - Add base64_encode_fast() and base64_decode_fast() wrappers - Update b2a_base64 and a2b_base64 to use fast path for complete groups Performance gains (encode/decode speedup vs main, PGO builds): 64 bytes 64K 1M Zen2: 1.1x/1.6x 1.6x/2.4x 1.4x/2.4x Zen4: 1.2x/1.7x 1.6x/3.0x 1.5x/3.0x M4: 1.3x/1.9x 2.3x/2.8x 2.4x/2.9x RPi5-32: 1.4x/1.4x 2.4x/2.0x 2.0x/1.9x Additional SIMD implementations (NEON, AVX-512 VBMI) can achieve +50% to +1500% further gains and are planned for follow-on work. Co-authored-by: Claude Opus 4.5 --- Modules/binascii.c | 155 ++++++++++++++++++--- Tools/binasciibench/binasciibench.py | 193 +++++++++++++++++++++++++++ 2 files changed, 328 insertions(+), 20 deletions(-) create mode 100644 Tools/binasciibench/binasciibench.py diff --git a/Modules/binascii.c b/Modules/binascii.c index 13e4bc5be03ebd..aa73528bc6c5d5 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -101,6 +101,103 @@ static const unsigned char table_a2b_base64[] = { /* Max binary chunk size; limited only by available memory */ #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) +/* + * Base64 encoding/decoding helpers optimized for throughput. + * + * Key optimization: Process complete groups (3 bytes -> 4 chars for encode, + * 4 chars -> 3 bytes for decode) without loop-carried dependencies. + * This allows the compiler to better optimize the hot loops. + */ + +/* Forward declaration for table defined after the inline functions */ +static const unsigned char table_b2a_base64[]; + +/* Encode 3 bytes into 4 base64 characters. */ +static inline void +base64_encode_trio(const unsigned char *in, unsigned char *out, + const unsigned char *table) +{ + unsigned int combined = ((unsigned int)in[0] << 16) | + ((unsigned int)in[1] << 8) | + (unsigned int)in[2]; + out[0] = table[(combined >> 18) & 0x3f]; + out[1] = table[(combined >> 12) & 0x3f]; + out[2] = table[(combined >> 6) & 0x3f]; + out[3] = table[combined & 0x3f]; +} + +/* Encode multiple complete 3-byte groups. + * Returns the number of input bytes processed (always a multiple of 3). + */ +static inline Py_ssize_t +base64_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char *table) +{ + Py_ssize_t n_trios = in_len / 3; + Py_ssize_t i; + + /* Process complete 3-byte groups. Each iteration is independent. */ + for (i = 0; i < n_trios; i++) { + base64_encode_trio(in + i * 3, out + i * 4, table); + } + + return n_trios * 3; +} + +/* Decode 4 base64 characters into 3 bytes. + * Returns 1 on success, 0 if any character is invalid. 
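+ *
+ * Worked example (editor's illustration): decoding "TWFu" looks up
+ * v0..v3 = 19, 22, 5, 46 and reassembles the bytes of "Man":
+ *   out[0] = (19 << 2) | (22 >> 4)         = 0x4D 'M'
+ *   out[1] = ((22 << 4) | (5 >> 2)) & 0xff = 0x61 'a'
+ *   out[2] = ((5 << 6) | 46) & 0xff        = 0x6E 'n'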
+ */
+static inline int
+base64_decode_quad(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned char v0 = table[in[0]];
+    unsigned char v1 = table[in[1]];
+    unsigned char v2 = table[in[2]];
+    unsigned char v3 = table[in[3]];
+
+    if ((v0 | v1 | v2 | v3) & 0xc0) {
+        return 0;
+    }
+
+    out[0] = (v0 << 2) | (v1 >> 4);
+    out[1] = (v1 << 4) | (v2 >> 2);
+    out[2] = (v2 << 6) | v3;
+    return 1;
+}
+
+/* Decode multiple complete 4-character groups (no padding allowed).
+ * Returns the number of input characters processed.
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_quads = in_len / 4;
+    Py_ssize_t i;
+
+    /* Process complete 4-character groups. Each iteration is mostly independent. */
+    for (i = 0; i < n_quads; i++) {
+        const unsigned char *inp = in + i * 4;
+
+        /* Check for padding - exit fast path to handle it properly.
+         * Four independent comparisons let the compiler choose the optimal
+         * approach; on modern pipelined CPUs this is faster than bitmask tricks
+         * like XOR+SUB+AND for zero-detection, which have data dependencies.
+         */
+        if (inp[0] == '=' || inp[1] == '=' || inp[2] == '=' || inp[3] == '=') {
+            break;
+        }
+
+        if (!base64_decode_quad(inp, out + i * 3, table)) {
+            break;
+        }
+    }
+
+    return i * 4;
+}
+
 static const unsigned char table_b2a_base64[] =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
@@ -403,10 +500,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
         goto error_end;
     }
 
+    size_t i = 0; /* Current position in input */
+
+    /* Fast path: use optimized decoder for complete quads.
+     * This works for both strict and non-strict mode for valid input.
+     * The fast path stops at padding, invalid chars, or incomplete groups.
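+     *
+     * Illustration (editor's example): for input b"TWFuTWFu" the fast
+     * path consumes all 8 characters; for b"TWFu=AAA" it stops after
+     * the first quad (i == 4) and the loop below handles the padding
+     * and anything that follows.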
+ */ + if (ascii_len >= 4) { + Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, + bin_data, table_a2b_base64); + if (fast_chars > 0) { + i = (size_t)fast_chars; + bin_data += (fast_chars / 4) * 3; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, partial groups) */ int quad_pos = 0; unsigned char leftchar = 0; int pads = 0; - for (size_t i = 0; i < ascii_len; i++) { + for (; i < ascii_len; i++) { unsigned char this_ch = ascii_data[i]; /* Check for pad sequences and ignore @@ -533,9 +646,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/ { const unsigned char *bin_data; - int leftbits = 0; - unsigned char this_ch; - unsigned int leftchar = 0; Py_ssize_t bin_len; binascii_state *state; @@ -566,26 +676,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) } unsigned char *ascii_data = PyBytesWriter_GetData(writer); - for( ; bin_len > 0 ; bin_len--, bin_data++ ) { - /* Shift the data into our buffer */ - leftchar = (leftchar << 8) | *bin_data; - leftbits += 8; - - /* See if there are 6-bit groups ready */ - while ( leftbits >= 6 ) { - this_ch = (leftchar >> (leftbits-6)) & 0x3f; - leftbits -= 6; - *ascii_data++ = table_b2a_base64[this_ch]; - } - } - if ( leftbits == 2 ) { - *ascii_data++ = table_b2a_base64[(leftchar&3) << 4]; + /* Use the optimized fast path for complete 3-byte groups */ + Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data, + table_b2a_base64); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 3) * 4; + bin_len -= fast_bytes; + + /* Handle remaining 0-2 bytes */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 base64 chars + 2 padding */ + unsigned int val = bin_data[0]; + *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f]; *ascii_data++ = BASE64_PAD; *ascii_data++ = BASE64_PAD; - } else if ( leftbits == 4 ) { - *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2]; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 3 base64 chars + 1 padding */ + unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f]; *ascii_data++ = BASE64_PAD; } + if (newline) *ascii_data++ = '\n'; /* Append a courtesy newline */ diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py new file mode 100644 index 00000000000000..fc67489fa2b20e --- /dev/null +++ b/Tools/binasciibench/binasciibench.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +"""Benchmark for binascii base64 encoding and decoding performance. + +This benchmark measures the throughput of base64 encoding and decoding +operations using the binascii module's C implementation. + +Usage: + python Tools/binasciibench/binasciibench.py [--sizes S1,S2,...] + +Each benchmark runs for ~1.5 seconds to ensure accurate measurements. 
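+
+Example (editor's illustration; the --sizes flag is defined in main()):
+
+    python Tools/binasciibench/binasciibench.py --sizes 64,65536
+
+Each output row reports the mean time per operation, its variance as a
+coefficient of variation, and the resulting throughput.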
+""" + +import argparse +import binascii +import os +import statistics +import sys +import time + +# Default test parameters +DEFAULT_SIZES = [64, 1024, 65536, 1048576] + +# Timing targets +TARGET_TOTAL_TIME_S = 1.5 # Target ~1.5 seconds total per benchmark +MIN_ITERATIONS = 5 # Minimum iterations for statistical significance +MIN_OPS_PER_ITER = 10 # Minimum operations per iteration + + +def generate_test_data(size): + """Generate random binary data of the specified size.""" + return os.urandom(size) + + +def generate_base64_data(size): + """Generate valid base64-encoded data of approximately the specified decoded size.""" + binary = os.urandom(size) + return binascii.b2a_base64(binary, newline=False) + + +def benchmark_encode(data, num_ops): + """Benchmark base64 encoding.""" + b2a = binascii.b2a_base64 + start = time.perf_counter_ns() + for _ in range(num_ops): + b2a(data, newline=False) + end = time.perf_counter_ns() + return end - start + + +def benchmark_decode(data, num_ops): + """Benchmark base64 decoding.""" + a2b = binascii.a2b_base64 + start = time.perf_counter_ns() + for _ in range(num_ops): + a2b(data) + end = time.perf_counter_ns() + return end - start + + +def calibrate_and_run(bench_func, data, target_total_s): + """Calibrate and run benchmark to achieve target total time. + + Returns (times_ns, num_ops) where times_ns is a list of per-iteration + timings and num_ops is the number of operations per iteration. + """ + # Quick calibration: measure time for a small batch + num_ops = MIN_OPS_PER_ITER + elapsed_ns = bench_func(data, num_ops) + time_per_op_ns = elapsed_ns / num_ops + + # Calculate ops and iterations to hit target total time + # We want: iterations * num_ops * time_per_op = target_total + # With constraint: iterations >= MIN_ITERATIONS + target_ns = target_total_s * 1_000_000_000 + + # Start with minimum iterations, calculate required ops + iterations = MIN_ITERATIONS + total_ops_needed = int(target_ns / time_per_op_ns) + num_ops = max(MIN_OPS_PER_ITER, total_ops_needed // iterations) + + # If num_ops would be huge, increase iterations instead + max_ops_per_iter = 1_000_000 + if num_ops > max_ops_per_iter: + num_ops = max_ops_per_iter + iterations = max(MIN_ITERATIONS, total_ops_needed // num_ops) + + # Warmup + bench_func(data, num_ops) + + # Timed runs + times_ns = [] + for _ in range(iterations): + elapsed_ns = bench_func(data, num_ops) + times_ns.append(elapsed_ns) + + return times_ns, num_ops + + +def format_throughput(bytes_per_second): + """Format throughput in human-readable units.""" + if bytes_per_second >= 1_000_000_000: + return f"{bytes_per_second / 1_000_000_000:.2f} GB/s" + elif bytes_per_second >= 1_000_000: + return f"{bytes_per_second / 1_000_000:.2f} MB/s" + elif bytes_per_second >= 1_000: + return f"{bytes_per_second / 1_000:.2f} KB/s" + else: + return f"{bytes_per_second:.2f} B/s" + + +def format_size(size): + """Format size in human-readable units.""" + if size >= 1_048_576: + return f"{size // 1_048_576}M" + elif size >= 1024: + return f"{size // 1024}K" + else: + return str(size) + + +def print_results(name, size, times_ns, num_ops, data_size): + """Print benchmark results.""" + # Calculate statistics + times_per_op_ns = [t / num_ops for t in times_ns] + mean_ns = statistics.mean(times_per_op_ns) + stdev_ns = statistics.stdev(times_per_op_ns) if len(times_per_op_ns) > 1 else 0 + + # Calculate throughput + bytes_per_ns = data_size / mean_ns + bytes_per_second = bytes_per_ns * 1_000_000_000 + throughput = format_throughput(bytes_per_second) + 
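+    # Editor's note: quick sanity check of the arithmetic above with
+    # round numbers: 1_048_576 bytes at a mean of 1_000_000 ns/op is
+    # ~1.05e9 B/s, which format_throughput() renders as "1.05 GB/s".
+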
+ # Calculate coefficient of variation + cv = (stdev_ns / mean_ns * 100) if mean_ns > 0 else 0 + + size_str = format_size(size) + print(f"{name:<20} {size_str:>8} {mean_ns:>12.1f} ns " + f"(+/- {cv:>5.1f}%) {throughput:>12}") + + +def run_all_benchmarks(sizes): + """Run all benchmark variants for all sizes.""" + print(f"binascii base64 benchmark") + print(f"Python: {sys.version}") + print(f"Target time per benchmark: {TARGET_TOTAL_TIME_S}s") + print() + print(f"{'Benchmark':<20} {'Size':>8} {'Time/op':>15} " + f"{'Variance':>10} {'Throughput':>12}") + print("-" * 75) + + for size in sizes: + # Generate test data + binary_data = generate_test_data(size) + base64_data = generate_base64_data(size) + + # Benchmark encode + times, num_ops = calibrate_and_run(benchmark_encode, binary_data, + TARGET_TOTAL_TIME_S) + print_results("b2a_base64", size, times, num_ops, size) + + # Benchmark decode + times, num_ops = calibrate_and_run(benchmark_decode, base64_data, + TARGET_TOTAL_TIME_S) + print_results("a2b_base64", size, times, num_ops, size) + + print() + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark binascii base64 encoding and decoding", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument( + "-s", "--sizes", + type=str, + default=None, + help="Comma-separated list of sizes to test (e.g., '64,256,1024')" + ) + + args = parser.parse_args() + + if args.sizes: + sizes = [int(s.strip()) for s in args.sizes.split(",")] + else: + sizes = DEFAULT_SIZES + + run_all_benchmarks(sizes) + + +if __name__ == "__main__": + main() From 573eaf3fb7d95b44a95225483eab26c09f757b67 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Dec 2025 00:30:15 +0000 Subject: [PATCH 2/9] Fix MSVC build: move table_b2a_base64 before inline functions MSVC doesn't support forward declarations of arrays without explicit size. Move the table definition before the inline functions that use it, eliminating the need for a forward declaration. Co-authored-by: Claude Opus 4.5 --- Modules/binascii.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Modules/binascii.c b/Modules/binascii.c index aa73528bc6c5d5..e2727b638c0eed 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -109,8 +109,8 @@ static const unsigned char table_a2b_base64[] = { * This allows the compiler to better optimize the hot loops. */ -/* Forward declaration for table defined after the inline functions */ -static const unsigned char table_b2a_base64[]; +static const unsigned char table_b2a_base64[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; /* Encode 3 bytes into 4 base64 characters. */ static inline void @@ -198,9 +198,6 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, return i * 4; } -static const unsigned char table_b2a_base64[] = -"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, From eaac67169aeb0894cdc8686dd9ee50e7b7aae6a3 Mon Sep 17 00:00:00 2001 From: "Gregory P. 
Smith"
Date: Mon, 29 Dec 2025 00:42:35 +0000
Subject: [PATCH 3/9] NEWS entry

---
 .../Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst

diff --git a/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
new file mode 100644
index 00000000000000..10c7f8632d736b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
@@ -0,0 +1,3 @@
+The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
+related codecs has been optimized for modern pipelined CPU architectures and
+now performs roughly 2-3x faster on the platforms measured.

From 060dbf5ee1ef38214b76c4d42807e1b29dd42c94 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 00:46:11 +0000
Subject: [PATCH 4/9] Add a whatsnew entry

---
 Doc/whatsnew/3.15.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 11f08031ec54f2..e8ab7c5b38f227 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -428,6 +428,12 @@ argparse
   inline code when color output is enabled.
   (Contributed by Savannah Ostrowski in :gh:`142390`.)
 
+base64 & binascii
+-----------------
+
+* CPython's underlying base64 implementation now encodes roughly 2x faster and
+  decodes roughly 3x faster thanks to simple CPU pipelining optimizations.
+
 calendar
 --------
 

From 1e12273abe55e73038b283430c38b8832fbcd5c1 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith" <68491+gpshead@users.noreply.github.com>
Date: Sun, 28 Dec 2025 17:07:24 -0800
Subject: [PATCH 5/9] Expose the benchmark defaults in the help text.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
---
 Tools/binasciibench/binasciibench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py
index fc67489fa2b20e..ec98bdee445e2c 100644
--- a/Tools/binasciibench/binasciibench.py
+++ b/Tools/binasciibench/binasciibench.py
@@ -175,7 +175,7 @@ def main():
     parser.add_argument(
         "-s", "--sizes",
         type=str,
-        default=None,
+        default=",".join(map(str, DEFAULT_SIZES)),
         help="Comma-separated list of sizes to test (e.g., '64,256,1024')"
     )

From ef38895ff3208ed439ba1c63e7a26ea1c77a72ca Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 01:37:58 +0000
Subject: [PATCH 6/9] Align base64 tables to 64-byte cache line boundaries

Add Py_ALIGNED(64) to both lookup tables so that each starts on a cache
line boundary and spans the fewest possible L1 cache lines, reducing
potential cache misses during encoding/decoding loops.
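
(Editor's note: Py_ALIGNED(64) is CPython's portable alignment macro from
Include/pyport.h; its expansion is compiler-specific, roughly:

    /* GCC/Clang */
    static const unsigned char table[256] __attribute__((aligned(64)));
    /* MSVC */
    __declspec(align(64)) static const unsigned char table[256];

With 64-byte alignment, the 256-entry decode table occupies exactly four
cache lines instead of potentially five, and the 64-entry encode table
starts on a line boundary.)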
Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index e2727b638c0eed..4a968d7fa8b90c 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
 }
 
 
-static const unsigned char table_a2b_base64[] = {
+/* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
+static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@@ -109,7 +110,8 @@ static const unsigned char table_a2b_base64[] = {
  * This allows the compiler to better optimize the hot loops.
  */
 
-static const unsigned char table_b2a_base64[] =
+/* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
+static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

From 7458c99764b5d590282f126d868c25e11ec0cada Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 01:57:27 +0000
Subject: [PATCH 7/9] Use BASE64_PAD macro instead of literal '=' in fast path

Replace hardcoded '=' characters with the BASE64_PAD macro for
consistency with the rest of the codebase.

Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index 4a968d7fa8b90c..861d6c92f10d6d 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -188,7 +188,8 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
          * approach; on modern pipelined CPUs this is faster than bitmask tricks
          * like XOR+SUB+AND for zero-detection, which have data dependencies.
          */
-        if (inp[0] == '=' || inp[1] == '=' || inp[2] == '=' || inp[3] == '=') {
+        if (inp[0] == BASE64_PAD || inp[1] == BASE64_PAD ||
+            inp[2] == BASE64_PAD || inp[3] == BASE64_PAD) {
             break;
         }

From 1f8ff742672fda7cb6b88689f4be74f985ec6679 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Mon, 29 Dec 2025 02:07:14 +0000
Subject: [PATCH 8/9] Simplify block comment for base64 helpers

Co-authored-by: Claude Opus 4.5
---
 Modules/binascii.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/Modules/binascii.c b/Modules/binascii.c
index 861d6c92f10d6d..c947bf1e6b0203 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -103,11 +103,9 @@ static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
 #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
 
 /*
- * Base64 encoding/decoding helpers optimized for throughput.
+ * Fast base64 encoding/decoding helpers.
  *
- * Key optimization: Process complete groups (3 bytes -> 4 chars for encode,
- * 4 chars -> 3 bytes for decode) without loop-carried dependencies.
- * This allows the compiler to better optimize the hot loops.
+ * Process complete groups without loop-carried dependencies.
  */
 
 /* Align to a 64-byte boundary so the table spans the fewest L1 cache lines */
@@ -138,7 +136,6 @@ base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
     Py_ssize_t n_trios = in_len / 3;
     Py_ssize_t i;
 
-    /* Process complete 3-byte groups. Each iteration is independent.
*/ for (i = 0; i < n_trios; i++) { base64_encode_trio(in + i * 3, out + i * 4, table); } @@ -179,7 +176,6 @@ base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, Py_ssize_t n_quads = in_len / 4; Py_ssize_t i; - /* Process complete 4-character groups. Each iteration is mostly independent. */ for (i = 0; i < n_quads; i++) { const unsigned char *inp = in + i * 4; From 4b1245be20dcbad0158f2a70fbda7d01c6a59701 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Dec 2025 03:20:54 +0000 Subject: [PATCH 9/9] Remove binasciibench in favor of pyperformance bm_base64 addition PR. --- Tools/binasciibench/binasciibench.py | 193 --------------------------- 1 file changed, 193 deletions(-) delete mode 100644 Tools/binasciibench/binasciibench.py diff --git a/Tools/binasciibench/binasciibench.py b/Tools/binasciibench/binasciibench.py deleted file mode 100644 index ec98bdee445e2c..00000000000000 --- a/Tools/binasciibench/binasciibench.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 -"""Benchmark for binascii base64 encoding and decoding performance. - -This benchmark measures the throughput of base64 encoding and decoding -operations using the binascii module's C implementation. - -Usage: - python Tools/binasciibench/binasciibench.py [--sizes S1,S2,...] - -Each benchmark runs for ~1.5 seconds to ensure accurate measurements. -""" - -import argparse -import binascii -import os -import statistics -import sys -import time - -# Default test parameters -DEFAULT_SIZES = [64, 1024, 65536, 1048576] - -# Timing targets -TARGET_TOTAL_TIME_S = 1.5 # Target ~1.5 seconds total per benchmark -MIN_ITERATIONS = 5 # Minimum iterations for statistical significance -MIN_OPS_PER_ITER = 10 # Minimum operations per iteration - - -def generate_test_data(size): - """Generate random binary data of the specified size.""" - return os.urandom(size) - - -def generate_base64_data(size): - """Generate valid base64-encoded data of approximately the specified decoded size.""" - binary = os.urandom(size) - return binascii.b2a_base64(binary, newline=False) - - -def benchmark_encode(data, num_ops): - """Benchmark base64 encoding.""" - b2a = binascii.b2a_base64 - start = time.perf_counter_ns() - for _ in range(num_ops): - b2a(data, newline=False) - end = time.perf_counter_ns() - return end - start - - -def benchmark_decode(data, num_ops): - """Benchmark base64 decoding.""" - a2b = binascii.a2b_base64 - start = time.perf_counter_ns() - for _ in range(num_ops): - a2b(data) - end = time.perf_counter_ns() - return end - start - - -def calibrate_and_run(bench_func, data, target_total_s): - """Calibrate and run benchmark to achieve target total time. - - Returns (times_ns, num_ops) where times_ns is a list of per-iteration - timings and num_ops is the number of operations per iteration. 
- """ - # Quick calibration: measure time for a small batch - num_ops = MIN_OPS_PER_ITER - elapsed_ns = bench_func(data, num_ops) - time_per_op_ns = elapsed_ns / num_ops - - # Calculate ops and iterations to hit target total time - # We want: iterations * num_ops * time_per_op = target_total - # With constraint: iterations >= MIN_ITERATIONS - target_ns = target_total_s * 1_000_000_000 - - # Start with minimum iterations, calculate required ops - iterations = MIN_ITERATIONS - total_ops_needed = int(target_ns / time_per_op_ns) - num_ops = max(MIN_OPS_PER_ITER, total_ops_needed // iterations) - - # If num_ops would be huge, increase iterations instead - max_ops_per_iter = 1_000_000 - if num_ops > max_ops_per_iter: - num_ops = max_ops_per_iter - iterations = max(MIN_ITERATIONS, total_ops_needed // num_ops) - - # Warmup - bench_func(data, num_ops) - - # Timed runs - times_ns = [] - for _ in range(iterations): - elapsed_ns = bench_func(data, num_ops) - times_ns.append(elapsed_ns) - - return times_ns, num_ops - - -def format_throughput(bytes_per_second): - """Format throughput in human-readable units.""" - if bytes_per_second >= 1_000_000_000: - return f"{bytes_per_second / 1_000_000_000:.2f} GB/s" - elif bytes_per_second >= 1_000_000: - return f"{bytes_per_second / 1_000_000:.2f} MB/s" - elif bytes_per_second >= 1_000: - return f"{bytes_per_second / 1_000:.2f} KB/s" - else: - return f"{bytes_per_second:.2f} B/s" - - -def format_size(size): - """Format size in human-readable units.""" - if size >= 1_048_576: - return f"{size // 1_048_576}M" - elif size >= 1024: - return f"{size // 1024}K" - else: - return str(size) - - -def print_results(name, size, times_ns, num_ops, data_size): - """Print benchmark results.""" - # Calculate statistics - times_per_op_ns = [t / num_ops for t in times_ns] - mean_ns = statistics.mean(times_per_op_ns) - stdev_ns = statistics.stdev(times_per_op_ns) if len(times_per_op_ns) > 1 else 0 - - # Calculate throughput - bytes_per_ns = data_size / mean_ns - bytes_per_second = bytes_per_ns * 1_000_000_000 - throughput = format_throughput(bytes_per_second) - - # Calculate coefficient of variation - cv = (stdev_ns / mean_ns * 100) if mean_ns > 0 else 0 - - size_str = format_size(size) - print(f"{name:<20} {size_str:>8} {mean_ns:>12.1f} ns " - f"(+/- {cv:>5.1f}%) {throughput:>12}") - - -def run_all_benchmarks(sizes): - """Run all benchmark variants for all sizes.""" - print(f"binascii base64 benchmark") - print(f"Python: {sys.version}") - print(f"Target time per benchmark: {TARGET_TOTAL_TIME_S}s") - print() - print(f"{'Benchmark':<20} {'Size':>8} {'Time/op':>15} " - f"{'Variance':>10} {'Throughput':>12}") - print("-" * 75) - - for size in sizes: - # Generate test data - binary_data = generate_test_data(size) - base64_data = generate_base64_data(size) - - # Benchmark encode - times, num_ops = calibrate_and_run(benchmark_encode, binary_data, - TARGET_TOTAL_TIME_S) - print_results("b2a_base64", size, times, num_ops, size) - - # Benchmark decode - times, num_ops = calibrate_and_run(benchmark_decode, base64_data, - TARGET_TOTAL_TIME_S) - print_results("a2b_base64", size, times, num_ops, size) - - print() - - -def main(): - parser = argparse.ArgumentParser( - description="Benchmark binascii base64 encoding and decoding", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ - ) - parser.add_argument( - "-s", "--sizes", - type=str, - default=",".join(map(str, DEFAULT_SIZES)), - help="Comma-separated list of sizes to test (e.g., '64,256,1024')" - ) - - args = 
parser.parse_args() - - if args.sizes: - sizes = [int(s.strip()) for s in args.sizes.split(",")] - else: - sizes = DEFAULT_SIZES - - run_all_benchmarks(sizes) - - -if __name__ == "__main__": - main()
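
Editor's appendix: a minimal standalone sketch of the dependency-free
encode loop described in PATCH 1/9, with the same 0-2 byte tail handling.
This is not the CPython source; the names (encode_trio, b64_encode) and
the demo in main() are illustrative only, and error handling is omitted.

#include <stdio.h>

static const char TABLE[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Encode one complete 3-byte group. No state is carried between calls,
 * so consecutive iterations of the caller's loop are independent. */
static void
encode_trio(const unsigned char *in, char *out)
{
    unsigned int combined = ((unsigned int)in[0] << 16) |
                            ((unsigned int)in[1] << 8) |
                            (unsigned int)in[2];
    out[0] = TABLE[(combined >> 18) & 0x3f];
    out[1] = TABLE[(combined >> 12) & 0x3f];
    out[2] = TABLE[(combined >> 6) & 0x3f];
    out[3] = TABLE[combined & 0x3f];
}

/* Encode len bytes; returns the number of output characters written.
 * The hot loop has no loop-carried dependency; leftover bytes are
 * padded afterwards, mirroring the tail logic in b2a_base64. */
static size_t
b64_encode(const unsigned char *in, size_t len, char *out)
{
    size_t trios = len / 3;
    size_t o = trios * 4;
    for (size_t i = 0; i < trios; i++) {
        encode_trio(in + i * 3, out + i * 4);
    }
    in += trios * 3;
    switch (len % 3) {
    case 1:                       /* 2 chars + 2 pads */
        out[o++] = TABLE[(in[0] >> 2) & 0x3f];
        out[o++] = TABLE[(in[0] << 4) & 0x3f];
        out[o++] = '=';
        out[o++] = '=';
        break;
    case 2: {                     /* 3 chars + 1 pad */
        unsigned int val = ((unsigned int)in[0] << 8) | in[1];
        out[o++] = TABLE[(val >> 10) & 0x3f];
        out[o++] = TABLE[(val >> 4) & 0x3f];
        out[o++] = TABLE[(val << 2) & 0x3f];
        out[o++] = '=';
        break;
    }
    }
    return o;
}

int
main(void)
{
    char out[16];
    size_t n = b64_encode((const unsigned char *)"Many", 4, out);
    printf("%.*s\n", (int)n, out);  /* prints TWFueQ== */
    return 0;
}

Compiled as plain C99 this prints TWFueQ==, which matches
base64.b64encode(b"Many") in Python.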