Skip to content

Commit 92c0ec2

Browse files
gh-144264: Speed up Base64 decoding of data containing ignored characters (GH-144265)
Try the fast path again after decoding a quad in the slow path. Use a bitmap cache for the ignorechars argument.
1 parent 0fa1fc6 commit 92c0ec2

File tree

3 files changed

+45
-11
lines changed

3 files changed

+45
-11
lines changed

Lib/test/test_binascii.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,17 @@ def assertNonBase64Data(data, expected, ignorechars):
202202
assertNonBase64Data(b'a\nb==', b'i', ignorechars=bytearray(b'\n'))
203203
assertNonBase64Data(b'a\nb==', b'i', ignorechars=memoryview(b'\n'))
204204

205+
# Same cell in the cache: '\r' >> 3 == '\n' >> 3.
206+
data = self.type2test(b'\r\n')
207+
with self.assertRaises(binascii.Error):
208+
binascii.a2b_base64(data, ignorechars=b'\r')
209+
self.assertEqual(binascii.a2b_base64(data, ignorechars=b'\r\n'), b'')
210+
# Same bit mask in the cache: '*' & 7 == '\n' & 7.
211+
data = self.type2test(b'*\n')
212+
with self.assertRaises(binascii.Error):
213+
binascii.a2b_base64(data, ignorechars=b'*')
214+
self.assertEqual(binascii.a2b_base64(data, ignorechars=b'*\n'), b'')
215+
205216
data = self.type2test(b'a\nb==')
206217
with self.assertRaises(TypeError):
207218
binascii.a2b_base64(data, ignorechars='')
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Speed up Base64 decoding of data containing ignored characters (both in
2+
non-strict mode and with an explicit *ignorechars* argument).
3+
It is now up to 2 times faster for multiline Base64 data.

Modules/binascii.c

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -469,12 +469,23 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
469469
return PyBytesWriter_FinishWithPointer(writer, ascii_data);
470470
}
471471

472+
typedef unsigned char ignorecache_t[32];
472473

473474
static int
474-
ignorechar(unsigned char c, Py_buffer *ignorechars)
475+
ignorechar(unsigned char c, const Py_buffer *ignorechars,
476+
ignorecache_t ignorecache)
475477
{
476-
return (ignorechars->buf != NULL &&
477-
memchr(ignorechars->buf, c, ignorechars->len));
478+
if (ignorechars == NULL) {
479+
return 0;
480+
}
481+
if (ignorecache[c >> 3] & (1 << (c & 7))) {
482+
return 1;
483+
}
484+
if (memchr(ignorechars->buf, c, ignorechars->len)) {
485+
ignorecache[c >> 3] |= 1 << (c & 7);
486+
return 1;
487+
}
488+
return 0;
478489
}
479490

480491
/*[clinic input]
@@ -508,6 +519,13 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
508519
if (strict_mode == -1) {
509520
strict_mode = (ignorechars->buf != NULL);
510521
}
522+
if (!strict_mode || ignorechars->buf == NULL || ignorechars->len == 0) {
523+
ignorechars = NULL;
524+
}
525+
ignorecache_t ignorecache;
526+
if (ignorechars != NULL) {
527+
memset(ignorecache, 0, sizeof(ignorecache));
528+
}
511529

512530
/* Allocate the buffer */
513531
Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
@@ -517,8 +535,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
517535
}
518536
unsigned char *bin_data = PyBytesWriter_GetData(writer);
519537

520-
size_t i = 0; /* Current position in input */
521-
538+
fastpath:
522539
/* Fast path: use optimized decoder for complete quads.
523540
* This works for both strict and non-strict mode for valid input.
524541
* The fast path stops at padding, invalid chars, or incomplete groups.
@@ -527,7 +544,8 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
527544
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
528545
bin_data, table_a2b_base64);
529546
if (fast_chars > 0) {
530-
i = (size_t)fast_chars;
547+
ascii_data += fast_chars;
548+
ascii_len -= fast_chars;
531549
bin_data += (fast_chars / 4) * 3;
532550
}
533551
}
@@ -536,8 +554,8 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
536554
int quad_pos = 0;
537555
unsigned char leftchar = 0;
538556
int pads = 0;
539-
for (; i < ascii_len; i++) {
540-
unsigned char this_ch = ascii_data[i];
557+
for (; ascii_len; ascii_data++, ascii_len--) {
558+
unsigned char this_ch = *ascii_data;
541559

542560
/* Check for pad sequences and ignore
543561
** the invalid ones.
@@ -549,7 +567,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
549567
if (quad_pos == 0) {
550568
state = get_binascii_state(module);
551569
if (state) {
552-
PyErr_SetString(state->Error, (i == 0)
570+
PyErr_SetString(state->Error, (ascii_data == data->buf)
553571
? "Leading padding not allowed"
554572
: "Excess padding not allowed");
555573
}
@@ -580,7 +598,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
580598

581599
unsigned char v = table_a2b_base64[this_ch];
582600
if (v >= 64) {
583-
if (strict_mode && !ignorechar(this_ch, ignorechars)) {
601+
if (strict_mode && !ignorechar(this_ch, ignorechars, ignorecache)) {
584602
state = get_binascii_state(module);
585603
if (state) {
586604
PyErr_SetString(state->Error, "Only base64 data is allowed");
@@ -621,7 +639,9 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
621639
quad_pos = 0;
622640
*bin_data++ = (leftchar << 6) | (v);
623641
leftchar = 0;
624-
break;
642+
ascii_data++;
643+
ascii_len--;
644+
goto fastpath;
625645
}
626646
}
627647

0 commit comments

Comments
 (0)