Skip to content

Commit 8c54c3d

Browse files
gh-91576: Speed up iteration of strings (#91574)
1 parent a29f858 commit 8c54c3d

File tree

5 files changed

+79
-6
lines changed

5 files changed

+79
-6
lines changed

Include/internal/pycore_unicodeobject.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ extern void _PyUnicode_Fini(PyInterpreterState *);
2020
extern void _PyUnicode_FiniTypes(PyInterpreterState *);
2121
extern void _PyStaticUnicode_Dealloc(PyObject *);
2222

23+
/* Iterator type used for compact ASCII strings; defined in Objects/unicodeobject.c. */
extern PyTypeObject _PyUnicodeASCIIIter_Type;
2324

2425
/* other API */
2526

Lib/test/test_unicode.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import codecs
1010
import itertools
1111
import operator
12+
import pickle
1213
import struct
1314
import sys
1415
import textwrap
@@ -185,6 +186,36 @@ def test_iterators(self):
185186
self.assertEqual(next(it), "\u3333")
186187
self.assertRaises(StopIteration, next, it)
187188

189+
def test_iterators_invocation(self):
190+
cases = [type(iter('abc')), type(iter('🚀'))]
191+
for cls in cases:
192+
with self.subTest(cls=cls):
193+
self.assertRaises(TypeError, cls)
194+
195+
def test_iteration(self):
196+
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
197+
for case in cases:
198+
with self.subTest(string=case):
199+
self.assertEqual(case, "".join(iter(case)))
200+
201+
def test_exhausted_iterator(self):
202+
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
203+
for case in cases:
204+
with self.subTest(case=case):
205+
iterator = iter(case)
206+
tuple(iterator)
207+
self.assertRaises(StopIteration, next, iterator)
208+
209+
def test_pickle_iterator(self):
210+
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
211+
for case in cases:
212+
with self.subTest(case=case):
213+
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
214+
it = iter(case)
215+
with self.subTest(proto=proto):
216+
pickled = "".join(pickle.loads(pickle.dumps(it, proto)))
217+
self.assertEqual(case, pickled)
218+
188219
def test_count(self):
189220
string_tests.CommonTest.test_count(self)
190221
# check mixed argument types
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Speed up iteration of ASCII strings by 50%. Patch by Kumar Aditya.

Objects/object.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1936,6 +1936,7 @@ static PyTypeObject* static_types[] = {
19361936
&_PyNamespace_Type,
19371937
&_PyNone_Type,
19381938
&_PyNotImplemented_Type,
1939+
&_PyUnicodeASCIIIter_Type,
19391940
&_PyUnion_Type,
19401941
&_PyWeakref_CallableProxyType,
19411942
&_PyWeakref_ProxyType,

Objects/unicodeobject.c

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15697,7 +15697,7 @@ unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
1569715697
static PyObject *
1569815698
unicodeiter_next(unicodeiterobject *it)
1569915699
{
15700-
PyObject *seq, *item;
15700+
PyObject *seq;
1570115701

1570215702
assert(it != NULL);
1570315703
seq = it->it_seq;
@@ -15709,17 +15709,38 @@ unicodeiter_next(unicodeiterobject *it)
1570915709
int kind = PyUnicode_KIND(seq);
1571015710
const void *data = PyUnicode_DATA(seq);
1571115711
Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15712-
item = PyUnicode_FromOrdinal(chr);
15713-
if (item != NULL)
15714-
++it->it_index;
15715-
return item;
15712+
it->it_index++;
15713+
return unicode_char(chr);
1571615714
}
1571715715

1571815716
it->it_seq = NULL;
1571915717
Py_DECREF(seq);
1572015718
return NULL;
1572115719
}
1572215720

15721+
static PyObject *
15722+
unicode_ascii_iter_next(unicodeiterobject *it)
15723+
{
15724+
assert(it != NULL);
15725+
PyObject *seq = it->it_seq;
15726+
if (seq == NULL) {
15727+
return NULL;
15728+
}
15729+
assert(_PyUnicode_CHECK(seq));
15730+
assert(PyUnicode_IS_COMPACT_ASCII(seq));
15731+
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15732+
const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15733+
Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15734+
data, it->it_index);
15735+
it->it_index++;
15736+
PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15737+
return Py_NewRef(item);
15738+
}
15739+
it->it_seq = NULL;
15740+
Py_DECREF(seq);
15741+
return NULL;
15742+
}
15743+
1572315744
static PyObject *
1572415745
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
1572515746
{
@@ -15808,6 +15829,19 @@ PyTypeObject PyUnicodeIter_Type = {
1580815829
0,
1580915830
};
1581015831

15832+
PyTypeObject _PyUnicodeASCIIIter_Type = {
15833+
PyVarObject_HEAD_INIT(&PyType_Type, 0)
15834+
.tp_name = "str_ascii_iterator",
15835+
.tp_basicsize = sizeof(unicodeiterobject),
15836+
.tp_dealloc = (destructor)unicodeiter_dealloc,
15837+
.tp_getattro = PyObject_GenericGetAttr,
15838+
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
15839+
.tp_traverse = (traverseproc)unicodeiter_traverse,
15840+
.tp_iter = PyObject_SelfIter,
15841+
.tp_iternext = (iternextfunc)unicode_ascii_iter_next,
15842+
.tp_methods = unicodeiter_methods,
15843+
};
15844+
1581115845
static PyObject *
1581215846
unicode_iter(PyObject *seq)
1581315847
{
@@ -15819,7 +15853,12 @@ unicode_iter(PyObject *seq)
1581915853
}
1582015854
if (PyUnicode_READY(seq) == -1)
1582115855
return NULL;
15822-
it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15856+
if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15857+
it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15858+
}
15859+
else {
15860+
it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15861+
}
1582315862
if (it == NULL)
1582415863
return NULL;
1582515864
it->it_index = 0;

0 commit comments

Comments
 (0)