Skip to content

Commit ad50831

Browse files
Rename Grapheme to Segment. Add __repr__(), remove __iter__().
1 parent a95c3cb commit ad50831

File tree

2 files changed

+58
-41
lines changed

2 files changed

+58
-41
lines changed

Lib/test/test_unicodedata.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -946,6 +946,17 @@ def test_linebreak_7643(self):
946946
self.assertEqual(len(lines), 1,
947947
r"%a should not be a linebreak" % c)
948948

949+
def test_segment_object(self):
950+
segments = list(unicodedata.iter_graphemes('spa\u0300m'))
951+
self.assertEqual(len(segments), 4, segments)
952+
segment = segments[2]
953+
self.assertEqual(segment.start, 2)
954+
self.assertEqual(segment.end, 4)
955+
self.assertEqual(str(segment), 'a\u0300')
956+
self.assertEqual(repr(segment), '<Segment 2:4>')
957+
self.assertRaises(TypeError, iter, segment)
958+
self.assertRaises(TypeError, len, segment)
959+
949960

950961
class NormalizationTest(unittest.TestCase):
951962
@staticmethod

Modules/unicodedata.c

Lines changed: 47 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ _getrecord_ex(Py_UCS4 code)
9797
}
9898

9999
typedef struct {
100-
PyObject *GraphemeType;
100+
PyObject *SegmentType;
101101
PyObject *GraphemeBreakIteratorType;
102102
} unicodedatastate;
103103

@@ -1873,72 +1873,79 @@ _Py_NextGraphemeBreak(_PyGraphemeBreak *iter)
18731873
}
18741874

18751875

1876-
/* Grapheme Cluster object */
1876+
/* Text Segment object */
18771877

18781878
typedef struct {
18791879
PyObject_HEAD
18801880
PyObject *string;
18811881
Py_ssize_t start;
18821882
Py_ssize_t end;
1883-
} GraphemeObject;
1883+
} SegmentObject;
18841884

18851885
static void
1886-
Grapheme_dealloc(PyObject *self)
1886+
Segment_dealloc(PyObject *self)
18871887
{
18881888
PyObject_GC_UnTrack(self);
1889-
Py_DECREF(((GraphemeObject *)self)->string);
1889+
Py_DECREF(((SegmentObject *)self)->string);
18901890
PyObject_GC_Del(self);
18911891
}
18921892

18931893
static int
1894-
Grapheme_traverse(PyObject *self, visitproc visit, void *arg)
1894+
Segment_traverse(PyObject *self, visitproc visit, void *arg)
18951895
{
1896-
Py_VISIT(((GraphemeObject *)self)->string);
1896+
Py_VISIT(((SegmentObject *)self)->string);
18971897
return 0;
18981898
}
18991899

19001900
static int
1901-
Grapheme_clear(PyObject *self)
1901+
Segment_clear(PyObject *self)
19021902
{
1903-
Py_CLEAR(((GraphemeObject *)self)->string);
1903+
Py_CLEAR(((SegmentObject *)self)->string);
19041904
return 0;
19051905
}
19061906

19071907
static PyObject *
1908-
Grapheme_str(PyObject *self)
1908+
Segment_str(PyObject *self)
19091909
{
1910-
GraphemeObject *g = (GraphemeObject *)self;
1911-
return PyUnicode_Substring(g->string, g->start, g->end);
1910+
SegmentObject *s = (SegmentObject *)self;
1911+
return PyUnicode_Substring(s->string, s->start, s->end);
19121912
}
19131913

1914-
static PyMemberDef Grapheme_members[] = {
1915-
{"start", Py_T_PYSSIZET, offsetof(GraphemeObject, start), 0,
1914+
static PyObject *
1915+
Segment_repr(PyObject *self)
1916+
{
1917+
SegmentObject *s = (SegmentObject *)self;
1918+
return PyUnicode_FromFormat("<Segment %zd:%zd>", s->start, s->end);
1919+
}
1920+
1921+
static PyMemberDef Segment_members[] = {
1922+
{"start", Py_T_PYSSIZET, offsetof(SegmentObject, start), 0,
19161923
PyDoc_STR("grapheme start")},
1917-
{"end", Py_T_PYSSIZET, offsetof(GraphemeObject, end), 0,
1924+
{"end", Py_T_PYSSIZET, offsetof(SegmentObject, end), 0,
19181925
PyDoc_STR("grapheme end")},
19191926
{NULL} /* Sentinel */
19201927
};
19211928

1922-
static PyType_Slot Grapheme_slots[] = {
1923-
{Py_tp_dealloc, Grapheme_dealloc},
1924-
{Py_tp_iter, PyObject_SelfIter},
1925-
{Py_tp_traverse, Grapheme_traverse},
1926-
{Py_tp_clear, Grapheme_clear},
1927-
{Py_tp_str, Grapheme_str},
1928-
{Py_tp_members, Grapheme_members},
1929+
static PyType_Slot Segment_slots[] = {
1930+
{Py_tp_dealloc, Segment_dealloc},
1931+
{Py_tp_traverse, Segment_traverse},
1932+
{Py_tp_clear, Segment_clear},
1933+
{Py_tp_str, Segment_str},
1934+
{Py_tp_repr, Segment_repr},
1935+
{Py_tp_members, Segment_members},
19291936
{0, 0},
19301937
};
19311938

1932-
static PyType_Spec Grapheme_spec = {
1933-
.name = "unicodedata.Grapheme",
1934-
.basicsize = sizeof(GraphemeObject),
1939+
static PyType_Spec Segment_spec = {
1940+
.name = "unicodedata.Segment",
1941+
.basicsize = sizeof(SegmentObject),
19351942
.flags = (
19361943
Py_TPFLAGS_DEFAULT
19371944
| Py_TPFLAGS_HAVE_GC
19381945
| Py_TPFLAGS_DISALLOW_INSTANTIATION
19391946
| Py_TPFLAGS_IMMUTABLETYPE
19401947
),
1941-
.slots = Grapheme_slots
1948+
.slots = Segment_slots
19421949
};
19431950

19441951

@@ -1982,18 +1989,17 @@ GBI_iternext(PyObject *self)
19821989
return NULL;
19831990
}
19841991
PyObject *module = PyType_GetModule(Py_TYPE(it));
1985-
PyObject *GraphemeType = get_unicodedata_state(module)->GraphemeType;
1986-
GraphemeObject *g = PyObject_GC_New(GraphemeObject,
1987-
(PyTypeObject *)GraphemeType);
1988-
if (!g) {
1992+
PyObject *SegmentType = get_unicodedata_state(module)->SegmentType;
1993+
SegmentObject *s = PyObject_GC_New(SegmentObject,
1994+
(PyTypeObject *)SegmentType);
1995+
if (!s) {
19891996
return NULL;
19901997
}
1991-
g->string = Py_NewRef(it->iter.str);
1992-
g->start = start;
1993-
g->end = pos;
1994-
PyObject_GC_Track(g);
1995-
return (PyObject *)g;
1996-
// return PyUnicode_Substring(it->iter.str, start, pos);
1998+
s->string = Py_NewRef(it->iter.str);
1999+
s->start = start;
2000+
s->end = pos;
2001+
PyObject_GC_Track(s);
2002+
return (PyObject *)s;
19972003
}
19982004

19992005

@@ -2180,7 +2186,7 @@ static int
21802186
unicodedata_traverse(PyObject *module, visitproc visit, void *arg)
21812187
{
21822188
unicodedatastate *state = get_unicodedata_state(module);
2183-
Py_VISIT(state->GraphemeType);
2189+
Py_VISIT(state->SegmentType);
21842190
Py_VISIT(state->GraphemeBreakIteratorType);
21852191
return 0;
21862192
}
@@ -2189,7 +2195,7 @@ static int
21892195
unicodedata_clear(PyObject *module)
21902196
{
21912197
unicodedatastate *state = get_unicodedata_state(module);
2192-
Py_CLEAR(state->GraphemeType);
2198+
Py_CLEAR(state->SegmentType);
21932199
Py_CLEAR(state->GraphemeBreakIteratorType);
21942200
return 0;
21952201
}
@@ -2205,11 +2211,11 @@ unicodedata_exec(PyObject *module)
22052211
{
22062212
unicodedatastate *state = get_unicodedata_state(module);
22072213

2208-
PyObject *GraphemeType = PyType_FromModuleAndSpec(module, &Grapheme_spec, NULL);
2209-
if (GraphemeType == NULL) {
2214+
PyObject *SegmentType = PyType_FromModuleAndSpec(module, &Segment_spec, NULL);
2215+
if (SegmentType == NULL) {
22102216
return -1;
22112217
}
2212-
state->GraphemeType = GraphemeType;
2218+
state->SegmentType = SegmentType;
22132219

22142220
PyObject *GraphemeBreakIteratorType = PyType_FromModuleAndSpec(module, &GraphemeBreakIterator_spec, NULL);
22152221
if (GraphemeBreakIteratorType == NULL) {

0 commit comments

Comments
 (0)