Skip to content

Commit 9380ab9

Browse files
committed
Implement implicit concatenation with TemplateStr
1 parent 3e84771 commit 9380ab9

File tree

3 files changed

+268
-110
lines changed

3 files changed

+268
-110
lines changed

Grammar/python.gram

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ literal_pattern[pattern_ty]:
500500
literal_expr[expr_ty]:
501501
| signed_number !('+' | '-')
502502
| complex_number
503-
| &(STRING|FSTRING_START) strings
503+
| &(STRING|FSTRING_START|TSTRING_START) strings
504504
| 'None' { _PyAST_Constant(Py_None, NULL, EXTRA) }
505505
| 'True' { _PyAST_Constant(Py_True, NULL, EXTRA) }
506506
| 'False' { _PyAST_Constant(Py_False, NULL, EXTRA) }
@@ -840,8 +840,7 @@ atom[expr_ty]:
840840
| 'True' { _PyAST_Constant(Py_True, NULL, EXTRA) }
841841
| 'False' { _PyAST_Constant(Py_False, NULL, EXTRA) }
842842
| 'None' { _PyAST_Constant(Py_None, NULL, EXTRA) }
843-
| &(STRING|FSTRING_START) strings
844-
| &TSTRING_START tstring
843+
| &(STRING|FSTRING_START|TSTRING_START) strings
845844
| NUMBER
846845
| &'(' (tuple | group | genexp)
847846
| &'[' (list | listcomp)
@@ -939,7 +938,7 @@ tstring[expr_ty] (memo):
939938
| a=TSTRING_START b=tstring_middle* c=FSTRING_END { _PyPegen_template_str(p, a, (asdl_expr_seq*)b, c) }
940939

941940
string[expr_ty]: s[Token*]=STRING { _PyPegen_constant_from_string(p, s) }
942-
strings[expr_ty] (memo): a[asdl_expr_seq*]=(fstring|string)+ { _PyPegen_concatenate_strings(p, a, EXTRA) }
941+
strings[expr_ty] (memo): a[asdl_expr_seq*]=(fstring|string|tstring)+ { _PyPegen_concatenate_strings(p, a, EXTRA) }
943942

944943
list[expr_ty]:
945944
| '[' a=[star_named_expressions] ']' { _PyAST_List(a, Load, EXTRA) }

Parser/action_helpers.c

Lines changed: 204 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,83 +1587,112 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, Re
15871587
return _PyAST_JoinedStr(values, lineno, col_offset, debug_end_line, debug_end_offset, p->arena);
15881588
}
15891589

1590-
expr_ty
1591-
_PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
1592-
int lineno, int col_offset, int end_lineno,
1593-
int end_col_offset, PyArena *arena)
1590+
static expr_ty
1591+
_build_concatenated_bytes(Parser *p, asdl_expr_seq *strings, int lineno,
1592+
int col_offset, int end_lineno, int end_col_offset,
1593+
PyArena *arena)
15941594
{
15951595
Py_ssize_t len = asdl_seq_LEN(strings);
15961596
assert(len > 0);
15971597

1598-
int f_string_found = 0;
1599-
int unicode_string_found = 0;
1600-
int bytes_found = 0;
1598+
PyObject* res = Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
16011599

1602-
Py_ssize_t i = 0;
1603-
Py_ssize_t n_flattened_elements = 0;
1604-
for (i = 0; i < len; i++) {
1600+
/* Bytes literals never get a kind, but just for consistency
1601+
since they are represented as Constant nodes, we'll mirror
1602+
the same behavior as unicode strings for determining the
1603+
kind. */
1604+
PyObject* kind = asdl_seq_GET(strings, 0)->v.Constant.kind;
1605+
for (Py_ssize_t i = 0; i < len; i++) {
16051606
expr_ty elem = asdl_seq_GET(strings, i);
1606-
switch(elem->kind) {
1607-
case Constant_kind:
1608-
if (PyBytes_CheckExact(elem->v.Constant.value)) {
1609-
bytes_found = 1;
1610-
} else {
1611-
unicode_string_found = 1;
1612-
}
1613-
n_flattened_elements++;
1614-
break;
1615-
case JoinedStr_kind:
1616-
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
1617-
f_string_found = 1;
1618-
break;
1619-
default:
1620-
n_flattened_elements++;
1621-
f_string_found = 1;
1622-
break;
1623-
}
1607+
PyBytes_Concat(&res, elem->v.Constant.value);
16241608
}
1609+
if (!res || _PyArena_AddPyObject(arena, res) < 0) {
1610+
Py_XDECREF(res);
1611+
return NULL;
1612+
}
1613+
return _PyAST_Constant(res, kind, lineno, col_offset, end_lineno, end_col_offset, p->arena);
1614+
}
16251615

1626-
if ((unicode_string_found || f_string_found) && bytes_found) {
1627-
RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
1616+
static expr_ty
1617+
_build_concatenated_unicode(Parser *p, asdl_expr_seq *strings, int lineno,
1618+
int col_offset, int end_lineno, int end_col_offset,
1619+
PyArena *arena)
1620+
{
1621+
Py_ssize_t len = asdl_seq_LEN(strings);
1622+
assert(len > 1);
1623+
1624+
expr_ty first = asdl_seq_GET(strings, 0);
1625+
1626+
/* When a string is getting concatenated, the kind of the string
1627+
is determined by the first string in the concatenation
1628+
sequence.
1629+
1630+
u"abc" "def" -> u"abcdef"
1631+
"abc" u"abc" -> "abcabc" */
1632+
PyObject *kind = first->v.Constant.kind;
1633+
1634+
PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
1635+
if (writer == NULL) {
16281636
return NULL;
16291637
}
16301638

1631-
if (bytes_found) {
1632-
PyObject* res = Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
1639+
for (Py_ssize_t i = 0; i < len; i++) {
1640+
expr_ty current_elem = asdl_seq_GET(strings, i);
1641+
assert(current_elem->kind == Constant_kind);
16331642

1634-
/* Bytes literals never get a kind, but just for consistency
1635-
since they are represented as Constant nodes, we'll mirror
1636-
the same behavior as unicode strings for determining the
1637-
kind. */
1638-
PyObject* kind = asdl_seq_GET(strings, 0)->v.Constant.kind;
1639-
for (i = 0; i < len; i++) {
1640-
expr_ty elem = asdl_seq_GET(strings, i);
1641-
PyBytes_Concat(&res, elem->v.Constant.value);
1642-
}
1643-
if (!res || _PyArena_AddPyObject(arena, res) < 0) {
1644-
Py_XDECREF(res);
1643+
if (PyUnicodeWriter_WriteStr(writer,
1644+
current_elem->v.Constant.value)) {
1645+
PyUnicodeWriter_Discard(writer);
16451646
return NULL;
16461647
}
1647-
return _PyAST_Constant(res, kind, lineno, col_offset, end_lineno, end_col_offset, p->arena);
16481648
}
16491649

1650-
if (!f_string_found && len == 1) {
1651-
return asdl_seq_GET(strings, 0);
1650+
PyObject *final = PyUnicodeWriter_Finish(writer);
1651+
if (final == NULL) {
1652+
return NULL;
1653+
}
1654+
if (_PyArena_AddPyObject(p->arena, final) < 0) {
1655+
Py_DECREF(final);
1656+
return NULL;
1657+
}
1658+
return _PyAST_Constant(final, kind, lineno, col_offset,
1659+
end_lineno, end_col_offset, arena);
1660+
}
1661+
1662+
static expr_ty
1663+
_build_concatenated_joined_str(Parser *p, asdl_expr_seq *strings,
1664+
int lineno, int col_offset, int end_lineno,
1665+
int end_col_offset, PyArena *arena)
1666+
{
1667+
Py_ssize_t len = asdl_seq_LEN(strings);
1668+
assert(len > 0);
1669+
1670+
Py_ssize_t n_flattened_elements = 0;
1671+
for (Py_ssize_t i = 0; i < len; i++) {
1672+
expr_ty elem = asdl_seq_GET(strings, i);
1673+
switch(elem->kind) {
1674+
case JoinedStr_kind:
1675+
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
1676+
break;
1677+
default:
1678+
n_flattened_elements++;
1679+
break;
1680+
}
16521681
}
16531682

1683+
16541684
asdl_expr_seq* flattened = _Py_asdl_expr_seq_new(n_flattened_elements, p->arena);
16551685
if (flattened == NULL) {
16561686
return NULL;
16571687
}
16581688

16591689
/* build flattened list */
16601690
Py_ssize_t current_pos = 0;
1661-
Py_ssize_t j = 0;
1662-
for (i = 0; i < len; i++) {
1691+
for (Py_ssize_t i = 0; i < len; i++) {
16631692
expr_ty elem = asdl_seq_GET(strings, i);
16641693
switch(elem->kind) {
16651694
case JoinedStr_kind:
1666-
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
1695+
for (Py_ssize_t j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
16671696
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
16681697
if (subvalue == NULL) {
16691698
return NULL;
@@ -1680,13 +1709,13 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
16801709
/* calculate folded element count */
16811710
Py_ssize_t n_elements = 0;
16821711
int prev_is_constant = 0;
1683-
for (i = 0; i < n_flattened_elements; i++) {
1712+
for (Py_ssize_t i = 0; i < n_flattened_elements; i++) {
16841713
expr_ty elem = asdl_seq_GET(flattened, i);
16851714

16861715
/* The concatenation of a FormattedValue and an empty Constant should
16871716
lead to the FormattedValue itself. Thus, we will not take any empty
16881717
constants into account, just as in `_PyPegen_joined_str` */
1689-
if (f_string_found && elem->kind == Constant_kind &&
1718+
if (elem->kind == Constant_kind &&
16901719
PyUnicode_CheckExact(elem->v.Constant.value) &&
16911720
PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0)
16921721
continue;
@@ -1704,7 +1733,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
17041733

17051734
/* build folded list */
17061735
current_pos = 0;
1707-
for (i = 0; i < n_flattened_elements; i++) {
1736+
for (Py_ssize_t i = 0; i < n_flattened_elements; i++) {
17081737
expr_ty elem = asdl_seq_GET(flattened, i);
17091738

17101739
/* if the current elem and the following are constants,
@@ -1727,6 +1756,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
17271756
return NULL;
17281757
}
17291758
expr_ty last_elem = elem;
1759+
Py_ssize_t j;
17301760
for (j = i; j < n_flattened_elements; j++) {
17311761
expr_ty current_elem = asdl_seq_GET(flattened, j);
17321762
if (current_elem->kind == Constant_kind) {
@@ -1760,8 +1790,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
17601790
}
17611791

17621792
/* Drop all empty constant strings */
1763-
if (f_string_found &&
1764-
PyUnicode_CheckExact(elem->v.Constant.value) &&
1793+
if (PyUnicode_CheckExact(elem->v.Constant.value) &&
17651794
PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0) {
17661795
continue;
17671796
}
@@ -1770,13 +1799,127 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
17701799
asdl_seq_SET(values, current_pos++, elem);
17711800
}
17721801

1773-
if (!f_string_found) {
1774-
assert(n_elements == 1);
1775-
expr_ty elem = asdl_seq_GET(values, 0);
1776-
assert(elem->kind == Constant_kind);
1777-
return elem;
1778-
}
1779-
17801802
assert(current_pos == n_elements);
17811803
return _PyAST_JoinedStr(values, lineno, col_offset, end_lineno, end_col_offset, p->arena);
17821804
}
1805+
1806+
static expr_ty
1807+
_build_concatenated_template_str(Parser *p, asdl_expr_seq *strings,
1808+
int lineno, int col_offset, int end_lineno,
1809+
int end_col_offset, PyArena *arena)
1810+
{
1811+
Py_ssize_t len = asdl_seq_LEN(strings);
1812+
assert(len > 0);
1813+
1814+
Py_ssize_t n_flattened_elements = 0;
1815+
for (Py_ssize_t i = 0; i < len; i++) {
1816+
expr_ty elem = asdl_seq_GET(strings, i);
1817+
switch(elem->kind) {
1818+
case TemplateStr_kind:
1819+
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
1820+
break;
1821+
default:
1822+
n_flattened_elements++;
1823+
break;
1824+
}
1825+
}
1826+
1827+
1828+
asdl_expr_seq* flattened = _Py_asdl_expr_seq_new(n_flattened_elements, p->arena);
1829+
if (flattened == NULL) {
1830+
return NULL;
1831+
}
1832+
1833+
Py_ssize_t pos = 0;
1834+
for (Py_ssize_t i = 0; i < len; i++) {
1835+
expr_ty elem = asdl_seq_GET(strings, i);
1836+
1837+
switch (elem->kind) {
1838+
case TemplateStr_kind:
1839+
for (Py_ssize_t j = 0; j < asdl_seq_LEN(elem->v.TemplateStr.values); j++) {
1840+
expr_ty subitem = asdl_seq_GET(elem->v.TemplateStr.values, j);
1841+
asdl_seq_SET(flattened, pos++, subitem);
1842+
}
1843+
break;
1844+
case JoinedStr_kind: {
1845+
expr_ty joined_str = _build_concatenated_joined_str(p,
1846+
elem->v.JoinedStr.values, lineno, col_offset,
1847+
end_lineno, end_col_offset, arena);
1848+
asdl_seq_SET(flattened, pos++, joined_str);
1849+
break;
1850+
}
1851+
default:
1852+
asdl_seq_SET(flattened, pos++, elem);
1853+
break;
1854+
}
1855+
}
1856+
1857+
return _PyAST_TemplateStr(flattened, lineno, col_offset, end_lineno,
1858+
end_col_offset, arena);
1859+
}
1860+
1861+
expr_ty
1862+
_PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
1863+
int lineno, int col_offset, int end_lineno,
1864+
int end_col_offset, PyArena *arena)
1865+
{
1866+
Py_ssize_t len = asdl_seq_LEN(strings);
1867+
assert(len > 0);
1868+
1869+
int t_string_found = 0;
1870+
int f_string_found = 0;
1871+
int unicode_string_found = 0;
1872+
int bytes_found = 0;
1873+
1874+
Py_ssize_t i = 0;
1875+
for (i = 0; i < len; i++) {
1876+
expr_ty elem = asdl_seq_GET(strings, i);
1877+
switch(elem->kind) {
1878+
case Constant_kind:
1879+
if (PyBytes_CheckExact(elem->v.Constant.value)) {
1880+
bytes_found = 1;
1881+
} else {
1882+
unicode_string_found = 1;
1883+
}
1884+
break;
1885+
case JoinedStr_kind:
1886+
f_string_found = 1;
1887+
break;
1888+
case TemplateStr_kind:
1889+
t_string_found = 1;
1890+
break;
1891+
default:
1892+
f_string_found = 1;
1893+
break;
1894+
}
1895+
}
1896+
1897+
// Cannot mix unicode and bytes
1898+
if ((unicode_string_found || f_string_found || t_string_found) && bytes_found) {
1899+
RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
1900+
return NULL;
1901+
}
1902+
1903+
// If it's only bytes or only unicode string, do a simple concat
1904+
if (!f_string_found && !t_string_found) {
1905+
if (len == 1) {
1906+
return asdl_seq_GET(strings, 0);
1907+
}
1908+
else if (bytes_found) {
1909+
return _build_concatenated_bytes(p, strings, lineno, col_offset,
1910+
end_lineno, end_col_offset, arena);
1911+
}
1912+
else {
1913+
return _build_concatenated_unicode(p, strings, lineno, col_offset,
1914+
end_lineno, end_col_offset, arena);
1915+
}
1916+
}
1917+
1918+
if (t_string_found) {
1919+
return _build_concatenated_template_str(p, strings, lineno,
1920+
col_offset, end_lineno, end_col_offset, arena);
1921+
}
1922+
1923+
return _build_concatenated_joined_str(p, strings, lineno,
1924+
col_offset, end_lineno, end_col_offset, arena);
1925+
}

0 commit comments

Comments
 (0)