Skip to content

Commit b7900cd

Browse files
committed
fix: fix html parser raw text escapable mode
1 parent 28d91d0 commit b7900cd

File tree

2 files changed

+72
-7
lines changed

2 files changed

+72
-7
lines changed

Lib/html/parser.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase):
9898
containing respectively the named or numeric reference as the
9999
argument.
100100
"""
101-
102-
CDATA_CONTENT_ELEMENTS = ("script", "style")
101+
# For escapable raw text elements (textarea and title), CDATA mode is reused
102+
CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
103103

104104
def __init__(self, *, convert_charrefs=True):
105105
"""Initialize and reset this instance.
@@ -117,6 +117,7 @@ def reset(self):
117117
self.lasttag = '???'
118118
self.interesting = interesting_normal
119119
self.cdata_elem = None
120+
self._raw_escapable = False
120121
super().reset()
121122

122123
def feed(self, data):
@@ -140,11 +141,16 @@ def get_starttag_text(self):
140141

141142
def set_cdata_mode(self, elem):
142143
self.cdata_elem = elem.lower()
143-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
144+
if self.cdata_elem in ["textarea", "title"]:
145+
self._raw_escapable = True
146+
self.interesting = re.compile('[&]')
147+
else:
148+
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
144149

145150
def clear_cdata_mode(self):
146151
self.interesting = interesting_normal
147152
self.cdata_elem = None
153+
self._raw_escapable = False
148154

149155
# Internal -- handle data as far as reasonable. May leave state
150156
# and data to be processed by a subsequent call. If 'end' is
@@ -154,7 +160,7 @@ def goahead(self, end):
154160
i = 0
155161
n = len(rawdata)
156162
while i < n:
157-
if self.convert_charrefs and not self.cdata_elem:
163+
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
158164
j = rawdata.find('<', i)
159165
if j < 0:
160166
# if we can't find the next <, either we are at the end
@@ -177,7 +183,7 @@ def goahead(self, end):
177183
break
178184
j = n
179185
if i < j:
180-
if self.convert_charrefs and not self.cdata_elem:
186+
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
181187
self.handle_data(unescape(rawdata[i:j]))
182188
else:
183189
self.handle_data(rawdata[i:j])
@@ -210,7 +216,7 @@ def goahead(self, end):
210216
k = i + 1
211217
else:
212218
k += 1
213-
if self.convert_charrefs and not self.cdata_elem:
219+
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
214220
self.handle_data(unescape(rawdata[i:k]))
215221
else:
216222
self.handle_data(rawdata[i:k])
@@ -261,7 +267,7 @@ def goahead(self, end):
261267
assert 0, "interesting.search() lied"
262268
# end while
263269
if end and i < n:
264-
if self.convert_charrefs and not self.cdata_elem:
270+
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
265271
self.handle_data(unescape(rawdata[i:n]))
266272
else:
267273
self.handle_data(rawdata[i:n])

Lib/test/test_htmlparser.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,65 @@ def test_cdata_content(self):
295295
("data", content),
296296
("endtag", element_lower)])
297297

298+
def test_raw_text_content(self):
299+
# Tags should be treated as text in raw text and escapable raw text content.
300+
content = """<h1>tagshould be handled as text"""
301+
elements = [
302+
"script",
303+
"style",
304+
"title",
305+
"textarea",
306+
"SCRIPT",
307+
"STYLE",
308+
"TITLE",
309+
"TEXTAREA",
310+
"Script",
311+
"Style",
312+
"Title",
313+
"Textarea",
314+
]
315+
for element in elements:
316+
source = f"<{element}>{content}"
317+
self._run_check(source, [
318+
("starttag", element.lower(), []),
319+
("data", content)
320+
])
321+
322+
def test_escapable_raw_text_content(self):
323+
# Charrefs should be unescaped in escapable raw text content.
324+
class Collector(EventCollector):
325+
pass
326+
327+
content = "Timon &amp; Pumba"
328+
expected = "Timon & Pumba"
329+
elements = [
330+
"title",
331+
"textarea",
332+
"TITLE",
333+
"TEXTAREA",
334+
"Title",
335+
"Textarea",
336+
]
337+
for element in elements:
338+
source = f"<{element}>{content}"
339+
self._run_check(
340+
source, [
341+
("starttag", element.lower(), []),
342+
('data', expected),
343+
],
344+
collector=Collector(convert_charrefs=True),
345+
)
346+
# test with convert_charrefs=False
347+
self._run_check(
348+
source, [
349+
("starttag", element.lower(), []),
350+
('data', 'Timon '),
351+
('entityref', 'amp'),
352+
('data', ' Pumba')
353+
],
354+
)
355+
356+
298357
def test_cdata_with_closing_tags(self):
299358
# see issue #13358
300359
# make sure that HTMLParser calls handle_data only once for each CDATA.

0 commit comments

Comments
 (0)