Skip to content

Commit 420af54

Browse files
committed
fix: add escapable raw text mode to html parser
1 parent 4b9e10d commit 420af54

File tree

2 files changed

+76
-3
lines changed

2 files changed

+76
-3
lines changed

Lib/html/parser.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
starttagopen = re.compile('<[a-zA-Z]')
2828
piclose = re.compile('>')
29+
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
2930
commentclose = re.compile(r'--\s*>')
3031
# Note:
3132
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase):
8283
"""
8384

8485
CDATA_CONTENT_ELEMENTS = ("script", "style")
86+
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")
8587

8688
def __init__(self, *, convert_charrefs=True):
8789
"""Initialize and reset this instance.
@@ -99,6 +101,7 @@ def reset(self):
99101
self.lasttag = '???'
100102
self.interesting = interesting_normal
101103
self.cdata_elem = None
104+
self.escapable_raw_text_elem = None
102105
super().reset()
103106

104107
def feed(self, data):
@@ -120,6 +123,14 @@ def get_starttag_text(self):
120123
"""Return full source of start tag: '<...>'."""
121124
return self.__starttag_text
122125

126+
def set_escapable_raw_text_mode(self, elem):
    """Switch the parser into escapable raw text mode for *elem*.

    Stores the lower-cased element name in ``self.escapable_raw_text_elem``
    and replaces ``self.interesting`` with a case-insensitive pattern that
    matches only the closing tag of that element (optional whitespace is
    accepted after ``</`` and before ``>``).
    """
    self.escapable_raw_text_elem = elem.lower()
    # Escape the name so that a tag containing regex metacharacters
    # (e.g. "a.b" or "x(") is matched literally instead of raising
    # re.error or matching an unrelated end tag.
    self.interesting = re.compile(
        r'</\s*%s\s*>' % re.escape(self.escapable_raw_text_elem), re.I)
129+
130+
def clear_escapable_raw_text_mode(self):
    """Leave escapable raw text mode.

    Restores ``self.interesting`` to the module-level ``interesting_normal``
    pattern and forgets the element name recorded by
    ``set_escapable_raw_text_mode``.
    """
    self.interesting, self.escapable_raw_text_elem = interesting_normal, None
133+
123134
def set_cdata_mode(self, elem):
124135
self.cdata_elem = elem.lower()
125136
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
@@ -136,7 +147,7 @@ def goahead(self, end):
136147
i = 0
137148
n = len(rawdata)
138149
while i < n:
139-
if self.convert_charrefs and not self.cdata_elem:
150+
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
140151
j = rawdata.find('<', i)
141152
if j < 0:
142153
# if we can't find the next <, either we are at the end
@@ -155,11 +166,13 @@ def goahead(self, end):
155166
if match:
156167
j = match.start()
157168
else:
169+
if self.escapable_raw_text_elem:
170+
break
158171
if self.cdata_elem:
159172
break
160173
j = n
161174
if i < j:
162-
if self.convert_charrefs and not self.cdata_elem:
175+
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
163176
self.handle_data(unescape(rawdata[i:j]))
164177
else:
165178
self.handle_data(rawdata[i:j])
@@ -336,6 +349,8 @@ def parse_starttag(self, i):
336349
self.handle_startendtag(tag, attrs)
337350
else:
338351
self.handle_starttag(tag, attrs)
352+
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
353+
self.set_escapable_raw_text_mode(tag)
339354
if tag in self.CDATA_CONTENT_ELEMENTS:
340355
self.set_cdata_mode(tag)
341356
return endpos
@@ -411,8 +426,14 @@ def parse_endtag(self, i):
411426
self.handle_data(rawdata[i:gtpos])
412427
return gtpos
413428

429+
if self.escapable_raw_text_elem is not None: # title or textarea
430+
if elem != self.escapable_raw_text_elem:
431+
self.handle_data(rawdata[i:gtpos])
432+
return gtpos
433+
414434
self.handle_endtag(elem)
415435
self.clear_cdata_mode()
436+
self.clear_escapable_raw_text_mode()
416437
return gtpos
417438

418439
# Overridable -- finish processing of start+end tag: <tag.../>

Lib/test/test_htmlparser.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def test_cdata_content(self):
285285
#'foo = </\nscript>',
286286
#'foo = </ script>',
287287
]
288-
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
288+
elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea']
289289
for content in contents:
290290
for element in elements:
291291
element_lower = element.lower()
@@ -317,6 +317,58 @@ def get_events(self):
317317
("endtag", element_lower)],
318318
collector=Collector(convert_charrefs=False))
319319

320+
def test_escapable_raw_text_content(self):
    """Check that markup inside <title>/<textarea> is reported as data.

    Every entry of ``contents`` is wrapped in each spelling of the element
    name and must come back as exactly: start tag, one data event holding
    the content verbatim, end tag.
    """
    # Each entry is a separate case; keep the trailing commas so adjacent
    # literals are not silently merged by implicit string concatenation.
    contents = [
        '<h2>This is a header</h2>',
        'Rebelious<h1>Heading',
        '<!-- not a comment --> &not-an-entity-ref;',
        "<not a='start tag'>",
        '<a href="" /> <p> <span></span>',
        'foo = "</scr" + "ipt>";',
        'foo = "</TITLE" + ">";',
        'foo = <\n/title> ',
        '<!-- document.write("</scr" + "ipt>"); -->',
        '\n//<![CDATA[\n',
        '\n<!-- //\nvar foo = 3.14;\n// -->\n',
        'foo = "</sty" + "le>";',
        '<!-- \u2603 -->',
    ]
    elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
    for content in contents:
        for element in elements:
            element_lower = element.lower()
            s = '<{element}>{content}</{element}>'.format(element=element,
                                                          content=content)
            self._run_check(s, [("starttag", element_lower, []),
                                ("data", content),
                                ("endtag", element_lower)])
349+
350+
def test_escapable_raw_text_with_closing_tags(self):
    """Check closing tags with stray whitespace for <title>/<textarea>.

    The content holds other closing tags (</p>, </style>, a split
    </script>) that must not end the element; only the matching closing
    tag, with optional whitespace around its name, may do so.
    """
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for the content
    # of each escapable raw text element.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    content = ('<!-- not a comment --> &not-an-entity-ref;\n'
               '<a href="" /> </p><p> <span></span></style>\n'
               "'</script' + '>'")
    # Exercise the elements this mode is about, not <script>.
    for name in ('title', 'textarea'):
        for element in [' ' + name, name + ' ', ' ' + name + ' ',
                        '\n' + name, name + '\n', '\n' + name + '\n']:
            element_lower = element.lower().strip()
            s = '<{name}>{content}</{element}>'.format(name=name,
                                                       element=element,
                                                       content=content)
            self._run_check(s, [("starttag", element_lower, []),
                                ("data", content),
                                ("endtag", element_lower)],
                            collector=Collector(convert_charrefs=False))
371+
320372
def test_comments(self):
321373
html = ("<!-- I'm a valid comment -->"
322374
'<!--me too!-->'

0 commit comments

Comments
 (0)