Skip to content

Commit 7824ee8

Browse files
Fix errors and rewrite tests.
1 parent 4a9ad26 commit 7824ee8

File tree

2 files changed

+99
-60
lines changed

2 files changed

+99
-60
lines changed

Lib/html/parser.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -169,9 +169,10 @@ def get_starttag_text(self):
169169

170170
def set_cdata_mode(self, elem):
171171
self.cdata_elem = elem.lower()
172-
if self.cdata_elem in ["textarea", "title"]:
173-
self._raw_escapable = True
174-
self.interesting = re.compile('[&]')
172+
self._raw_escapable = self.cdata_elem in ("textarea", "title")
173+
if self._raw_escapable and not self.convert_charrefs:
174+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
175+
re.IGNORECASE|re.ASCII)
175176
else:
176177
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
177178
re.IGNORECASE|re.ASCII)
@@ -189,7 +190,7 @@ def goahead(self, end):
189190
i = 0
190191
n = len(rawdata)
191192
while i < n:
192-
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
193+
if self.convert_charrefs and not self.cdata_elem:
193194
j = rawdata.find('<', i)
194195
if j < 0:
195196
# if we can't find the next <, either we are at the end

Lib/test/test_htmlparser.py

Lines changed: 94 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -317,63 +317,48 @@ def test_style_content(self, content):
317317
("data", content),
318318
("endtag", "style")])
319319

320-
def test_raw_text_content(self):
321-
# Tags should be treated as text in raw text and escapable raw text content.
322-
content = """<h1>tagshould be handled as text"""
323-
elements = [
324-
"script",
325-
"style",
326-
"title",
327-
"textarea",
328-
"SCRIPT",
329-
"STYLE",
330-
"TITLE",
331-
"TEXTAREA",
332-
"Script",
333-
"Style",
334-
"Title",
335-
"Textarea",
336-
]
337-
for element in elements:
338-
source = f"<{element}>{content}"
339-
self._run_check(source, [
340-
("starttag", element.lower(), []),
341-
("data", content)
342-
])
320+
@support.subTests('content', [
321+
'<!-- not a comment -->',
322+
"<not a='start tag'>",
323+
'<![CDATA[not a cdata]]>',
324+
'<!not a bogus comment>',
325+
'</not a bogus comment>',
326+
'\u2603',
327+
'< /title>',
328+
'</ title>',
329+
'</titled>',
330+
'</title\v>',
331+
'</title\xa0>',
332+
'</tıtle>',
333+
])
334+
def test_title_content(self, content):
335+
source = f"<title>{content}</title>"
336+
self._run_check(source, [
337+
("starttag", "title", []),
338+
("data", content),
339+
("endtag", "title"),
340+
])
343341

344-
def test_escapable_raw_text_content(self):
345-
# Charrefs should be escaped in escapable raw text content.
346-
class Collector(EventCollector):
347-
pass
348-
349-
content = "Timon &amp; Pumba"
350-
expected = "Timon & Pumba"
351-
elements = [
352-
"title",
353-
"textarea",
354-
"TITLE",
355-
"TEXTAREA",
356-
"Title",
357-
"Textarea",
358-
]
359-
for element in elements:
360-
source = f"<{element}>{content}"
361-
self._run_check(
362-
source, [
363-
("starttag", element.lower(), []),
364-
('data', expected),
365-
],
366-
collector=Collector(convert_charrefs=True),
367-
)
368-
# test with convert_charrefs=False
369-
self._run_check(
370-
source, [
371-
("starttag", element.lower(), []),
372-
('data', 'Timon '),
373-
('entityref', 'amp'),
374-
('data', ' Pumba')
375-
],
376-
)
342+
@support.subTests('content', [
343+
'<!-- not a comment -->',
344+
"<not a='start tag'>",
345+
'<![CDATA[not a cdata]]>',
346+
'<!not a bogus comment>',
347+
'</not a bogus comment>',
348+
'\u2603',
349+
'< /textarea>',
350+
'</ textarea>',
351+
'</textareable>',
352+
'</textarea\v>',
353+
'</textarea\xa0>',
354+
])
355+
def test_textarea_content(self, content):
356+
source = f"<textarea>{content}</textarea>"
357+
self._run_check(source, [
358+
("starttag", "textarea", []),
359+
("data", content),
360+
("endtag", "textarea"),
361+
])
377362

378363
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
379364
'script/', 'script foo=bar', 'script foo=">"'])
@@ -404,6 +389,38 @@ def test_style_closing_tag(self, endtag):
404389
("endtag", "style")],
405390
collector=EventCollectorNoNormalize(convert_charrefs=False))
406391

392+
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
393+
'title/', 'title foo=bar', 'title foo=">"'])
394+
def test_title_closing_tag(self, endtag):
395+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
396+
s = f'<TitLe>{content}</{endtag}>'
397+
self._run_check(s, [("starttag", "title", []),
398+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
399+
("endtag", "title")],
400+
collector=EventCollectorNoNormalize(convert_charrefs=True))
401+
self._run_check(s, [("starttag", "title", []),
402+
('data', '<!-- not a comment --><i>Egg '),
403+
('entityref', 'amp'),
404+
('data', ' Spam</i>'),
405+
("endtag", "title")],
406+
collector=EventCollectorNoNormalize(convert_charrefs=False))
407+
408+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
409+
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
410+
def test_textarea_closing_tag(self, endtag):
411+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
412+
s = f'<TexTarEa>{content}</{endtag}>'
413+
self._run_check(s, [("starttag", "textarea", []),
414+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
415+
("endtag", "textarea")],
416+
collector=EventCollectorNoNormalize(convert_charrefs=True))
417+
self._run_check(s, [("starttag", "textarea", []),
418+
('data', '<!-- not a comment --><i>Egg '),
419+
('entityref', 'amp'),
420+
('data', ' Spam</i>'),
421+
("endtag", "textarea")],
422+
collector=EventCollectorNoNormalize(convert_charrefs=False))
423+
407424
@support.subTests('tail,end', [
408425
('', False),
409426
('<', False),
@@ -421,6 +438,27 @@ def test_eof_in_script(self, tail, end):
421438
("data", content if end else content + tail)],
422439
collector=EventCollectorNoNormalize(convert_charrefs=False))
423440

441+
@support.subTests('tail,end', [
442+
('', False),
443+
('<', False),
444+
('</', False),
445+
('</t', False),
446+
('</title', False),
447+
('</title ', True),
448+
('</title foo=bar', True),
449+
('</title foo=">', True),
450+
])
451+
def test_eof_in_title(self, tail, end):
452+
s = f'<TitLe>Egg &amp; Spam{tail}'
453+
self._run_check(s, [("starttag", "title", []),
454+
("data", "Egg & Spam" + ('' if end else tail))],
455+
collector=EventCollectorNoNormalize(convert_charrefs=True))
456+
self._run_check(s, [("starttag", "title", []),
457+
('data', 'Egg '),
458+
('entityref', 'amp'),
459+
('data', ' Spam' + ('' if end else tail))],
460+
collector=EventCollectorNoNormalize(convert_charrefs=False))
461+
424462
def test_comments(self):
425463
html = ("<!-- I'm a valid comment -->"
426464
'<!--me too!-->'

0 commit comments

Comments
 (0)