Skip to content

Commit 524cac5

Browse files
* Make CDATA section parsing context-dependent.
* Add HTMLParser.support_cdata().
1 parent 9e1ae33 commit 524cac5

File tree

3 files changed

+74
-13
lines changed

3 files changed

+74
-13
lines changed

Doc/library/html.parser.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,17 @@ The output will then be:
121121
attributes can be preserved, etc.).
122122

123123

124+
.. method:: HTMLParser.support_cdata(flag)
125+
126+
Sets how the parser will parse CDATA sections.
127+
If *flag* is true, then the :meth:`unknown_decl` method will be called
128+
for the CDATA section ``<![CDATA[...]]>``.
129+
If *flag* is false, then the :meth:`handle_comment` method will be called
130+
for ``<![CDATA[...>`` (the markup is treated as a bogus comment that ends at the first ``>``).
131+
132+
.. versionadded:: 3.13.6
133+
134+
124135
The following methods are called when data or markup elements are encountered
125136
and they are meant to be overridden in a subclass. The base class
126137
implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):

Lib/html/parser.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def reset(self):
144144
self.lasttag = '???'
145145
self.interesting = interesting_normal
146146
self.cdata_elem = None
147+
self._support_cdata = False
147148
super().reset()
148149

149150
def feed(self, data):
@@ -174,6 +175,9 @@ def clear_cdata_mode(self):
174175
self.interesting = interesting_normal
175176
self.cdata_elem = None
176177

178+
def support_cdata(self, flag=True):
179+
self._support_cdata = flag
180+
177181
# Internal -- handle data as far as reasonable. May leave state
178182
# and data to be processed by a subsequent call. If 'end' is
179183
# true, force handling all data as if followed by EOF marker.
@@ -249,7 +253,10 @@ def goahead(self, end):
249253
break
250254
self.handle_comment(rawdata[i+4:j])
251255
elif startswith("<![CDATA[", i):
252-
self.unknown_decl(rawdata[i+3:])
256+
if self._support_cdata:
257+
self.unknown_decl(rawdata[i+3:])
258+
else:
259+
self.handle_comment(rawdata[i+1:])
253260
elif rawdata[i:i+9].lower() == '<!doctype':
254261
self.handle_decl(rawdata[i+2:])
255262
elif startswith("<!", i):
@@ -325,11 +332,14 @@ def parse_html_declaration(self, i):
325332
# this case is actually already handled in goahead()
326333
return self.parse_comment(i)
327334
elif rawdata[i:i+9] == '<![CDATA[':
328-
j = rawdata.find(']]>')
329-
if j < 0:
330-
return -1
331-
self.unknown_decl(rawdata[i+3: j])
332-
return j + 3
335+
if self._support_cdata:
336+
j = rawdata.find(']]>', i+9)
337+
if j < 0:
338+
return -1
339+
self.unknown_decl(rawdata[i+3: j])
340+
return j + 3
341+
else:
342+
return self.parse_bogus_comment(i)
333343
elif rawdata[i:i+9].lower() == '<!doctype':
334344
# find the closing >
335345
gtpos = rawdata.find('>', i+9)

Lib/test/test_htmlparser.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,16 @@ def get_events(self):
3434

3535
def handle_starttag(self, tag, attrs):
3636
self.append(("starttag", tag, attrs))
37+
if tag == 'svg':
38+
self.support_cdata(True)
3739

3840
def handle_startendtag(self, tag, attrs):
3941
self.append(("startendtag", tag, attrs))
4042

4143
def handle_endtag(self, tag):
4244
self.append(("endtag", tag))
45+
if tag == 'svg':
46+
self.support_cdata(False)
4347

4448
# all other markup
4549

@@ -643,10 +647,22 @@ def test_eof_in_declarations(self):
643647
('<!', [('comment', '')]),
644648
('<!-', [('comment', '-')]),
645649
('<![', [('comment', '[')]),
646-
('<![CDATA[', [('unknown decl', 'CDATA[')]),
647-
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
648-
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
649-
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
650+
('<![CDATA[', [('comment', '![CDATA[')]),
651+
('<![CDATA[x', [('comment', '![CDATA[x')]),
652+
('<![CDATA[x]', [('comment', '![CDATA[x]')]),
653+
('<![CDATA[x]]', [('comment', '![CDATA[x]]')]),
654+
('<svg><text y="100"><![CDATA[',
655+
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
656+
('unknown decl', 'CDATA[')]),
657+
('<svg><text y="100"><![CDATA[x',
658+
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
659+
('unknown decl', 'CDATA[x')]),
660+
('<svg><text y="100"><![CDATA[x]',
661+
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
662+
('unknown decl', 'CDATA[x]')]),
663+
('<svg><text y="100"><![CDATA[x]]',
664+
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
665+
('unknown decl', 'CDATA[x]]')]),
650666
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
651667
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
652668
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -737,11 +753,35 @@ def test_broken_condcoms(self):
737753
' printf("[<marquee>How?</marquee>]");\n'
738754
' }\n'),
739755
])
740-
def test_cdata_section(self, content):
756+
def test_cdata_section_content(self, content):
741757
# See "13.2.5.42 Markup declaration open state",
742758
# "13.2.5.69 CDATA section state", and issue bpo-32876.
743-
html = f'<![CDATA[{content}]]>'
744-
expected = [('unknown decl', 'CDATA[' + content)]
759+
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
760+
expected = [
761+
('starttag', 'svg', []),
762+
('starttag', 'text', [('y', '100')]),
763+
('unknown decl', 'CDATA[' + content),
764+
('endtag', 'text'),
765+
('endtag', 'svg'),
766+
]
767+
self._run_check(html, expected)
768+
769+
def test_cdata_section(self):
770+
# See "13.2.5.42 Markup declaration open state".
771+
html = ('<![CDATA[foo<br>bar]]>'
772+
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
773+
'<![CDATA[foo<br>bar]]>')
774+
expected = [
775+
('comment', '[CDATA[foo<br'),
776+
('data', 'bar]]>'),
777+
('starttag', 'svg', []),
778+
('starttag', 'text', [('y', '100')]),
779+
('unknown decl', 'CDATA[foo<br>bar'),
780+
('endtag', 'text'),
781+
('endtag', 'svg'),
782+
('comment', '[CDATA[foo<br'),
783+
('data', 'bar]]>'),
784+
]
745785
self._run_check(html, expected)
746786

747787
def test_convert_charrefs_dropped_text(self):

0 commit comments

Comments
 (0)