Skip to content

Commit 955db4e

Browse files
Address review comments.
1 parent d05303b commit 955db4e

File tree

2 files changed

+42
-15
lines changed

2 files changed

+42
-15
lines changed

Lib/html/parser.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,14 @@
3131
piclose = re.compile('>')
3232
commentclose = re.compile(r'--\s*>')
3333
# Note:
34-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
35-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
35+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3636
# explode, so don't do it.
37-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
38-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37+
# see the HTML5 spec sections "13.2.5.6 Tag open state",
38+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
39+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
41+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
3942
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
4043
attrfind_tolerant = re.compile(r"""
4144
(
@@ -49,7 +52,7 @@
4952
)?
5053
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
5154
""", re.VERBOSE)
52-
locatetagend_tolerant = re.compile(r"""
55+
locatetagend = re.compile(r"""
5356
[a-zA-Z][^\t\n\r\f />]* # tag name
5457
[\t\n\r\f /]* # optional whitespace before attribute name
5558
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
@@ -63,6 +66,25 @@
6366
)*
6467
>?
6568
""", re.VERBOSE)
69+
# The following variables are not used, but are temporarily left for
70+
# backward compatibility.
71+
locatestarttagend_tolerant = re.compile(r"""
72+
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
73+
(?:[\s/]* # optional whitespace before attribute name
74+
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
75+
(?:\s*=+\s* # value indicator
76+
(?:'[^']*' # LITA-enclosed value
77+
|"[^"]*" # LIT-enclosed value
78+
|(?!['"])[^>\s]* # bare value
79+
)
80+
\s* # possibly followed by a space
81+
)?(?:\s|/(?!>))*
82+
)*
83+
)?
84+
\s* # trailing whitespace
85+
""", re.VERBOSE)
86+
endendtag = re.compile('>')
87+
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6688

6789
# Character reference processing logic specific to attribute values
6890
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
@@ -315,7 +337,7 @@ def parse_html_declaration(self, i):
315337
return self.parse_bogus_comment(i)
316338

317339
# Internal -- parse bogus comment, return length or -1 if not terminated
318-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
319341
def parse_bogus_comment(self, i, report=1):
320342
rawdata = self.rawdata
321343
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -341,6 +363,8 @@ def parse_pi(self, i):
341363

342364
# Internal -- handle starttag, return end or -1 if not terminated
343365
def parse_starttag(self, i):
366+
# See the HTML5 specs section "13.2.5.8 Tag name state"
367+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
344368
self.__starttag_text = None
345369
endpos = self.check_for_whole_start_tag(i)
346370
if endpos < 0:
@@ -386,7 +410,7 @@ def parse_starttag(self, i):
386410
# or -1 if incomplete.
387411
def check_for_whole_start_tag(self, i):
388412
rawdata = self.rawdata
389-
match = locatetagend_tolerant.match(rawdata, i+1)
413+
match = locatetagend.match(rawdata, i+1)
390414
assert match
391415
j = match.end()
392416
if rawdata[j-1] != ">":
@@ -395,24 +419,27 @@ def check_for_whole_start_tag(self, i):
395419

396420
# Internal -- parse endtag, return end or -1 if incomplete
397421
def parse_endtag(self, i):
422+
# See the HTML5 specs section "13.2.5.7 End tag open state"
423+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
398424
rawdata = self.rawdata
399425
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
400-
if rawdata.find('>', i+2) < 0:
426+
if rawdata.find('>', i+2) < 0: # fast check
401427
return -1
402428
if not endtagopen.match(rawdata, i): # </ + letter
403-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
404429
if rawdata[i+2:i+3] == '>': # </> is ignored
430+
# "missing-end-tag-name" parse error
405431
return i+3
406432
else:
407433
return self.parse_bogus_comment(i)
408434

409-
match = locatetagend_tolerant.match(rawdata, i+2)
435+
match = locatetagend.match(rawdata, i+2)
410436
assert match
411437
j = match.end()
412438
if rawdata[j-1] != ">":
413439
return -1
414440

415-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
441+
# find the name: "13.2.5.8 Tag name state"
442+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
416443
match = tagfind_tolerant.match(rawdata, i+2)
417444
assert match
418445
tag = match.group(1).lower()

Misc/NEWS.d/next/Library/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ according to the HTML5 standard.
99

1010
* Null character (U+0000) no longer ends the tag name.
1111

12-
* Attributes and slashes after the tag name in end tags are now correctly
13-
parsed as comments, instead of terminating after the first ``>``
14-
in quoted attribute value. E.g. ``</script/foo=">"/>``.
12+
* Attributes and slashes after the tag name in end tags are now ignored,
13+
instead of terminating after the first ``>`` in a quoted attribute value.
14+
E.g. ``</script/foo=">"/>``.
1515

1616
* Multiple slashes and whitespace characters between the last attribute and closing ``>``
17-
are now accepted in both start and end tags. E.g. ``<a foo=bar/ //>``.
17+
are now ignored in both start and end tags. E.g. ``<a foo=bar/ //>``.
1818

1919
* Multiple ``=`` between attribute name and value are no longer collapsed.
2020
E.g. ``<a foo==bar>`` produces attribute "foo" with value "=bar".

0 commit comments

Comments
 (0)