3131piclose = re .compile ('>' )
3232commentclose = re .compile (r'--\s*>' )
3333# Note:
34- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
35- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
35+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
3636# explode, so don't do it.
37- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
38- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37+ # see the HTML5 specs section "13.2.5.6 Tag open state",
38+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
39+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
41+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
3942tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
4043attrfind_tolerant = re .compile (r"""
4144 (
4952 )?
5053 (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
5154""" , re .VERBOSE )
52- locatetagend_tolerant = re .compile (r"""
55+ locatetagend = re .compile (r"""
5356 [a-zA-Z][^\t\n\r\f />]* # tag name
5457 [\t\n\r\f /]* # optional whitespace before attribute name
5558 (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
6366 )*
6467 >?
6568""" , re .VERBOSE )
69+ # The following variables are not used, but are temporarily left for
70+ # backward compatibility.
71+ locatestarttagend_tolerant = re .compile (r"""
72+ <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
73+ (?:[\s/]* # optional whitespace before attribute name
74+ (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
75+ (?:\s*=+\s* # value indicator
76+ (?:'[^']*' # LITA-enclosed value
77+ |"[^"]*" # LIT-enclosed value
78+ |(?!['"])[^>\s]* # bare value
79+ )
80+ \s* # possibly followed by a space
81+ )?(?:\s|/(?!>))*
82+ )*
83+ )?
84+ \s* # trailing whitespace
85+ """ , re .VERBOSE )
86+ endendtag = re .compile ('>' )
87+ endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
6688
6789# Character reference processing logic specific to attribute values
6890# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
@@ -315,7 +337,7 @@ def parse_html_declaration(self, i):
315337 return self .parse_bogus_comment (i )
316338
317339 # Internal -- parse bogus comment, return length or -1 if not terminated
318- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
319341 def parse_bogus_comment (self , i , report = 1 ):
320342 rawdata = self .rawdata
321343 assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -341,6 +363,8 @@ def parse_pi(self, i):
341363
342364 # Internal -- handle starttag, return end or -1 if not terminated
343365 def parse_starttag (self , i ):
366+ # See the HTML5 specs section "13.2.5.8 Tag name state"
367+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
344368 self .__starttag_text = None
345369 endpos = self .check_for_whole_start_tag (i )
346370 if endpos < 0 :
@@ -386,7 +410,7 @@ def parse_starttag(self, i):
386410 # or -1 if incomplete.
387411 def check_for_whole_start_tag (self , i ):
388412 rawdata = self .rawdata
389- match = locatetagend_tolerant .match (rawdata , i + 1 )
413+ match = locatetagend .match (rawdata , i + 1 )
390414 assert match
391415 j = match .end ()
392416 if rawdata [j - 1 ] != ">" :
@@ -395,24 +419,27 @@ def check_for_whole_start_tag(self, i):
395419
396420 # Internal -- parse endtag, return end or -1 if incomplete
397421 def parse_endtag (self , i ):
422+ # See the HTML5 specs section "13.2.5.7 End tag open state"
423+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
398424 rawdata = self .rawdata
399425 assert rawdata [i :i + 2 ] == "</" , "unexpected call to parse_endtag"
400- if rawdata .find ('>' , i + 2 ) < 0 :
426+ if rawdata .find ('>' , i + 2 ) < 0 : # fast check
401427 return - 1
402428 if not endtagopen .match (rawdata , i ): # </ + letter
403- # w3.org/TR/html5/tokenization.html#end-tag-open-state
404429 if rawdata [i + 2 :i + 3 ] == '>' : # </> is ignored
430+ # "missing-end-tag-name" parser error
405431 return i + 3
406432 else :
407433 return self .parse_bogus_comment (i )
408434
409- match = locatetagend_tolerant .match (rawdata , i + 2 )
435+ match = locatetagend .match (rawdata , i + 2 )
410436 assert match
411437 j = match .end ()
412438 if rawdata [j - 1 ] != ">" :
413439 return - 1
414440
415- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
441+ # find the name: "13.2.5.8 Tag name state"
442+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
416443 match = tagfind_tolerant .match (rawdata , i + 2 )
417444 assert match
418445 tag = match .group (1 ).lower ()
0 commit comments