2626
2727starttagopen = re .compile ('<[a-zA-Z]' )
2828piclose = re .compile ('>' )
29+ escapable_raw_text_close = re .compile ('</(title|textarea)>' , re .I )
2930commentclose = re .compile (r'--\s*>' )
3031# Note:
3132# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase):
8283 """
8384
8485 CDATA_CONTENT_ELEMENTS = ("script" , "style" )
86+ ESCAPABLE_RAW_TEXT_ELEMENTS = ("title" , "textarea" )
8587
8688 def __init__ (self , * , convert_charrefs = True ):
8789 """Initialize and reset this instance.
@@ -99,6 +101,7 @@ def reset(self):
99101 self .lasttag = '???'
100102 self .interesting = interesting_normal
101103 self .cdata_elem = None
104+ self .escapable_raw_text_elem = None
102105 super ().reset ()
103106
104107 def feed (self , data ):
@@ -120,6 +123,14 @@ def get_starttag_text(self):
120123 """Return full source of start tag: '<...>'."""
121124 return self .__starttag_text
122125
def set_escapable_raw_text_mode(self, elem):
    """Switch the parser into escapable-raw-text mode for *elem*.

    Stores the lower-cased element name in ``escapable_raw_text_elem``
    and replaces ``interesting`` with a case-insensitive pattern that
    matches only the closing tag of that element (optional whitespace
    is tolerated around the name).
    """
    self.escapable_raw_text_elem = elem.lower()
    # Escape the name so that any regex metacharacter it contains is
    # matched literally instead of widening the closing-tag pattern.
    closing_tag = r'</\s*%s\s*>' % re.escape(self.escapable_raw_text_elem)
    self.interesting = re.compile(closing_tag, re.I)
129+
def clear_escapable_raw_text_mode(self):
    """Leave escapable-raw-text mode.

    Forgets the tracked element name and restores ``interesting`` to the
    module-level ``interesting_normal`` pattern.
    """
    self.escapable_raw_text_elem = None
    self.interesting = interesting_normal
133+
123134 def set_cdata_mode (self , elem ):
124135 self .cdata_elem = elem .lower ()
125136 self .interesting = re .compile (r'</\s*%s\s*>' % self .cdata_elem , re .I )
@@ -136,7 +147,7 @@ def goahead(self, end):
136147 i = 0
137148 n = len (rawdata )
138149 while i < n :
139- if self .convert_charrefs and not self .cdata_elem :
150+ if self .convert_charrefs and not self .cdata_elem and not self . escapable_raw_text_elem :
140151 j = rawdata .find ('<' , i )
141152 if j < 0 :
142153 # if we can't find the next <, either we are at the end
@@ -155,11 +166,13 @@ def goahead(self, end):
155166 if match :
156167 j = match .start ()
157168 else :
169+ if self .escapable_raw_text_elem :
170+ break
158171 if self .cdata_elem :
159172 break
160173 j = n
161174 if i < j :
162- if self .convert_charrefs and not self .cdata_elem :
175+ if self .convert_charrefs and not self .cdata_elem and not self . escapable_raw_text_elem :
163176 self .handle_data (unescape (rawdata [i :j ]))
164177 else :
165178 self .handle_data (rawdata [i :j ])
@@ -336,6 +349,8 @@ def parse_starttag(self, i):
336349 self .handle_startendtag (tag , attrs )
337350 else :
338351 self .handle_starttag (tag , attrs )
352+ if tag in self .ESCAPABLE_RAW_TEXT_ELEMENTS :
353+ self .set_escapable_raw_text_mode (tag )
339354 if tag in self .CDATA_CONTENT_ELEMENTS :
340355 self .set_cdata_mode (tag )
341356 return endpos
@@ -411,8 +426,14 @@ def parse_endtag(self, i):
411426 self .handle_data (rawdata [i :gtpos ])
412427 return gtpos
413428
429+ if self .escapable_raw_text_elem is not None : # title or textarea
430+ if elem != self .escapable_raw_text_elem :
431+ self .handle_data (rawdata [i :gtpos ])
432+ return gtpos
433+
414434 self .handle_endtag (elem )
415435 self .clear_cdata_mode ()
436+ self .clear_escapable_raw_text_mode ()
416437 return gtpos
417438
418439 # Overridable -- finish processing of start+end tag: <tag.../>
0 commit comments