fix: fix html parser escapable raw text mode

timonviola · timonviola · commit b7900cd4cbe1 · 2025-06-09T20:24:09.000+02:00
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase):
     containing respectively the named or numeric reference as the
     argument.
     """
-
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # For escapable raw text elements (textarea and title), CDATA mode is reused
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
 
     def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
@@ -117,6 +117,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._raw_escapable = False
         super().reset()
 
     def feed(self, data):
@@ -140,11 +141,16 @@ def get_starttag_text(self):
 
     def set_cdata_mode(self, elem):
         self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
+        if self.cdata_elem in ["textarea", "title"]:
+            self._raw_escapable = True
+            self.interesting = re.compile('[&]')
+        else:
+            self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._raw_escapable = False
 
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
@@ -154,7 +160,7 @@ def goahead(self, end):
         i = 0
         n = len(rawdata)
         while i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                 j = rawdata.find('<', i)
                 if j < 0:
                     # if we can't find the next <, either we are at the end
@@ -177,7 +183,7 @@ def goahead(self, end):
                         break
                     j = n
             if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
@@ -210,7 +216,7 @@ def goahead(self, end):
                             k = i + 1
                     else:
                         k += 1
-                    if self.convert_charrefs and not self.cdata_elem:
+                    if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                         self.handle_data(unescape(rawdata[i:k]))
                     else:
                         self.handle_data(rawdata[i:k])
@@ -261,7 +267,7 @@ def goahead(self, end):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                 self.handle_data(unescape(rawdata[i:n]))
             else:
                 self.handle_data(rawdata[i:n])
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -295,6 +295,65 @@ def test_cdata_content(self):
                                     ("data", content),
                                     ("endtag", element_lower)])
 
+    def test_raw_text_content(self):
+        # Tags should be treated as text in raw text and escapable raw text content.
+        content = """<h1>tagshould be handled as text"""
+        elements = [
+            "script",
+            "style",
+            "title",
+            "textarea",
+            "SCRIPT",
+            "STYLE",
+            "TITLE",
+            "TEXTAREA",
+            "Script",
+            "Style",
+            "Title",
+            "Textarea",
+        ]
+        for element in elements:
+            source = f"<{element}>{content}"
+            self._run_check(source, [
+                ("starttag", element.lower(), []),
+                ("data", content)
+            ])
+
+    def test_escapable_raw_text_content(self):
+        # Charrefs should be unescaped in escapable raw text content.
+        class Collector(EventCollector):
+            pass
+
+        content = "Timon &amp; Pumba"
+        expected = "Timon & Pumba"
+        elements = [
+            "title",
+            "textarea",
+            "TITLE",
+            "TEXTAREA",
+            "Title",
+            "Textarea",
+        ]
+        for element in elements:
+            source = f"<{element}>{content}"
+            self._run_check(
+                source, [
+                  ("starttag", element.lower(), []),
+                  ('data', expected),
+                ],
+                collector=Collector(convert_charrefs=True),
+            )
+            # test with convert_charrefs=False
+            self._run_check(
+                source, [
+                  ("starttag", element.lower(), []),
+                  ('data', 'Timon '),
+                  ('entityref', 'amp'),
+                  ('data', ' Pumba')
+                ],
+            )
+
+
     def test_cdata_with_closing_tags(self):
         # see issue #13358
         # make sure that HTMLParser calls handle_data only once for each CDATA.