squash! gh-121284: Fix email address header folding with parsed encoded-word

medmunds · bitdancer · medmunds · commit c249511d5e81 · 2025-03-05T10:57:10.000-08:00
[Better fix from @bitdancer.] Co-authored-by: R David Murray <rdmurray@bitdance.com>
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
@@ -1053,7 +1053,7 @@ def get_fws(value):
     fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
     return fws, newvalue
 
-def get_encoded_word(value):
+def get_encoded_word(value, terminal_type='vtext'):
     """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
 
     """
@@ -1092,7 +1092,7 @@ def get_encoded_word(value):
             ew.append(token)
             continue
         chars, *remainder = _wsp_splitter(text, 1)
-        vtext = ValueTerminal(chars, 'vtext')
+        vtext = ValueTerminal(chars, terminal_type)
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
@@ -1134,7 +1134,7 @@ def get_unstructured(value):
         valid_ew = True
         if value.startswith('=?'):
             try:
-                token, value = get_encoded_word(value)
+                token, value = get_encoded_word(value, 'utext')
             except _InvalidEwError:
                 valid_ew = False
             except errors.HeaderParseError:
@@ -1163,7 +1163,7 @@ def get_unstructured(value):
         # the parser to go in an infinite loop.
         if valid_ew and rfc2047_matcher.search(tok):
             tok, *remainder = value.partition('=?')
-        vtext = ValueTerminal(tok, 'vtext')
+        vtext = ValueTerminal(tok, 'utext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
         value = ''.join(remainder)
@@ -2813,7 +2813,7 @@ def _refold_parse_tree(parse_tree, *, policy):
             continue
         tstr = str(part)
         if not want_encoding:
-            if part.token_type == 'ptext':
+            if part.token_type in ('ptext', 'vtext'):
                 # Encode if tstr contains special characters.
                 want_encoding = not SPECIALSNL.isdisjoint(tstr)
             else:
@@ -2837,13 +2837,6 @@ def _refold_parse_tree(parse_tree, *, policy):
             _fold_mime_parameters(part, lines, maxlen, encoding)
             continue
 
-        allow_refolding_subparts = True
-        if part.token_type == 'encoded-word':
-            # A parsed encoded-word containing specials must remain encoded,
-            # to keep specials from sneaking into a structured header unquoted.
-            # (The encoded-word can be split for folding.)
-            allow_refolding_subparts = SPECIALSNL.isdisjoint(tstr)
-
         if want_encoding and not wrap_as_ew_blocked:
             if not part.as_ew_allowed:
                 want_encoding = False
@@ -2863,7 +2856,7 @@ def _refold_parse_tree(parse_tree, *, policy):
                 # want it on a line by itself even if it fits, or it
                 # doesn't fit on a line by itself.  Either way, fall through
                 # to unpacking the subparts and wrapping them.
-            if allow_refolding_subparts and not hasattr(part, 'encode'):
+            if not hasattr(part, 'encode'):
                 # It's not a Terminal, do each piece individually.
                 parts = list(part) + parts
                 want_encoding = False
@@ -2917,7 +2910,7 @@ def _refold_parse_tree(parse_tree, *, policy):
                 leading_whitespace = ''.join(whitespace_accumulator)
                 last_ew = None
                 continue
-        if allow_refolding_subparts and not hasattr(part, 'encode'):
+        if not hasattr(part, 'encode'):
             # It's not a terminal, try folding the subparts.
             newparts = list(part)
             if part.token_type == 'bare-quoted-string':
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
@@ -3083,8 +3083,8 @@ def test_address_list_with_specials_in_encoded_word(self):
         cases = [
             # (to, folded)
             ('=?utf-8?q?A_v=C3=A9ry_long_name_with=2C_comma?= <to@example.com>',
-             '=?utf-8?q?A_v=C3=A9ry_long_name_with?=\n'
-             ' =?utf-8?q?=2C_comma?= <to@example.com>\n'),
+             'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n'
+             ' =?utf-8?q?=2C?= comma <to@example.com>\n'),
             ('=?utf-8?q?This_long_name_does_not_need_encoded=2Dword?= <to@example.com>',
              'This long name does not need\n'
              ' encoded-word <to@example.com>\n'),