bpo-22833: Fix bytes/str inconsistency in email.header.decode_header()

dlenski · dlenski · commit 4729b3739de3 · 2022-01-20T18:40:25.000-08:00
This function's possible return types have been non-intuitive and surprising
for the entirety of its Python 3.x history. It can return either:

1. `typing.List[typing.Tuple[bytes, typing.Optional[str]]]`
2. or `typing.List[typing.Tuple[str, None]]`, of length exactly 1

This has meant that any user of this function must be prepared to accept
either `bytes` or `str` for the first member of the 2-tuples it returns,
which is a very surprising behavior in Python 3.x, particularly given
that the second member of the tuple is supposed to represent the
charset/encoding of the first member.

This change eliminates case (2), ensuring that
`email.header.decode_header()` always returns `bytes`, never `str`, as the
first member of the 2-tuples it returns. It also adds a test case to verify
this behavior.
diff --git a/Lib/email/header.py b/Lib/email/header.py
@@ -61,7 +61,7 @@
 def decode_header(header):
     """Decode a message header value without converting charset.
 
-    Returns a list of (string, charset) pairs containing each of the decoded
+    Returns a list of (bytes, charset) pairs containing each of the decoded
     parts of the header.  Charset is None for non-encoded parts of the header,
     otherwise a lower-case string containing the name of the character set
     specified in the encoded string.
@@ -78,7 +78,7 @@ def decode_header(header):
                     for string, charset in header._chunks]
     # If no encoding, just return the header with no charset.
     if not ecre.search(header):
-        return [(header, None)]
+        return [(bytes(header, 'raw-unicode-escape'), None)]
     # First step is to parse all the encoded parts into triplets of the form
     # (encoded_string, encoding, charset).  For unencoded strings, the last
     # two parts will be None.
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py
@@ -2432,6 +2432,18 @@ def test_multiline_header(self):
         self.assertEqual(str(make_header(decode_header(s))),
                          '"Müller T" <T.Mueller@xxx.com>')
 
+    def test_unencoded_ascii(self):
+        # issue 22833
+        s = 'header without encoded words'
+        self.assertEqual(decode_header(s),
+            [(b'header without encoded words', None)])
+
+    def test_unencoded_utf8(self):
+        # issue 22833
+        s = 'header with unexpected non ASCII caract\xe8res'
+        self.assertEqual(decode_header(s),
+            [(b'header with unexpected non ASCII caract\xe8res', None)])
+
 
 # Test the MIMEMessage class
 class TestMIMEMessage(TestEmailBase):
diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst
@@ -0,0 +1,3 @@
+The :func:`email.header.decode_header` function now always provides :class:`bytes`,
+never :class:`str`, as the first member of the tuples it returns. Previously, it would
+return (str, None) when decoding a header consisting only of a single, unencoded part.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+The :func:`email.header.decode_header` function now always provides :class:`bytes`,
	`2`	+never :class:`str`, as the first member of the tuples it returns. Previously, it would
	`3`	`+return (str, None) when decoding a header consisting only of a single, unencoded part.`