From c8fc6585856b9be85e989665d956d6ada685983b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 13 Oct 2025 11:26:19 +0100 Subject: [PATCH 01/10] deprecate non-ascii --- Doc/deprecations/pending-removal-in-3.17.rst | 6 ++++++ Lib/encodings/__init__.py | 8 +++++++- Lib/test/test_codecs.py | 10 +++++++--- .../2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst | 3 +++ 4 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst diff --git a/Doc/deprecations/pending-removal-in-3.17.rst b/Doc/deprecations/pending-removal-in-3.17.rst index 0a1c2f08cab3bd..e769c9d371e133 100644 --- a/Doc/deprecations/pending-removal-in-3.17.rst +++ b/Doc/deprecations/pending-removal-in-3.17.rst @@ -23,6 +23,12 @@ Pending removal in Python 3.17 (Contributed by Shantanu Jain in :gh:`91896`.) +* :mod:`encodings`: + + - Passing non-ascii *encoding* names to :func:`encodings.normalize_encoding` + is deprecated and scheduled for removal in Python 3.17. + (Contributed by Stan Ulbrych in :gh:`136702`) + * :mod:`typing`: - Before Python 3.14, old-style unions were implemented using the private class diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..b048fdc0223b86 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,9 +26,10 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs +import warnings import sys from . import aliases @@ -55,6 +56,11 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") + if not encoding.isascii(): + warnings.warn( + "Support for non-ascii encoding names will be removed in 3.17", + DeprecationWarning, stacklevel=2) + chars = [] punct = False for c in encoding: diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c35a4508943506..f1f0ac5ad36fd2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3886,15 +3886,14 @@ def search_function(encoding): self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa---8', 2, 3, 4)) - self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4)) + with self.assertWarns(DeprecationWarning): + self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. @@ -3902,6 +3901,11 @@ def test_encodings_normalize_encoding(self): self.assertEqual(normalize('utf.8'), 'utf.8') self.assertEqual(normalize('utf...8'), 'utf...8') + # Non-ASCII *encoding* is deprecated. + with self.assertWarnsRegex(DeprecationWarning, + "Support for non-ascii encoding names will be removed in 3.17"): + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst new file mode 100644 index 00000000000000..88303f017f58c4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst @@ -0,0 +1,3 @@ +:mod:`encodings`: Deprecate passing a non-ascii *encoding* name to +:func:`encodings.normalize_encoding` and schedule removal of support for +Python 3.17. From 5b50daaddae581499840282c8ba8384d814925f0 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 13 Oct 2025 11:34:12 +0100 Subject: [PATCH 02/10] Relocate import --- Lib/encodings/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index b048fdc0223b86..4a30d786f55881 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,7 +29,6 @@ """ import codecs -import warnings import sys from . import aliases @@ -57,6 +56,7 @@ def normalize_encoding(encoding): encoding = str(encoding, "ascii") if not encoding.isascii(): + import warnings warnings.warn( "Support for non-ascii encoding names will be removed in 3.17", DeprecationWarning, stacklevel=2) From 95f2e65dbdee909c88cd8b6276ad9c803c4115cb Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 13 Oct 2025 12:13:40 +0100 Subject: [PATCH 03/10] sanitize charset names in email --- Lib/email/_header_value_parser.py | 1 + Lib/email/utils.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 91243378dc0441..aa81f3554ca74a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,6 +796,7 @@ def params(self): value = urllib.parse.unquote(value, encoding='latin-1') else: try: + charset = utils._sanitize_charset_name(charset, 'us-ascii') value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 3de1f0d24a15b0..67cc3a550b7d9d 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -446,6 +446,15 @@ def decode_params(params): new_params.append((name, '"%s"' % value)) return new_params +def _sanitize_charset_name(charset, fallback_charset): + if not charset: + return charset + sanitized = ''.join( + c for c in charset + if (ord(c) < 0xDC80 or ord(c) > 0xDCFF) and c.isascii() + ) + return sanitized if sanitized else fallback_charset + def collapse_rfc2231_value(value, errors='replace', fallback_charset='us-ascii'): if not isinstance(value, tuple) or len(value) != 3: @@ -458,6 +467,7 @@ def collapse_rfc2231_value(value, errors='replace', # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse # the value, so use the fallback_charset. charset = fallback_charset + charset = _sanitize_charset_name(charset, fallback_charset) rawbytes = bytes(text, 'raw-unicode-escape') try: return str(rawbytes, charset, errors) From fad52cd3cbc9d504190f3c52d84426590094f7a8 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Thu, 16 Oct 2025 16:48:47 +0100 Subject: [PATCH 04/10] Use table, replace with 'ascii' --- Lib/email/_header_value_parser.py | 4 ++-- Lib/email/utils.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index aa81f3554ca74a..d4d93006fb71ff 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,14 +796,14 @@ def params(self): value = urllib.parse.unquote(value, encoding='latin-1') else: try: - charset = utils._sanitize_charset_name(charset, 'us-ascii') + charset = utils._sanitize_charset_name(charset, 'ascii') value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for # unknown character set to make it easy to find, # because otherwise unknown charset is a silent # failure. - value = value.decode('us-ascii', 'surrogateescape') + value = value.decode('ascii', 'surrogateescape') if utils._has_surrogates(value): param.defects.append(errors.UndecodableBytesDefect()) value_parts.append(value) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 67cc3a550b7d9d..a93a7d0f86f849 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -446,17 +446,16 @@ def decode_params(params): new_params.append((name, '"%s"' % value)) return new_params +_SANITIZE_TABLE = str.maketrans({i: None for i in range(128, 65536)}) + def _sanitize_charset_name(charset, fallback_charset): if not charset: return charset - sanitized = ''.join( - c for c in charset - if (ord(c) < 0xDC80 or ord(c) > 0xDCFF) and c.isascii() - ) + sanitized = charset.translate(_SANITIZE_TABLE) return sanitized if sanitized else fallback_charset def collapse_rfc2231_value(value, errors='replace', - fallback_charset='us-ascii'): + fallback_charset='ascii'): if not isinstance(value, tuple) or len(value) != 3: return unquote(value) # While value comes to us as a unicode string, we need it to be a bytes From 9d6f06e00ebe87ed5c163e37ce12287c80a8071b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 1 Nov 2025 19:39:01 +0000 Subject: [PATCH 05/10] Review --- Lib/email/_header_value_parser.py | 3 +-- Lib/email/utils.py | 11 +---------- Lib/test/test_email/test_email.py | 5 ++++- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index d4d93006fb71ff..91243378dc0441 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,14 +796,13 @@ def params(self): value = urllib.parse.unquote(value, encoding='latin-1') else: try: - charset = utils._sanitize_charset_name(charset, 'ascii') value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for # unknown character set to make it easy to find, # because otherwise unknown charset is a silent # failure. - value = value.decode('ascii', 'surrogateescape') + value = value.decode('us-ascii', 'surrogateescape') if utils._has_surrogates(value): param.defects.append(errors.UndecodableBytesDefect()) value_parts.append(value) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index a93a7d0f86f849..3de1f0d24a15b0 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -446,16 +446,8 @@ def decode_params(params): new_params.append((name, '"%s"' % value)) return new_params -_SANITIZE_TABLE = str.maketrans({i: None for i in range(128, 65536)}) - -def _sanitize_charset_name(charset, fallback_charset): - if not charset: - return charset - sanitized = charset.translate(_SANITIZE_TABLE) - return sanitized if sanitized else fallback_charset - def collapse_rfc2231_value(value, errors='replace', - fallback_charset='ascii'): + fallback_charset='us-ascii'): if not isinstance(value, tuple) or len(value) != 3: return unquote(value) # While value comes to us as a unicode string, we need it to be a bytes @@ -466,7 +458,6 @@ def collapse_rfc2231_value(value, errors='replace', # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse # the value, so use the fallback_charset. charset = fallback_charset - charset = _sanitize_charset_name(charset, fallback_charset) rawbytes = bytes(text, 'raw-unicode-escape') try: return str(rawbytes, charset, errors) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index b8116d073a2670..3e216718fbb18d 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -5717,7 +5717,10 @@ def test_rfc2231_bad_character_in_encoding(self): """ msg = email.message_from_string(m) - self.assertEqual(msg.get_filename(), 'myfile.txt') + import warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + self.assertEqual(msg.get_filename(), 'myfile.txt') def test_rfc2231_single_tick_in_filename_extended(self): eq = self.assertEqual From 16697dcc6b5f5f0a316fa2e14537d516b9e16bb9 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 1 Nov 2025 20:22:51 +0000 Subject: [PATCH 06/10] Fix second warning --- Lib/test/test_email/test_headerregistry.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index ff7a6da644d572..7e9b56e800c5bd 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1,6 +1,7 @@ import datetime import textwrap import unittest +import warnings from email import errors from email import policy from email.message import Message @@ -247,7 +248,15 @@ def content_type_as_value(self, decoded = args[2] if l>2 and args[2] is not DITTO else source header = 'Content-Type:' + ' ' if source else '' folded = args[3] if l>3 else header + decoded + '\n' - h = self.make_header('Content-Type', source) + # Suppress deprecation warning for rfc2231_nonascii_in_charset_of_charset_parameter_value + if 'utf-8%E2%80%9D' in source: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', + message='Support for non-ascii encoding names', + category=DeprecationWarning) + h = self.make_header('Content-Type', source) + else: + h = self.make_header('Content-Type', source) self.assertEqual(h.content_type, content_type) self.assertEqual(h.maintype, maintype) self.assertEqual(h.subtype, subtype) From e4036f858f8ad56d0d9ba38c5fb2dac06cb9d215 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 1 Nov 2025 20:52:51 +0000 Subject: [PATCH 07/10] Convert to asserts --- Lib/test/test_email/test_email.py | 4 +--- Lib/test/test_email/test_headerregistry.py | 8 ++------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 3e216718fbb18d..8fe51f67d7349e 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -5717,9 +5717,7 @@ def test_rfc2231_bad_character_in_encoding(self): """ msg = email.message_from_string(m) - import warnings - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + with self.assertWarns(DeprecationWarning): self.assertEqual(msg.get_filename(), 'myfile.txt') def test_rfc2231_single_tick_in_filename_extended(self): diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 7e9b56e800c5bd..043eb376d67554 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1,7 +1,6 @@ import datetime import textwrap import unittest -import warnings from email import errors from email import policy from email.message import Message @@ -249,11 +248,8 @@ def content_type_as_value(self, header = 'Content-Type:' + ' ' if source else '' folded = args[3] if l>3 else header + decoded + '\n' # Suppress deprecation warning for rfc2231_nonascii_in_charset_of_charset_parameter_value - if 'utf-8%E2%80%9D' in source: - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', - message='Support for non-ascii encoding names', - category=DeprecationWarning) + if 'utf-8%E2%80%9D' in source and not 'ascii' in source: + with self.assertWarns(DeprecationWarning): h = self.make_header('Content-Type', source) else: h = self.make_header('Content-Type', source) From b8fc5f43c72d7a9ccf12605afe85022041acb244 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 1 Nov 2025 22:04:41 +0000 Subject: [PATCH 08/10] Fix for platforms with ordered tests --- Lib/test/test_email/test_headerregistry.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 043eb376d67554..67f655221c917d 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -247,8 +247,11 @@ def content_type_as_value(self, decoded = args[2] if l>2 and args[2] is not DITTO else source header = 'Content-Type:' + ' ' if source else '' folded = args[3] if l>3 else header + decoded + '\n' - # Suppress deprecation warning for rfc2231_nonascii_in_charset_of_charset_parameter_value - if 'utf-8%E2%80%9D' in source and not 'ascii' in source: + # Both rfc2231 test cases with utf-8%E2%80%9D raise warnings, + # clear encoding cache to ensure test isolation. + if 'utf-8%E2%80%9D' in source: + import encodings + encodings._cache.clear() with self.assertWarns(DeprecationWarning): h = self.make_header('Content-Type', source) else: From 7592af89de0b4905cc96d607290aea56a198535b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 1 Nov 2025 22:05:51 +0000 Subject: [PATCH 09/10] !fixup --- Lib/test/test_email/test_headerregistry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 67f655221c917d..1d0d0a49a82917 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -249,7 +249,7 @@ def content_type_as_value(self, folded = args[3] if l>3 else header + decoded + '\n' # Both rfc2231 test cases with utf-8%E2%80%9D raise warnings, # clear encoding cache to ensure test isolation. - if 'utf-8%E2%80%9D' in source: + if 'utf-8%E2%80%9D' in source and 'ascii' not in source: import encodings encodings._cache.clear() with self.assertWarns(DeprecationWarning): From 8c598998597aff232a6fbbbdc34f1d17e73f3506 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sun, 2 Nov 2025 14:26:08 +0000 Subject: [PATCH 10/10] Fix CI on Android and iOS --- Lib/email/_header_value_parser.py | 4 ++++ Lib/email/utils.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 91243378dc0441..c7f665b3990512 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,6 +796,10 @@ def params(self): value = urllib.parse.unquote(value, encoding='latin-1') else: try: + # Explicitly look up the codec for warning generation, see gh-140030 + # Can be removed in 3.17 + import codecs + codecs.lookup(charset) value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 3de1f0d24a15b0..d4824dc3601b2d 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -460,6 +460,10 @@ def collapse_rfc2231_value(value, errors='replace', charset = fallback_charset rawbytes = bytes(text, 'raw-unicode-escape') try: + # Explicitly look up the codec for warning generation, see gh-140030 + # Can be removed in 3.17 + import codecs + codecs.lookup(charset) return str(rawbytes, charset, errors) except LookupError: # charset is not a known codec.