Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Correctly fold unknown-8bit originating from encoded words.
The unknown-8bit trick was designed to deal with unknown bytes in an
ASCII message, and it works fine for that.  However, I also tried to
extend it to handle bytes that can't be decoded using the charset
specified in an encoded word, and there it fails because there can be
other non-ASCII characters that were *successfully* decoded.  The fix is
simple: do the unknown-8bit encoding using the utf-8 codec.  This is
especially appropriate since anyone trying to do recovery on an unknown
byte string will probably attempt utf-8 first.
  • Loading branch information
bitdancer committed Dec 10, 2025
commit ee40a0c8d946d3c3fd553734004f580fb315feed
2 changes: 1 addition & 1 deletion Lib/email/_encoded_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def encode(string, charset='utf-8', encoding=None, lang=''):

"""
if charset == 'unknown-8bit':
- bstring = string.encode('ascii', 'surrogateescape')
+ bstring = string.encode('utf-8', 'surrogateescape')
else:
bstring = string.encode(charset)
if encoding is None:
Expand Down
8 changes: 8 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3265,5 +3265,13 @@ def test_fold_unfoldable_element_stealing_whitespace(self):
token = parser.get_address_list(text)[0]
self._test(token, expected, policy=policy)

def test_encoded_word_with_undecodable_bytes(self):
self._test(parser.get_address_list(
' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
)[0],
' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n',
)


if __name__ == '__main__':
unittest.main()