Skip to content

gh-128110: Fix rfc2047 handling in email parser address headers #130749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,18 @@ def get_atom(value):
atom.append(token)
if value and value[0] in CFWS_LEADER:
token, value = get_cfws(value)
# Peek ahead to ignore linear-white-space between adjacent encoded-words.
if (
atom[-1].token_type == 'encoded-word'
and value.startswith('=?')
and all(ws.token_type == 'fws' for ws in token) # not comments
):
try:
get_encoded_word(value)
except errors.HeaderParseError:
pass
else:
token = EWWhiteSpaceTerminal(token, 'fws')
atom.append(token)
return atom, value

Expand Down
89 changes: 89 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,79 @@ def get_phrase_cfws_only_raises(self):
with self.assertRaises(errors.HeaderParseError):
parser.get_phrase(' (foo) ')

def test_get_phrase_adjacent_ew(self):
    # Linear-white-space between adjacent encoded-words must be ignored.
    # For structured headers that rule is implemented in get_atom, but the
    # joined result is easier to observe through get_phrase.
    source = '=?ascii?q?Joi?= \t =?ascii?q?ned?='
    joined = 'Joined'
    self._test_get_x(parser.get_phrase, source, joined, joined, [], '')

def test_get_phrase_adjacent_ew_different_encodings(self):
    # Adjacent encoded-words that declare different charsets (utf-8, then
    # iso-8859-1) are still expected to decode into one unbroken word.
    source = '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?='
    decoded = 'Bérénice'
    self._test_get_x(parser.get_phrase, source, decoded, decoded, [], '')

def test_get_phrase_adjacent_ew_encoded_spaces(self):
    # Spaces written as '_' inside an encoded-word are expected to survive,
    # while the literal whitespace separating the encoded-words is dropped.
    source = (
        '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?='
    )
    decoded = 'Encoded spaces preserved'
    self._test_get_x(parser.get_phrase, source, decoded, decoded, [], '')

def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self):
    # A comment between two encoded-words is not linear-white-space, so the
    # words on either side of it must not be joined together.
    source = '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?='
    with_comment = 'Comment (is not) linear-white-space'
    without_comment = 'Comment linear-white-space'
    self._test_get_x(
        parser.get_phrase, source, with_comment, without_comment, [], '',
        comments=['is not'],
    )

def test_get_phrase_adjacent_ew_no_error_on_defects(self):
    # The second encoded-word contains raw whitespace, which is reported as
    # a defect, yet it is still expected to join with the first one.
    source = '=?ascii?q?Def?= =?ascii?q?ect still joins?='
    decoded = 'Defect still joins'
    expected_defects = [errors.InvalidHeaderDefect]
    self._test_get_x(
        parser.get_phrase, source, decoded, decoded, expected_defects, '',
    )

def test_get_phrase_adjacent_ew_ignore_non_ew(self):
    # '=?join?=' begins with '=?' but is not an encoded-word, so the space
    # in front of it is expected to be kept and the text left untouched.
    source = '=?ascii?q?No?= =?join?= for non-ew'
    unjoined = 'No =?join?= for non-ew'
    self._test_get_x(parser.get_phrase, source, unjoined, unjoined, [], '')

def test_get_phrase_adjacent_ew_ignore_invalid_ew(self):
    # '=?ascii?rot13?wbva=' resembles an encoded-word but is an invalid one,
    # so no joining is expected and the text passes through unchanged.
    source = '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew'
    unjoined = 'No =?ascii?rot13?wbva= for invalid ew'
    self._test_get_x(parser.get_phrase, source, unjoined, unjoined, [], '')

def test_get_phrase_adjacent_ew_missing_space(self):
    # Two encoded-words with no whitespace at all between them: the parts
    # are still expected to join, with a defect recorded for the missing
    # trailing whitespace.
    source = '=?ascii?q?Joi?==?ascii?q?ned?='
    joined = 'Joined'
    expected_defects = [errors.InvalidHeaderDefect]
    self._test_get_x(
        parser.get_phrase, source, joined, joined, expected_defects, '',
    )

# get_local_part

def test_get_local_part_simple(self):
Expand Down Expand Up @@ -2365,6 +2438,22 @@ def test_get_address_rfc2047_display_name(self):
self.assertEqual(address[0].token_type,
'mailbox')

def test_get_address_rfc2047_display_name_adjacent_ews(self):
    # A display name split across two adjacent encoded-words is expected to
    # decode to a single word in front of the angle-bracketed address.
    source = (
        '=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>'
    )
    decoded = 'Bérénice <foo@example.com>'
    address = self._test_get_x(
        parser.get_address, source, decoded, decoded, [], '',
    )
    # The parsed result should be one address holding exactly one mailbox.
    self.assertEqual(address.token_type, 'address')
    self.assertEqual(len(address.mailboxes), 1)
    self.assertEqual(address.mailboxes, address.all_mailboxes)
    # The display name carries the joined text, with no stray space.
    self.assertEqual(address.mailboxes[0].display_name, 'Bérénice')
    self.assertEqual(address[0].token_type, 'mailbox')

def test_get_address_empty_group(self):
address = self._test_get_x(parser.get_address,
'Monty Python:;',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix a bug in the parsing of email address headers that could result in
extraneous spaces in the decoded text when using a modern email policy.
Whitespace between adjacent RFC 2047 encoded-words is now ignored, per
RFC 2047 section 6.2 (and consistent with the existing parsing of
unstructured headers like *Subject*).
Loading