File CVE-2023-27043-email-parsing-errors.patch of Package python-doc.32348

---
 Doc/library/email.utils.rst                                             |   19 -
 Lib/email/utils.py                                                      |  151 +++++++-
 Lib/test/test_email/test_email.py                                       |  187 +++++++++-
 Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst |    8 
 4 files changed, 344 insertions(+), 21 deletions(-)

--- a/Doc/library/email.utils.rst
+++ b/Doc/library/email.utils.rst
@@ -21,13 +21,18 @@ There are several useful utilities provi
    begins with angle brackets, they are stripped off.
 
 
-.. function:: parseaddr(address)
+.. function:: parseaddr(address, strict=True)
 
    Parse address -- which should be the value of some address-containing field such
    as :mailheader:`To` or :mailheader:`Cc` -- into its constituent *realname* and
    *email address* parts.  Returns a tuple of that information, unless the parse
    fails, in which case a 2-tuple of ``('', '')`` is returned.
 
+   If *strict* is true, use a strict parser which rejects malformed inputs.
+
+   .. versionchanged:: 3.13
+      Add *strict* optional parameter and reject malformed inputs by default.
+
 
 .. function:: formataddr(pair)
 
@@ -37,12 +42,15 @@ There are several useful utilities provi
    second element is returned unmodified.
 
 
-.. function:: getaddresses(fieldvalues)
+.. function:: getaddresses(fieldvalues, strict=True)
 
    This method returns a list of 2-tuples of the form returned by ``parseaddr()``.
    *fieldvalues* is a sequence of header field values as might be returned by
-   :meth:`Message.get_all <email.message.Message.get_all>`.  Here's a simple
-   example that gets all the recipients of a message::
+   :meth:`Message.get_all <email.message.Message.get_all>`.
+
+   If *strict* is true, use a strict parser which rejects malformed inputs.
+
+   Here's a simple example that gets all the recipients of a message::
 
       from email.utils import getaddresses
 
@@ -52,6 +60,9 @@ There are several useful utilities provi
       resent_ccs = msg.get_all('resent-cc', [])
       all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)
 
+   .. versionchanged:: 3.13
+      Add *strict* optional parameter and reject malformed inputs by default.
+
 
 .. function:: parsedate(date)
 
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -100,15 +100,93 @@ def formataddr(pair):
     return address
 
 
-
-def getaddresses(fieldvalues):
-    """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
-    all = COMMASPACE.join(fieldvalues)
-    a = _AddressList(all)
-    return a.addresslist
+def _iter_escaped_chars(addr):
+    pos = 0
+    escape = False
+    for pos, ch in enumerate(addr):
+        if escape:
+            yield (pos, '\\' + ch)
+            escape = False
+        elif ch == '\\':
+            escape = True
+        else:
+            yield (pos, ch)
+    if escape:
+        yield (pos, '\\')
+
+
+def _strip_quoted_realnames(addr):
+    """Strip real names between quotes."""
+    if '"' not in addr:
+        # Fast path
+        return addr
+
+    start = 0
+    open_pos = None
+    result = []
+    for pos, ch in _iter_escaped_chars(addr):
+        if ch == '"':
+            if open_pos is None:
+                open_pos = pos
+            else:
+                if start != open_pos:
+                    result.append(addr[start:open_pos])
+                start = pos + 1
+                open_pos = None
+
+    if start < len(addr):
+        result.append(addr[start:])
+
+    return ''.join(result)
+
+
+supports_strict_parsing = True
+
+def getaddresses(fieldvalues, strict=True):
+    """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue.
+
+    When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in
+    its place.
+
+    If strict is true, use a strict parser which rejects malformed inputs.
+    """
+
+    # If strict is true, if the resulting list of parsed addresses is greater
+    # than the number of fieldvalues in the input list, a parsing error has
+    # occurred and consequently a list containing a single empty 2-tuple [('',
+    # '')] is returned in its place. This is done to avoid invalid output.
+    #
+    # Malformed input: getaddresses(['alice@example.com <bob@example.com>'])
+    # Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')]
+    # Safe output: [('', '')]
+
+    if not strict:
+        all = COMMASPACE.join(unicode(v) for v in fieldvalues)
+        a = _AddressList(all)
+        return a.addresslist
+
+    fieldvalues = [unicode(v) for v in fieldvalues]
+    fieldvalues = _pre_parse_validation(fieldvalues)
+    addr = COMMASPACE.join(fieldvalues)
+    a = _AddressList(addr)
+    result = _post_parse_validation(a.addresslist)
+
+    # Treat output as invalid if the number of addresses is not equal to the
+    # expected number of addresses.
+    n = 0
+    for v in fieldvalues:
+        # When a comma is used in the Real Name part it is not a deliminator.
+        # So strip those out before counting the commas.
+        v = _strip_quoted_realnames(v)
+        # Expected number of addresses: 1 + number of commas
+        n += 1 + v.count(',')
+    if len(result) != n:
+        return [('', '')]
+
+    return result
+
 
 
-
 ecre = re.compile(r'''
   =\?                   # literal =?
   (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
@@ -210,19 +288,74 @@ def parsedate_tz(data):
     return _parsedate_tz(data)
 
 
-def parseaddr(addr):
+def parseaddr(addr, strict=True):
     """
     Parse addr into its constituent realname and email address parts.
 
     Return a tuple of realname and email address, unless the parse fails, in
     which case return a 2-tuple of ('', '').
+
+    If strict is True, use a strict parser which rejects malformed inputs.
     """
-    addrs = _AddressList(addr).addresslist
-    if not addrs:
-        return '', ''
+
+    if not strict:
+        addrs = _AddressList(addr).addresslist
+        if not addrs:
+            return ('', '')
+        return addrs[0]
+
+    if isinstance(addr, list):
+        addr = addr[0]
+
+    if not isinstance(addr, str):
+        return ('', '')
+
+    addr = _pre_parse_validation([addr])[0]
+    addrs = _post_parse_validation(_AddressList(addr).addresslist)
+
+    if not addrs or len(addrs) > 1:
+        return ('', '')
+
     return addrs[0]
 
 
+def _check_parenthesis(addr):
+    # Ignore parenthesis in quoted real names.
+    addr = _strip_quoted_realnames(addr)
+
+    opens = 0
+    for pos, ch in _iter_escaped_chars(addr):
+        if ch == '(':
+            opens += 1
+        elif ch == ')':
+            opens -= 1
+            if opens < 0:
+                return False
+    return (opens == 0)
+
+
+def _pre_parse_validation(email_header_fields):
+    accepted_values = []
+    for v in email_header_fields:
+        if not _check_parenthesis(v):
+            v = "('', '')"
+        accepted_values.append(v)
+
+    return accepted_values
+
+
+def _post_parse_validation(parsed_email_header_tuples):
+    accepted_values = []
+    # The parser would have parsed a correctly formatted domain-literal
+    # The existence of an [ after parsing indicates a parsing failure
+    for v in parsed_email_header_tuples:
+        if '[' in v[1]:
+            v = ('', '')
+        accepted_values.append(v)
+
+    return accepted_values
+
+
 # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
 def unquote(str):
     """Remove quotes from a string."""
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst
@@ -0,0 +1,8 @@
+:func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now
+return ``('', '')`` 2-tuples in more situations where invalid email
+addresses are encountered instead of potentially inaccurate values. Add
+optional *strict* parameter to these two functions: use ``strict=False`` to
+get the old behavior, accept malformed inputs.
+``getattr(email.utils, 'supports_strict_parsing', False)`` can be use to check
+if the *strict* paramater is available. Patch by Thomas Dwyer and Victor
+Stinner to improve the CVE-2023-27043 fix.
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (C) 2001-2010 Python Software Foundation
 # Contact: email-sig@python.org
 # email package unit tests
@@ -2414,15 +2415,135 @@ Foo
            [('Al Person', 'aperson@dom.ain'),
             ('Bud Person', 'bperson@dom.ain')])
 
+    def test_parsing_errors(self):
+        """Test for parsing errors from CVE-2023-27043 and CVE-2019-16056"""
+        alice = 'alice@example.org'
+        bob = 'bob@example.com'
+        empty = ('', '')
+
+        # Test Utils.getaddresses() and Utils.parseaddr() on malformed email
+        # addresses: default behavior (strict=True) rejects malformed address,
+        # and strict=False which tolerates malformed address.
+        for invalid_separator, expected_non_strict in (
+            ('(', [('<%s>' % bob, alice)]),
+            (')', [('', alice), empty, ('', bob)]),
+            ('<', [('', alice), empty, ('', bob), empty]),
+            ('>', [('', alice), empty, ('', bob)]),
+            ('[', [('', '%s[<%s>]' % (alice, bob))]),
+            (']', [('', alice), empty, ('', bob)]),
+            ('@', [empty, empty, ('', bob)]),
+            (';', [('', alice), empty, ('', bob)]),
+            (':', [('', alice), ('', bob)]),
+            ('.', [('', alice + '.'), ('', bob)]),
+            ('"', [('', alice), ('', '<%s>' % bob)]),
+        ):
+            address = '%s%s<%s>' % (alice, invalid_separator, bob)
+            self.assertEqual(Utils.getaddresses([address]),
+                             [empty])
+            self.assertEqual(Utils.getaddresses([address], strict=False),
+                             expected_non_strict)
+
+            self.assertEqual(Utils.parseaddr([address]),
+                             empty)
+            self.assertEqual(Utils.parseaddr([address], strict=False),
+                             ('', address))
+
+        # Comma (',') is treated differently depending on strict parameter.
+        # Comma without quotes.
+        address = '%s,<%s>' % (alice, bob)
+        self.assertEqual(Utils.getaddresses([address]),
+                         [('', alice), ('', bob)])
+        self.assertEqual(Utils.getaddresses([address], strict=False),
+                         [('', alice), ('', bob)])
+        self.assertEqual(Utils.parseaddr([address]),
+                         empty)
+        self.assertEqual(Utils.parseaddr([address], strict=False),
+                         ('', address))
+
+        # Real name between quotes containing comma.
+        address = '"Alice, alice@example.org" <bob@example.com>'
+        expected_strict = ('Alice, alice@example.org', 'bob@example.com')
+        self.assertEqual(Utils.getaddresses([address]), [expected_strict])
+        self.assertEqual(Utils.getaddresses([address], strict=False), [expected_strict])
+        self.assertEqual(Utils.parseaddr([address]), expected_strict)
+        self.assertEqual(Utils.parseaddr([address], strict=False),
+                         ('', address))
+
+        # Valid parenthesis in comments.
+        address = 'alice@example.org (Alice)'
+        expected_strict = ('Alice', 'alice@example.org')
+        self.assertEqual(Utils.getaddresses([address]), [expected_strict])
+        self.assertEqual(Utils.getaddresses([address], strict=False), [expected_strict])
+        self.assertEqual(Utils.parseaddr([address]), expected_strict)
+        self.assertEqual(Utils.parseaddr([address], strict=False),
+                         ('', address))
+
+        # Invalid parenthesis in comments.
+        address = 'alice@example.org )Alice('
+        self.assertEqual(Utils.getaddresses([address]), [empty])
+        self.assertEqual(Utils.getaddresses([address], strict=False),
+                         [('', 'alice@example.org'), ('', ''), ('', 'Alice')])
+        self.assertEqual(Utils.parseaddr([address]), empty)
+        self.assertEqual(Utils.parseaddr([address], strict=False),
+                         ('', address))
+
+        # Two addresses with quotes separated by comma.
+        address = '"Jane Doe" <jane@example.net>, "John Doe" <john@example.net>'
+        self.assertEqual(Utils.getaddresses([address]),
+                         [('Jane Doe', 'jane@example.net'),
+                          ('John Doe', 'john@example.net')])
+        self.assertEqual(Utils.getaddresses([address], strict=False),
+                         [('Jane Doe', 'jane@example.net'),
+                          ('John Doe', 'john@example.net')])
+        self.assertEqual(Utils.parseaddr([address]), empty)
+        self.assertEqual(Utils.parseaddr([address], strict=False),
+                         ('', address))
+
+        # Test Utils.supports_strict_parsing attribute
+        self.assertEqual(Utils.supports_strict_parsing, True)
+
     def test_getaddresses_nasty(self):
-        eq = self.assertEqual
-        eq(Utils.getaddresses(['foo: ;']), [('', '')])
-        eq(Utils.getaddresses(
-           ['[]*-- =~$']),
-           [('', ''), ('', ''), ('', '*--')])
-        eq(Utils.getaddresses(
-           ['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>']),
-           [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')])
+        for addresses, expected in (
+            ([u'"Sürname, Firstname" <to@example.com>'],
+             [(u'Sürname, Firstname', 'to@example.com')]),
+
+            (['foo: ;'],
+             [('', '')]),
+
+            (['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>'],
+             [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]),
+
+            ([r'Pete(A nice \) chap) <pete(his account)@silly.test(his host)>'],
+             [('Pete (A nice ) chap his account his host)', 'pete@silly.test')]),
+
+            (['(Empty list)(start)Undisclosed recipients  :(nobody(I know))'],
+             [('', '')]),
+
+            (['Mary <@machine.tld:mary@example.net>, , jdoe@test   . example'],
+             [('Mary', 'mary@example.net'), ('', ''), ('', 'jdoe@test.example')]),
+
+            (['John Doe <jdoe@machine(comment).  example>'],
+             [('John Doe (comment)', 'jdoe@machine.example')]),
+
+            (['"Mary Smith: Personal Account" <smith@home.example>'],
+             [('Mary Smith: Personal Account', 'smith@home.example')]),
+
+            (['Undisclosed recipients:;'],
+             [('', '')]),
+
+            ([r'<boss@nil.test>, "Giant; \"Big\" Box" <bob@example.net>'],
+             [('', 'boss@nil.test'), ('Giant; "Big" Box', 'bob@example.net')]),
+        ):
+            self.assertEqual(Utils.getaddresses(addresses),
+                             expected)
+            self.assertEqual(Utils.getaddresses(addresses, strict=False),
+                             expected)
+
+        addresses = ['[]*-- =~$']
+        self.assertEqual(Utils.getaddresses(addresses),
+                         [('', '')])
+        self.assertEqual(Utils.getaddresses(addresses, strict=False),
+                         [('', ''), ('', ''), ('', '*--')])
 
     def test_getaddresses_embedded_comment(self):
         """Test proper handling of a nested comment"""
@@ -2430,6 +2551,54 @@ Foo
         addrs = Utils.getaddresses(['User ((nested comment)) <foo@bar.com>'])
         eq(addrs[0][1], 'foo@bar.com')
 
+    def test_iter_escaped_chars(self):
+        self.assertEqual(list(Utils._iter_escaped_chars(r'a\\b\"c\\"d')),
+                         [(0, 'a'),
+                          (2, '\\\\'),
+                          (3, 'b'),
+                          (5, '\\"'),
+                          (6, 'c'),
+                          (8, '\\\\'),
+                          (9, '"'),
+                          (10, 'd')])
+        self.assertEqual(list(Utils._iter_escaped_chars('a\\')),
+                         [(0, 'a'), (1, '\\')])
+
+    def test_strip_quoted_realnames(self):
+        def check(addr, expected):
+            self.assertEqual(Utils._strip_quoted_realnames(addr), expected)
+
+        check('"Jane Doe" <jane@example.net>, "John Doe" <john@example.net>',
+              ' <jane@example.net>,  <john@example.net>')
+        check(r'"Jane \"Doe\"." <jane@example.net>',
+              ' <jane@example.net>')
+
+        # special cases
+        check(r'before"name"after', 'beforeafter')
+        check(r'before"name"', 'before')
+        check(r'b"name"', 'b')  # single char
+        check(r'"name"after', 'after')
+        check(r'"name"a', 'a')  # single char
+        check(r'"name"', '')
+
+        # no change
+        for addr in (
+            'Jane Doe <jane@example.net>, John Doe <john@example.net>',
+            'lone " quote',
+        ):
+            self.assertEqual(Utils._strip_quoted_realnames(addr), addr)
+
+    def test_check_parenthesis(self):
+        addr = 'alice@example.net'
+        self.assertTrue(Utils._check_parenthesis('%s (Alice)' % addr))
+        self.assertFalse(Utils._check_parenthesis('%s )Alice(' % addr))
+        self.assertFalse(Utils._check_parenthesis('%s (Alice))' % addr))
+        self.assertFalse(Utils._check_parenthesis('%s ((Alice)' % addr))
+
+        # Ignore real name between quotes
+        self.assertTrue(Utils._check_parenthesis('")Alice((" %s' % addr))
+
+
     def test_make_msgid_collisions(self):
         # Test make_msgid uniqueness, even with multiple threads
         class MsgidsThread(Thread):
--- a/Lib/email/test/test_email_renamed.py
+++ b/Lib/email/test/test_email_renamed.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (C) 2001-2007 Python Software Foundation
 # Contact: email-sig@python.org
 # email package unit tests
@@ -2276,14 +2277,47 @@ Foo
             ('Bud Person', 'bperson@dom.ain')])
 
     def test_getaddresses_nasty(self):
-        eq = self.assertEqual
-        eq(utils.getaddresses(['foo: ;']), [('', '')])
-        eq(utils.getaddresses(
-           ['[]*-- =~$']),
-           [('', ''), ('', ''), ('', '*--')])
-        eq(utils.getaddresses(
-           ['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>']),
-           [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')])
+        for addresses, expected in (
+            ([u'"Sürname, Firstname" <to@example.com>'],
+             [(u'Sürname, Firstname', 'to@example.com')]),
+
+            (['foo: ;'],
+             [('', '')]),
+
+            (['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>'],
+             [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]),
+
+            ([r'Pete(A nice \) chap) <pete(his account)@silly.test(his host)>'],
+             [('Pete (A nice ) chap his account his host)', 'pete@silly.test')]),
+
+            (['(Empty list)(start)Undisclosed recipients  :(nobody(I know))'],
+             [('', '')]),
+
+            (['Mary <@machine.tld:mary@example.net>, , jdoe@test   . example'],
+             [('Mary', 'mary@example.net'), ('', ''), ('', 'jdoe@test.example')]),
+
+            (['John Doe <jdoe@machine(comment).  example>'],
+             [('John Doe (comment)', 'jdoe@machine.example')]),
+
+            (['"Mary Smith: Personal Account" <smith@home.example>'],
+             [('Mary Smith: Personal Account', 'smith@home.example')]),
+
+            (['Undisclosed recipients:;'],
+             [('', '')]),
+
+            ([r'<boss@nil.test>, "Giant; \"Big\" Box" <bob@example.net>'],
+             [('', 'boss@nil.test'), ('Giant; "Big" Box', 'bob@example.net')]),
+        ):
+            self.assertEqual(utils.getaddresses(addresses),
+                             expected)
+            self.assertEqual(utils.getaddresses(addresses, strict=False),
+                             expected)
+
+        addresses = ['[]*-- =~$']
+        self.assertEqual(utils.getaddresses(addresses),
+                         [('', '')])
+        self.assertEqual(utils.getaddresses(addresses, strict=False),
+                         [('', ''), ('', ''), ('', '*--')])
 
     def test_getaddresses_embedded_comment(self):
         """Test proper handling of a nested comment"""
openSUSE Build Service is sponsored by