Better support for "funny" characters in subject prefixes.

encode_p(): Removed. _isunicode(): New helper for unicode type testing. prefix_subject(): Better algorithm for handling unicode subject prefixes. Now we always set the Subject header to a Header instance if we're adding the prefix. We always convert the prefix and all previous Subject header bits to unicode (possibly with character replacements) so that the Header class does the right thing, i.e. it tries us-ascii first, then the charset hint, then utf-8 when encoding. Because of this, if the list's charset is us-ascii, we'll substitute iso-8859-1 for slightly wider character coverage.
author: bwarsaw 2002-10-02 14:01:55 +0000
committer: bwarsaw 2002-10-02 14:01:55 +0000
commit: 341386bbfb78fb23d09cfbc280b32d14a80e32e3 (patch)
tree: e91faad3e15b95fd988d74fa51fac1706941bca6 /Mailman
parent: 6b457c295f03acbd4df4a2f730ca94601936467e (diff)
download: mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.tar.gz
mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.tar.zst
mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.zip
1 files changed, 32 insertions, 42 deletions
diff --git a/Mailman/Handlers/CookHeaders.py b/Mailman/Handlers/CookHeaders.py
index 55ed5ac4d..c7e0f74a6 100644
--- a/Mailman/Handlers/CookHeaders.py
+++ b/Mailman/Handlers/CookHeaders.py
@@ -19,6 +19,7 @@
 
 from __future__ import nested_scopes
 import re
+from types import UnicodeType
 
 from email.Charset import Charset
 from email.Header import Header, decode_header
@@ -35,6 +36,11 @@ MAXLINELEN = 78
 
 
 
+def _isunicode(s):
+    return isinstance(s, UnicodeType)
+
+
+
 def process(mlist, msg, msgdata):
     # Set the "X-Ack: no" header if noack flag is set.
     if msgdata.get('noack'):
@@ -163,29 +169,6 @@ def process(mlist, msg, msgdata):
 
 
 
-def encode_p(mlist, subject, prefix):
-    # Decide whether we're going to encode the prefix or not
-    if mlist.encode_ascii_prefixes == 1:
-        # Always encode
-        return 1
-    try:
-        prefix.encode('us-ascii')
-    except UnicodeError:
-        # There are non-ASCII characters in the prefix, so we must encode it
-        return 1
-    if mlist.encode_ascii_prefixes == 0:
-        # Never encode if the prefix is ASCII
-        return 0
-    # What's left is `As needed' encoding.  Meaning, we'll only encode the
-    # prefix if the Subject: header contains non-ASCII characters.  Note that
-    # subject might be a Header instance, so str()-ify it first.
-    try:
-        str(subject).encode('us-ascii')
-    except UnicodeError:
-        return 1
-    return 0
-
-
 def prefix_subject(mlist, msg, msgdata):
     # Add the subject prefix unless the message is a digest or is being fast
     # tracked (e.g. internally crafted, delivered to a single user such as the
@@ -198,27 +181,34 @@ def prefix_subject(mlist, msg, msgdata):
     # and each word of the prefix is encoded in a different chunk in the
     # header, we won't find it.  I think in practice that's unlikely though.
     headerbits = decode_header(subject)
-    has_prefix = 0
     if prefix and subject:
         pattern = re.escape(prefix.strip())
         for decodedsubj, charset in headerbits:
             if re.search(pattern, decodedsubj, re.IGNORECASE):
-                has_prefix = 1
-    charset = Charset(Utils.GetCharSet(mlist.preferred_language))
-    # We purposefully leave no space b/w prefix and subject!
+                # The subject's already got the prefix, so don't change it
+                return
+    del msg['subject']
     if not subject:
-        del msg['subject']
-        h = Header(prefix, charset, header_name='Subject')
-        h.append(_('(no subject)'), charset)
-        msg['Subject'] = h
-    elif prefix and not has_prefix:
-        del msg['subject']
-        if encode_p(mlist, subject, prefix):
-            # We'll encode the new prefix (just in case) but leave the old
-            # subject alone, in case it was already encoded.
-            h = Header(prefix, charset, 128, header_name='Subject')
-        else:
-            h = Header(prefix, header_name='Subject')
-        for s, c in headerbits:
-            h.append(s, c)
-        msg['Subject'] = h
+        subject = _('(no subject)')
+    # Get the charset to encode the prefix in.  If this is us-ascii, we'll use
+    # iso-8859-1 instead, just to get a little extra coverage, and because the
+    # Header class tries us-ascii first anyway.
+    charset = Utils.GetCharSet(mlist.preferred_language)
+    if charset == 'us-ascii':
+        charset = 'iso-8859-1'
+    charset = Charset(charset)
+    # Convert the prefix to unicode so Header will do the 3-charset encoding.
+    # If prefix is a byte string and there are funky characters in it that
+    # don't match the charset, we might as well replace them now.
+    if not _isunicode(prefix):
+        prefix = unicode(prefix, charset.get_output_charset(), 'replace')
+    # We purposefully leave no space b/w prefix and subject!
+    h = Header(prefix, charset, header_name='Subject')
+    for s, c in headerbits:
+        # Once again, convert the string to unicode.
+        if c is None:
+            c = 'iso-8859-1'
+        if not _isunicode(s):
+            s = unicode(s, c, 'replace')
+        h.append(s, c)
+    msg['Subject'] = h
author	bwarsaw	2002-10-02 14:01:55 +0000
committer	bwarsaw	2002-10-02 14:01:55 +0000
commit	341386bbfb78fb23d09cfbc280b32d14a80e32e3 (patch)
tree	e91faad3e15b95fd988d74fa51fac1706941bca6 /Mailman
parent	6b457c295f03acbd4df4a2f730ca94601936467e (diff)
download	mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.tar.gz mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.tar.zst mailman-341386bbfb78fb23d09cfbc280b32d14a80e32e3.zip