author: bwarsaw 2003-01-20 02:12:45 +0000
committer: bwarsaw 2003-01-20 02:12:45 +0000
commit: fdbf0aad277d5b47be35ccecfb809e9cf4a9827e (patch)
tree: d1dc5b2b782155bd1f2e43e28fe2e2af8f7645ca /Mailman/Handlers/Scrubber.py
parent: 4796210314419fa48db20c255002c0b04aa56dcd (diff)
download: mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.gz
mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.zst
mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.zip
1 files changed, 56 insertions, 18 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index aa993af7e..3ba64b945 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -17,6 +17,8 @@
 """Cleanse a message for archiving.
 """
 
+from __future__ import nested_scopes
+
 import os
 import re
 import sha
@@ -24,7 +26,6 @@ import time
 import errno
 import binascii
 import tempfile
-import mimetypes
 from cStringIO import StringIO
 from types import IntType
 
@@ -51,6 +52,34 @@ dre = re.compile(r'^\.*')
 BR = '<br>\n'
 SPACE = ' '
 
+try:
+    from mimetypes import guess_all_extensions
+except ImportError:
+    import mimetypes
+    def guess_all_extensions(ctype, strict=1):
+        # BAW: guess_all_extensions() is new in Python 2.3; emulate it here.
+        all = []
+        def check(map):
+            for e, t in map.items():
+                if t == ctype:
+                    all.append(e)
+        check(mimetypes.types_map)
+        if not strict:
+            check(mimetypes.common_types)
+        return all
+
+
+
+def guess_extension(ctype, ext):
+    # mimetypes maps several extensions to one type (e.g. .doc, .dot and .wiz
+    # all map to application/msword), so the reverse mapping is ambiguous.
+    # Trust ext if it is a known mapping for ctype, else guess the first one.
+    # Return None for an unmapped type instead of raising IndexError.
+    exts = guess_all_extensions(ctype, strict=0)
+    if ext in exts:
+        return ext
+    return exts and exts[0] or None
+
 
 
 # We're using a subclass of the standard Generator because we want to suppress
@@ -131,6 +160,7 @@ def process(mlist, msg, msgdata=None):
         msgdata = {}
     dir = calculate_attachments_dir(mlist, msg, msgdata)
     charset = None
+    lcset = Utils.GetCharSet(mlist.preferred_language)
     # Now walk over all subparts of this message and scrub out various types
     for part in msg.walk():
         ctype = part.get_type(part.get_default_type())
@@ -140,13 +170,16 @@ def process(mlist, msg, msgdata=None):
             # arbitrarily pick the charset of the first text/plain part in the
             # message.
             if charset is None:
-                charset = part.get_content_charset(charset)
+                charset = part.get_content_charset(lcset)
         elif ctype == 'text/html' and isinstance(sanitize, IntType):
             if sanitize == 0:
                 if outer:
                     raise DiscardMessage
-                part.set_payload(_('HTML attachment scrubbed and removed'))
-                part.set_type('text/plain')
+                del part['content-type']
+                part.set_payload(_('HTML attachment scrubbed and removed'),
+                                 # Adding charset arg and removing content-type
+                                 # sets content-type to text/plain
+                                 lcset)
             elif sanitize == 2:
                 # By leaving it alone, Pipermail will automatically escape it
                 pass
@@ -159,11 +192,11 @@ def process(mlist, msg, msgdata=None):
                     url = save_attachment(mlist, part, dir, filter_html=0)
                 finally:
                     os.umask(omask)
+                del part['content-type']
                 part.set_payload(_("""\
 An HTML attachment was scrubbed...
 URL: %(url)s
-"""))
-                part.set_type('text/plain')
+"""), lcset)
             else:
                 # HTML-escape it and store it as an attachment, but make it
                 # look a /little/ bit prettier. :(
@@ -185,11 +218,11 @@ URL: %(url)s
                     url = save_attachment(mlist, part, dir, filter_html=0)
                 finally:
                     os.umask(omask)
+                del part['content-type']
                 part.set_payload(_("""\
 An HTML attachment was scrubbed...
 URL: %(url)s
-"""))
-                part.set_type('text/plain')
+"""), lcset)
         elif ctype == 'message/rfc822':
             # This part contains a submessage, so it too needs scrubbing
             submsg = part.get_payload(0)
@@ -202,6 +235,7 @@ URL: %(url)s
             date = submsg.get('date', _('no date'))
             who = submsg.get('from', _('unknown sender'))
             size = len(str(submsg))
+            del part['content-type']
             part.set_payload(_("""\
 An embedded message was scrubbed...
 From: %(who)s
@@ -209,13 +243,12 @@ Subject: %(subject)s
 Date: %(date)s
 Size: %(size)s
 Url: %(url)s
-"""))
-            part.set_type('text/plain')
+"""), lcset)
         # If the message isn't a multipart, then we'll strip it out as an
         # attachment that would have to be separately downloaded.  Pipermail
         # will transform the url into a hyperlink.
         elif not part.is_multipart():
-            payload = part.get_payload()
+            payload = part.get_payload(decode=1)
             ctype = part.get_type()
             size = len(payload)
             omask = os.umask(002)
@@ -225,6 +258,8 @@ Url: %(url)s
                 os.umask(omask)
             desc = part.get('content-description', _('not available'))
             filename = part.get_filename(_('not available'))
+            del part['content-type']
+            del part['content-transfer-encoding']
             part.set_payload(_("""\
 A non-text attachment was scrubbed...
 Name: %(filename)s
@@ -232,8 +267,7 @@ Type: %(ctype)s
 Size: %(size)d bytes
 Desc: %(desc)s
 Url : %(url)s
-"""))
-            part.set_type('text/plain')
+"""), lcset)
         outer = 0
     # We still have to sanitize multipart messages to flat text because
     # Pipermail can't handle messages with list payloads.  This is a kludge;
@@ -242,8 +276,8 @@ Url : %(url)s
         # By default we take the charset of the first text/plain part in the
         # message, but if there was none, we'll use the list's preferred
         # language's charset.
-        if charset is None:
-            charset = Utils.GetCharSet(mlist.preferred_language)
+        if charset is None or charset == 'us-ascii':
+            charset = lcset
         # We now want to concatenate all the parts which have been scrubbed to
         # text/plain, into a single text/plain payload.  We need to make sure
         # all the characters in the concatenated string are in the same
@@ -261,7 +295,7 @@ Url : %(url)s
                 t = part.get_payload(decode=1)
             except binascii.Error:
                 t = part.get_payload()
-            partcharset = part.get_charset()
+            partcharset = part.get_content_charset()
             if partcharset and partcharset <> charset:
                 try:
                     t = unicode(t, partcharset, 'replace')
@@ -270,11 +304,14 @@ Url : %(url)s
                 except UnicodeError:
                     # Replace funny characters
                     t = unicode(t, 'ascii', 'replace').encode('ascii')
+            # Separation is useful
+            if t[-1] <> '\n':
+                t += '\n'
             text.append(t)
         # Now join the text and set the payload
         sep = _('-------------- next part --------------\n')
+        del msg['content-type']
         msg.set_payload(sep.join(text), charset)
-        msg.set_type('text/plain')
         del msg['content-transfer-encoding']
         msg.add_header('Content-Transfer-Encoding', '8bit')
     return msg
@@ -304,7 +341,8 @@ def save_attachment(mlist, msg, dir, filter_html=1):
     # e.g. image/jpg (should be image/jpeg).  For now we just store such
     # things as application/octet-streams since that seems the safest.
     ctype = msg.get_content_type()
-    ext = mimetypes.guess_extension(ctype)
+    fnext = os.path.splitext(msg.get_filename(''))[1]
+    ext = guess_extension(ctype, fnext)
     if not ext:
         # We don't know what it is, so assume it's just a shapeless
         # application/octet-stream, unless the Content-Type: is
author	bwarsaw	2003-01-20 02:12:45 +0000
committer	bwarsaw	2003-01-20 02:12:45 +0000
commit	fdbf0aad277d5b47be35ccecfb809e9cf4a9827e (patch)
tree	d1dc5b2b782155bd1f2e43e28fe2e2af8f7645ca /Mailman/Handlers/Scrubber.py
parent	4796210314419fa48db20c255002c0b04aa56dcd (diff)
download	mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.gz mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.zst mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.zip