diff options
| author | bwarsaw | 2003-01-20 02:12:45 +0000 |
|---|---|---|
| committer | bwarsaw | 2003-01-20 02:12:45 +0000 |
| commit | fdbf0aad277d5b47be35ccecfb809e9cf4a9827e (patch) | |
| tree | d1dc5b2b782155bd1f2e43e28fe2e2af8f7645ca /Mailman/Handlers/Scrubber.py | |
| parent | 4796210314419fa48db20c255002c0b04aa56dcd (diff) | |
| download | mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.gz mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.tar.zst mailman-fdbf0aad277d5b47be35ccecfb809e9cf4a9827e.zip | |
Diffstat (limited to 'Mailman/Handlers/Scrubber.py')
| -rw-r--r-- | Mailman/Handlers/Scrubber.py | 74 |
1 files changed, 56 insertions, 18 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index aa993af7e..3ba64b945 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -17,6 +17,8 @@ """Cleanse a message for archiving. """ +from __future__ import nested_scopes + import os import re import sha @@ -24,7 +26,6 @@ import time import errno import binascii import tempfile -import mimetypes from cStringIO import StringIO from types import IntType @@ -51,6 +52,34 @@ dre = re.compile(r'^\.*') BR = '<br>\n' SPACE = ' ' +try: + from mimetypes import guess_all_extensions +except ImportError: + import mimetypes + def guess_all_extensions(ctype, strict=1): + # BAW: sigh, guess_all_extensions() is new in Python 2.3 + all = [] + def check(map): + for e, t in map.items(): + if t == ctype: + all.append(e) + check(mimetypes.types_map) + if not strict: + check(mimetypes.common_types) + return all + + + +def guess_extension(ctype, ext): + # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, + # and .wiz are all mapped to application/msword. This sucks for finding + # the best reverse mapping. If the extension is one of the giving + # mappings, we'll trust that, otherwise we'll just guess. :/ + all = guess_all_extensions(ctype, strict=0) + if ext in all: + return ext + return all[0] + # We're using a subclass of the standard Generator because we want to suppress @@ -131,6 +160,7 @@ def process(mlist, msg, msgdata=None): msgdata = {} dir = calculate_attachments_dir(mlist, msg, msgdata) charset = None + lcset = Utils.GetCharSet(mlist.preferred_language) # Now walk over all subparts of this message and scrub out various types for part in msg.walk(): ctype = part.get_type(part.get_default_type()) @@ -140,13 +170,16 @@ def process(mlist, msg, msgdata=None): # arbitrarily pick the charset of the first text/plain part in the # message. if charset is None: - charset = part.get_content_charset(charset) + charset = part.get_content_charset(lcset) elif ctype == 'text/html' and isinstance(sanitize, IntType): if sanitize == 0: if outer: raise DiscardMessage - part.set_payload(_('HTML attachment scrubbed and removed')) - part.set_type('text/plain') + del part['content-type'] + part.set_payload(_('HTML attachment scrubbed and removed'), + # Adding charset arg and removing content-tpe + # sets content-type to text/plain + lcset) elif sanitize == 2: # By leaving it alone, Pipermail will automatically escape it pass @@ -159,11 +192,11 @@ def process(mlist, msg, msgdata=None): url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) + del part['content-type'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) else: # HTML-escape it and store it as an attachment, but make it # look a /little/ bit prettier. :( @@ -185,11 +218,11 @@ URL: %(url)s url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) + del part['content-type'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) elif ctype == 'message/rfc822': # This part contains a submessage, so it too needs scrubbing submsg = part.get_payload(0) @@ -202,6 +235,7 @@ URL: %(url)s date = submsg.get('date', _('no date')) who = submsg.get('from', _('unknown sender')) size = len(str(submsg)) + del part['content-type'] part.set_payload(_("""\ An embedded message was scrubbed... From: %(who)s @@ -209,13 +243,12 @@ Subject: %(subject)s Date: %(date)s Size: %(size)s Url: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) # If the message isn't a multipart, then we'll strip it out as an # attachment that would have to be separately downloaded. Pipermail # will transform the url into a hyperlink. elif not part.is_multipart(): - payload = part.get_payload() + payload = part.get_payload(decode=1) ctype = part.get_type() size = len(payload) omask = os.umask(002) @@ -225,6 +258,8 @@ Url: %(url)s os.umask(omask) desc = part.get('content-description', _('not available')) filename = part.get_filename(_('not available')) + del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_("""\ A non-text attachment was scrubbed... Name: %(filename)s @@ -232,8 +267,7 @@ Type: %(ctype)s Size: %(size)d bytes Desc: %(desc)s Url : %(url)s -""")) - part.set_type('text/plain') +"""), lcset) outer = 0 # We still have to sanitize multipart messages to flat text because # Pipermail can't handle messages with list payloads. This is a kludge; @@ -242,8 +276,8 @@ Url : %(url)s # By default we take the charset of the first text/plain part in the # message, but if there was none, we'll use the list's preferred # language's charset. - if charset is None: - charset = Utils.GetCharSet(mlist.preferred_language) + if charset is None or charset == 'us-ascii': + charset = lcset # We now want to concatenate all the parts which have been scrubbed to # text/plain, into a single text/plain payload. We need to make sure # all the characters in the concatenated string are in the same @@ -261,7 +295,7 @@ Url : %(url)s t = part.get_payload(decode=1) except binascii.Error: t = part.get_payload() - partcharset = part.get_charset() + partcharset = part.get_content_charset() if partcharset and partcharset <> charset: try: t = unicode(t, partcharset, 'replace') @@ -270,11 +304,14 @@ Url : %(url)s except UnicodeError: # Replace funny characters t = unicode(t, 'ascii', 'replace').encode('ascii') + # Separation is useful + if t[-1] <> '\n': + t += '\n' text.append(t) # Now join the text and set the payload sep = _('-------------- next part --------------\n') + del msg['content-type'] msg.set_payload(sep.join(text), charset) - msg.set_type('text/plain') del msg['content-transfer-encoding'] msg.add_header('Content-Transfer-Encoding', '8bit') return msg @@ -304,7 +341,8 @@ def save_attachment(mlist, msg, dir, filter_html=1): # e.g. image/jpg (should be image/jpeg). For now we just store such # things as application/octet-streams since that seems the safest. ctype = msg.get_content_type() - ext = mimetypes.guess_extension(ctype) + fnext = os.path.splitext(msg.get_filename(''))[1] + ext = guess_extension(ctype, fnext) if not ext: # We don't know what it is, so assume it's just a shapeless # application/octet-stream, unless the Content-Type: is |
