summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Mailman/Handlers/Scrubber.py 74
1 files changed, 52 insertions, 22 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 918bd2054..401f6db49 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -22,8 +22,9 @@ import re
import sha
import time
import errno
-import mimetypes
+import binascii
import tempfile
+import mimetypes
from cStringIO import StringIO
from types import IntType
@@ -103,17 +104,23 @@ def process(mlist, msg, msgdata=None):
if msgdata is None:
msgdata = {}
dir = calculate_attachments_dir(mlist, msg, msgdata)
+ charset = None
# Now walk over all subparts of this message and scrub out various types
for part in msg.walk():
ctype = part.get_type(part.get_default_type())
# If the part is text/plain, we leave it alone
if ctype == 'text/plain':
- pass
+ # We need to choose a charset for the scrubbed message, so we'll
+ # arbitrarily pick the charset of the first text/plain part in the
+ # message.
+ if charset is None:
+ charset = part.get_content_charset(charset)
elif ctype == 'text/html' and isinstance(sanitize, IntType):
if sanitize == 0:
if outer:
raise DiscardMessage
part.set_payload(_('HTML attachment scrubbed and removed'))
+ part.set_type('text/plain')
elif sanitize == 2:
# By leaving it alone, Pipermail will automatically escape it
pass
@@ -130,6 +137,7 @@ def process(mlist, msg, msgdata=None):
An HTML attachment was scrubbed...
URL: %(url)s
"""))
+ part.set_type('text/plain')
else:
# HTML-escape it and store it as an attachment, but make it
# look a /little/ bit prettier. :(
@@ -155,6 +163,7 @@ URL: %(url)s
An HTML attachment was scrubbed...
URL: %(url)s
"""))
+ part.set_type('text/plain')
elif ctype == 'message/rfc822':
# This part contains a submessage, so it too needs scrubbing
submsg = part.get_payload(0)
@@ -175,10 +184,7 @@ Date: %(date)s
Size: %(size)s
Url: %(url)s
"""))
- # If we were to leave the message/rfc822 Content-Type: header, it
- # would confuse the generator. So just delete it. The generator
- # will treat this as a text/plain message.
- del part['content-type']
+ part.set_type('text/plain')
# If the message isn't a multipart, then we'll strip it out as an
# attachment that would have to be separately downloaded. Pipermail
# will transform the url into a hyperlink.
@@ -201,24 +207,48 @@ Size: %(size)d bytes
Desc: %(desc)s
Url : %(url)s
"""))
+ part.set_type('text/plain')
outer = 0
- # We still have to sanitize the message to flat text because Pipermail
- # can't handle messages with list payloads. This is a kludge (def (n)
- # clever hack ;).
+ # We still have to sanitize multipart messages to flat text because
+ # Pipermail can't handle messages with list payloads. This is a kludge;
+ # def (n) clever hack ;).
if msg.is_multipart():
- # We're corrupting the boundary to provide some more useful
- # information, because while we can suppress subpart headers, we can't
- # suppress the inter-part boundary without a redesign of the Generator
- # class or a rewrite of of the whole _handle_multipart() method.
- msg.set_boundary('%s %s attachment' %
- ('-'*20, msg.get_type('text/plain')))
- sfp = StringIO()
- g = ScrubberGenerator(sfp, mangle_from_=0, skipheaders=0)
- g(msg)
- sfp.seek(0)
- # We don't care about parsing the body because we've already scrubbed
- # it of nasty stuff. Just slurp it all in.
- msg = HeaderParser(Message.Message).parse(sfp)
+ # By default we take the charset of the first text/plain part in the
+ # message, but if there was none, we'll use the list's preferred
+ # language's charset.
+ if charset is None:
+ charset = Utils.GetCharSet(mlist.preferred_language)
+ # We now want to concatenate all the parts which have been scrubbed to
+ # text/plain, into a single text/plain payload. We need to make sure
+ # all the characters in the concatenated string are in the same
+ # encoding, so we'll use the 'replace' key in the coercion call.
+ # BAW: Martin's original patch suggested we might want to try
+ # generalizing to utf-8, and that's probably a good idea (eventually).
+ text = []
+ for part in msg.get_payload():
+ # All parts should be scrubbed to text/plain by now.
+ partctype = part.get_content_type()
+ if partctype <> 'text/plain':
+ text.append(_('Skipped content of type %(partctype)s'))
+ continue
+ try:
+ t = part.get_payload(decode=1)
+ except binascii.Error:
+ t = part.get_payload()
+ partcharset = part.get_charset()
+ if partcharset and partcharset <> charset:
+ try:
+ t = unicode(t, partcharset, 'replace')
+ # Should use HTML-Escape, or try generalizing to UTF-8
+ t = t.encode(charset, 'replace')
+ except UnicodeError:
+ # Replace funny characters
+ t = unicode(t, 'ascii', 'replace').encode('ascii')
+ text.append(t)
+ # Now join the text and set the payload
+ sep = _('-------------- next part --------------\n')
+ msg.set_payload(sep.join(text), charset)
+ msg.set_type('text/plain')
return msg