summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Mailman/Handlers/Scrubber.py62
1 files changed, 56 insertions, 6 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 02f4fdcbb..1c1ea890f 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -27,6 +27,7 @@ from cStringIO import StringIO
import email
import email.Errors
from email.Parser import HeaderParser
+from email.Generator import Generator
from Mailman import LockFile
from Mailman import Message
@@ -38,20 +39,50 @@ ARCHIVE_FILE_VERSION = 1
+# We're using a subclass of the standard Generator because we want to suppress
+# headers in the subparts of multiparts. We use a hack -- the ctor argument
+# skipheaders to accomplish this. It's set to false for the outer Message
+# object, but defaults to true for all internal objects. We recognize that
+# sub-Generators will get created passing only mangle_from_ and maxheaderlen
+# to the ctors.
+#
+# This isn't perfect because we still get stuff like the multipart boundaries,
+# but see below for how we corrupt that to our nefarious goals.
+class ScrubberGenerator(Generator):
+ def __init__(self, outfp, mangle_from_=1, maxheaderlen=78, skipheaders=1):
+ Generator.__init__(self, outfp, mangle_from_=0)
+ self.__skipheaders = skipheaders
+
+ def _write_headers(self, msg):
+ if not self.__skipheaders:
+ Generator._write_headers(self, msg)
+
+
+
def process(mlist, msg, msgdata=None):
+ outer = 1
for part in msg.walk():
- # if the part is text/plain, we leave it alone
+ # If the part is text/plain, we leave it alone
if part.get_type('text/plain') == 'text/plain':
continue
+ # I think it's generally a good idea to scrub out HTML. You never
+ # know what's in there -- web bugs, JavaScript nasties, etc. If the
+ # whole message is HTML, just discard the entire thing. Otherwise,
+ # just add an indication that the HTML part was removed.
if part.get_type() == 'text/html':
- part.set_payload(cgi.escape(part.get_payload()))
+ if outer:
+ raise DiscardMessage
+ part.set_payload(_("An HTML attachment was scrubbed and removed"))
+ # If the message isn't a multipart, then we'll strip it out as an
+ # attachment that would have to be separately downloaded. Pipermail
+ # will transform the url into a hyperlink.
elif not part.is_multipart():
payload = part.get_payload()
ctype = part.get_type()
size = len(payload)
url = save_attachment(mlist, part)
desc = part.get('content-description', _('not available'))
- part.set_payload(_("""
+ part.set_payload(_("""\
A non-text attachment was scrubbed...
Type: %(ctype)s
Size: %(size)d bytes
@@ -59,10 +90,21 @@ Desc: %(desc)s
Url : %(url)s
"""))
# We still have to sanitize the message to flat text because Pipermail
- # can't handle messages with list payloads. Having to do it this way
- # seems most unfortunate. ;/
+ # can't handle messages with list payloads. This is a kludge (def (n)
+ # clever hack ;).
if msg.is_multipart():
- sfp = StringIO(str(msg))
+ # We're corrupting the boundary to provide some more useful
+ # information, because while we can suppress subpart headers, we can't
+ # suppress the inter-part boundary without a redesign of the Generator
+ # class or a rewrite of the whole _handle_multipart() method.
+ msg.set_boundary('%s %s attachment' %
+ ('-'*20, msg.get_type('text/plain')))
+ sfp = StringIO()
+ g = ScrubberGenerator(sfp, mangle_from_=0, skipheaders=0)
+ g(msg)
+ sfp.seek(0)
+ # We don't care about parsing the body because we've already scrubbed
+ # it of nasty stuff. Just slurp it all in.
msg = HeaderParser(Message.Message).parse(sfp)
return msg
@@ -71,6 +113,7 @@ Url : %(url)s
def save_attachment(mlist, msg):
# The directory to store the attachment in
dir = os.path.join(mlist.archive_dir(), 'attachments')
+ # We need a lock to calculate the next attachment number
lock = LockFile.LockFile(os.path.join(mlist.archive_dir(),
'attachments.lock'))
lock.lock()
@@ -100,7 +143,14 @@ def save_attachment(mlist, msg):
lock.unlock()
# Figure out the attachment type and get the decoded data
decodedpayload = msg.get_payload(decode=1)
+ # BAW: mimetypes ought to handle non-standard, but commonly found types,
+ # e.g. image/jpg (should be image/jpeg). For now we just store such
+ # things as application/octet-streams since that seems the safest.
ext = mimetypes.guess_extension(msg.get_type())
+ if not ext:
+ # We don't know what it is, so assume it's just a shapeless
+ # application/octet-stream
+ ext = '.bin'
fp = open(os.path.join(dir, file + ext), 'w')
fp.write(decodedpayload)
fp.close()