diff options
| -rw-r--r-- | Mailman/Handlers/Scrubber.py | 62 |
1 files changed, 56 insertions, 6 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index 02f4fdcbb..1c1ea890f 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -27,6 +27,7 @@ from cStringIO import StringIO import email import email.Errors from email.Parser import HeaderParser +from email.Generator import Generator from Mailman import LockFile from Mailman import Message @@ -38,20 +39,50 @@ ARCHIVE_FILE_VERSION = 1 +# We're using a subclass of the standard Generator because we want to suppress +# headers in the subparts of multiparts. We use a hack -- the ctor argument +# skipheaders to accomplish this. It's set to false for the outer Message +# object, but true for all internal objects. We recognize that +# sub-Generators will get created passing only mangle_from_ and maxheaderlen +# to the ctors. +# +# This isn't perfect because we still get stuff like the multipart boundaries, +# but see below for how we corrupt that to our nefarious goals. +class ScrubberGenerator(Generator): + def __init__(self, outfp, mangle_from_=1, maxheaderlen=78, skipheaders=1): + Generator.__init__(self, outfp, mangle_from_=0) + self.__skipheaders = skipheaders + + def _write_headers(self, msg): + if not self.__skipheaders: + Generator._write_headers(self, msg) + + + def process(mlist, msg, msgdata=None): + outer = 1 for part in msg.walk(): - # if the part is text/plain, we leave it alone + # If the part is text/plain, we leave it alone if part.get_type('text/plain') == 'text/plain': continue + # I think it's generally a good idea to scrub out HTML. You never + # know what's in there -- web bugs, JavaScript nasties, etc. If the + # whole message is HTML, just discard the entire thing. Otherwise, + # just add an indication that the HTML part was removed. 
if part.get_type() == 'text/html': - part.set_payload(cgi.escape(part.get_payload())) + if outer: + raise DiscardMessage + part.set_payload(_("An HTML attachment was scrubbed and removed")) + # If the message isn't a multipart, then we'll strip it out as an + # attachment that would have to be separately downloaded. Pipermail + # will transform the url into a hyperlink. elif not part.is_multipart(): payload = part.get_payload() ctype = part.get_type() size = len(payload) url = save_attachment(mlist, part) desc = part.get('content-description', _('not available')) - part.set_payload(_(""" + part.set_payload(_("""\ A non-text attachment was scrubbed... Type: %(ctype)s Size: %(size)d bytes @@ -59,10 +90,21 @@ Desc: %(desc)s Url : %(url)s """)) # We still have to sanitize the message to flat text because Pipermail - # can't handle messages with list payloads. Having to do it this way - # seems most unfortunate. ;/ + # can't handle messages with list payloads. This is a kludge (def (n) + # clever hack ;). if msg.is_multipart(): - sfp = StringIO(str(msg)) + # We're corrupting the boundary to provide some more useful + # information, because while we can suppress subpart headers, we can't + # suppress the inter-part boundary without a redesign of the Generator + # class or a rewrite of the whole _handle_multipart() method. + msg.set_boundary('%s %s attachment' % + ('-'*20, msg.get_type('text/plain'))) + sfp = StringIO() + g = ScrubberGenerator(sfp, mangle_from_=0, skipheaders=0) + g(msg) + sfp.seek(0) + # We don't care about parsing the body because we've already scrubbed + # it of nasty stuff. Just slurp it all in. 
msg = HeaderParser(Message.Message).parse(sfp) return msg @@ -71,6 +113,7 @@ Url : %(url)s def save_attachment(mlist, msg): # The directory to store the attachment in dir = os.path.join(mlist.archive_dir(), 'attachments') + # We need a lock to calculate the next attachment number lock = LockFile.LockFile(os.path.join(mlist.archive_dir(), 'attachments.lock')) lock.lock() @@ -100,7 +143,14 @@ def save_attachment(mlist, msg): lock.unlock() # Figure out the attachment type and get the decoded data decodedpayload = msg.get_payload(decode=1) + # BAW: mimetypes ought to handle non-standard, but commonly found types, + # e.g. image/jpg (should be image/jpeg). For now we just store such + # things as application/octet-streams since that seems the safest. ext = mimetypes.guess_extension(msg.get_type()) + if not ext: + # We don't know what it is, so assume it's just a shapeless + # application/octet-stream + ext = '.bin' fp = open(os.path.join(dir, file + ext), 'w') fp.write(decodedpayload) fp.close() |
