1 files changed, 56 insertions, 6 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 02f4fdcbb..1c1ea890f 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -27,6 +27,7 @@ from cStringIO import StringIO
 import email
 import email.Errors
 from email.Parser import HeaderParser
+from email.Generator import Generator
 
 from Mailman import LockFile
 from Mailman import Message
@@ -38,20 +39,50 @@ ARCHIVE_FILE_VERSION = 1
 
 
 
+# We're using a subclass of the standard Generator because we want to suppress
+# headers in the subparts of multiparts.  We use a hack -- the ctor argument
+# skipheaders to accomplish this.  It's set to false for the outer Message
+# object, but defaults to true for all internal objects.  We recognize that
+# sub-Generators will get created passing only mangle_from_ and maxheaderlen
+# to the ctors.
+#
+# This isn't perfect because we still get stuff like the multipart boundaries,
+# but see below for how we corrupt that to our nefarious goals.
+class ScrubberGenerator(Generator):
+    def __init__(self, outfp, mangle_from_=1, maxheaderlen=78, skipheaders=1):
+        Generator.__init__(self, outfp, mangle_from_=0)
+        self.__skipheaders = skipheaders
+
+    def _write_headers(self, msg):
+        if not self.__skipheaders:
+            Generator._write_headers(self, msg)
+
+
+
 def process(mlist, msg, msgdata=None):
+    outer = 1
     for part in msg.walk():
-        # if the part is text/plain, we leave it alone
+        # If the part is text/plain, we leave it alone
         if part.get_type('text/plain') == 'text/plain':
             continue
+        # I think it's generally a good idea to scrub out HTML.  You never
+        # know what's in there -- web bugs, JavaScript nasties, etc.  If the
+        # whole message is HTML, just discard the entire thing.  Otherwise,
+        # just add an indication that the HTML part was removed.
         if part.get_type() == 'text/html':
-            part.set_payload(cgi.escape(part.get_payload()))
+            if outer:
+                raise DiscardMessage
+            part.set_payload(_("An HTML attachment was scrubbed and removed"))
+        # If the message isn't a multipart, then we'll strip it out as an
+        # attachment that would have to be separately downloaded.  Pipermail
+        # will transform the url into a hyperlink.
         elif not part.is_multipart():
             payload = part.get_payload()
             ctype = part.get_type()
             size = len(payload)
             url = save_attachment(mlist, part)
             desc = part.get('content-description', _('not available'))
-            part.set_payload(_("""
+            part.set_payload(_("""\
 A non-text attachment was scrubbed...
 Type: %(ctype)s
 Size: %(size)d bytes
@@ -59,10 +90,21 @@ Desc: %(desc)s
 Url : %(url)s
 """))
     # We still have to sanitize the message to flat text because Pipermail
-    # can't handle messages with list payloads.  Having to do it this way
-    # seems most unfortunate. ;/
+    # can't handle messages with list payloads.  This is a kludge (def (n)
+    # clever hack ;).
     if msg.is_multipart():
-        sfp = StringIO(str(msg))
+        # We're corrupting the boundary to provide some more useful
+        # information, because while we can suppress subpart headers, we can't
+        # suppress the inter-part boundary without a redesign of the Generator
+        # class or a rewrite of the whole _handle_multipart() method.
+        msg.set_boundary('%s %s attachment' %
+                         ('-'*20, msg.get_type('text/plain')))
+        sfp = StringIO()
+        g = ScrubberGenerator(sfp, mangle_from_=0, skipheaders=0)
+        g(msg)
+        sfp.seek(0)
+        # We don't care about parsing the body because we've already scrubbed
+        # it of nasty stuff.  Just slurp it all in.
         msg = HeaderParser(Message.Message).parse(sfp)
     return msg
 
@@ -71,6 +113,7 @@ Url : %(url)s
 def save_attachment(mlist, msg):
     # The directory to store the attachment in
     dir = os.path.join(mlist.archive_dir(), 'attachments')
+    # We need a lock to calculate the next attachment number
     lock = LockFile.LockFile(os.path.join(mlist.archive_dir(),
                                           'attachments.lock'))
     lock.lock()
@@ -100,7 +143,14 @@ def save_attachment(mlist, msg):
         lock.unlock()
     # Figure out the attachment type and get the decoded data
     decodedpayload = msg.get_payload(decode=1)
+    # BAW: mimetypes ought to handle non-standard, but commonly found types,
+    # e.g. image/jpg (should be image/jpeg).  For now we just store such
+    # things as application/octet-streams since that seems the safest.
     ext = mimetypes.guess_extension(msg.get_type())
+    if not ext:
+        # We don't know what it is, so assume it's just a shapeless
+        # application/octet-stream
+        ext = '.bin'
     fp = open(os.path.join(dir, file + ext), 'w')
     fp.write(decodedpayload)
     fp.close()