Much revision based on good feedback from mailman-developers.

Specifically, process(): Instead of a blanket discard of text/html parts, what we do now depends on the value of ARCHIVE_HTML_SANITIZER. Also, the "scrubbed" message now includes the value of get_filename() if available. save_attachment(): Several refinements, including: all of a message's attachments are stored in a subdirectory off archives/private/mylist/attachments. This subdir is calculated based on the SHA1 hash of the Message-ID: header. We store the attachment in a file named after the get_filename() value, sanitized for nasty characters, absolute paths, etc. The filename is uniquified within the message's attachments subdir. This function also knows about text/html and performs filtering when ARCHIVE_HTML_SANITIZER is a program string. Note that currently, we assume that this program will filter html into plain text, so we change the attachment's suffix to .txt accordingly. We don't need the attachments.pck file anymore.
author: bwarsaw 2001-10-26 22:05:23 +0000
committer: bwarsaw 2001-10-26 22:05:23 +0000
commit: 463431ea0dfbeb1c3d0df192a68d93d84ca65a9a (patch)
tree: 01d0db9175b6624cb2c23af2cdc395c3e6710d85
parent: d0ad817a50ca2cb351a7f07e31717b507cd06b56 (diff)
download: mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.tar.gz
mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.tar.zst
mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.zip
1 files changed, 111 insertions, 42 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 911bee7e0..24a3c2b48 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -18,24 +18,30 @@
 """
 
 import os
-import cgi
+import re
+import sha
 import errno
-import cPickle
 import mimetypes
+import tempfile
 from cStringIO import StringIO
+from types import StringType
 
-import email
-import email.Errors
 from email.Parser import HeaderParser
 from email.Generator import Generator
 
+from Mailman import mm_cfg
+from Mailman import Utils
 from Mailman import LockFile
 from Mailman import Message
 from Mailman.Errors import DiscardMessage
 from Mailman.i18n import _
 from Mailman.Logging.Syslog import syslog
 
-ARCHIVE_FILE_VERSION = 1
+# Path characters for common platforms
+pre = re.compile(r'[/\\:]')
+# All other characters to strip out of Content-Disposition: filenames
+# (essentially anything that isn't an alphanum, dot, slash, or underscore.
+sre = re.compile(r'[^-\w.]')
 
 
 
@@ -65,14 +71,15 @@ def process(mlist, msg, msgdata=None):
         # If the part is text/plain, we leave it alone
         if part.get_type('text/plain') == 'text/plain':
             pass
-        # I think it's generally a good idea to scrub out HTML.  You never
-        # know what's in there -- web bugs, JavaScript nasties, etc.  If the
-        # whole message is HTML, just discard the entire thing.  Otherwise,
-        # just add an indication that the HTML part was removed.
-        elif part.get_type() == 'text/html':
-            if outer:
-                raise DiscardMessage
-            part.set_payload(_("An HTML attachment was scrubbed and removed"))
+        elif part.get_type() == 'text/html' and \
+             not isinstance(mm_cfg.ARCHIVE_HTML_SANITIZER, StringType):
+            if mm_cfg.ARCHIVE_HTML_SANITIZER == 0:
+                if outer:
+                    raise DiscardMessage
+                part.set_payload(_('HTML attachment scrubbed and removed'))
+            else:
+                # By leaving it alone, Pipermail will automatically escape it
+                pass
         # If the message isn't a multipart, then we'll strip it out as an
         # attachment that would have to be separately downloaded.  Pipermail
         # will transform the url into a hyperlink.
@@ -86,8 +93,10 @@ def process(mlist, msg, msgdata=None):
             finally:
                 os.umask(omask)
             desc = part.get('content-description', _('not available'))
+            filename = part.get_filename(_('not available'))
             part.set_payload(_("""\
 A non-text attachment was scrubbed...
+Name: %(filename)s
 Type: %(ctype)s
 Size: %(size)d bytes
 Desc: %(desc)s
@@ -118,34 +127,27 @@ Url : %(url)s
 def save_attachment(mlist, msg):
     # The directory to store the attachment in
     dir = os.path.join(mlist.archive_dir(), 'attachments')
-    # We need a lock to calculate the next attachment number
-    lock = LockFile.LockFile(os.path.join(mlist.archive_dir(),
-                                          'attachments.lock'))
-    lock.lock()
     try:
-        try:
-            os.mkdir(dir, 02775)
-        except OSError, e:
-            if e.errno <> errno.EEXIST: raise
-        # Open the attachments database file
-        dbfile = os.path.join(dir, 'attachments.pck')
-        try:
-            fp = open(dbfile)
-            d = cPickle.load(fp)
-            fp.close()
-        except IOError, e:
-            if e.errno <> errno.ENOENT: raise
-            d = {'version': ARCHIVE_FILE_VERSION,
-                 'next'   : 1,
-                 }
-        # Calculate the attachment file name
-        file = 'attachment-%04d' % d['next']
-        d['next'] += 1
-        fp = open(dbfile, 'w')
-        cPickle.dump(d, fp, 1)
-        fp.close()
-    finally:
-        lock.unlock()
+        os.mkdir(dir, 02775)
+    except OSError, e:
+        if e.errno <> errno.EEXIST: raise
+    # We need a directory to contain this message's attachments.  Base it
+    # on the Message-ID: so that all attachments for the same message end
+    # up in the same directory (we'll uniquify the filenames in that
+    # directory as needed).  We use the first 2 and last 2 bytes of the
+    # SHA1 has of the message id as the basis of the directory name.
+    # Clashes here don't really matter too much, and that still gives us a
+    # 32-bit space to work with.
+    msgid = msg['message-id']
+    if msgid is None:
+        msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
+    # We assume that the message id actually /is/ unique!
+    digest = sha.new(msgid).hexdigest()
+    msgdir = digest[:4] + digest[-4:]
+    try:
+        os.mkdir(os.path.join(dir, msgdir), 02775)
+    except OSError, e:
+        if e.errno <> errno.EEXIST: raise
     # Figure out the attachment type and get the decoded data
     decodedpayload = msg.get_payload(decode=1)
     # BAW: mimetypes ought to handle non-standard, but commonly found types,
@@ -156,9 +158,76 @@ def save_attachment(mlist, msg):
         # We don't know what it is, so assume it's just a shapeless
         # application/octet-stream
         ext = '.bin'
-    fp = open(os.path.join(dir, file + ext), 'w')
+    path = None
+    # We need a lock to calculate the next attachment number
+    lockfile = os.path.join(dir, msgdir, 'attachments.lock')
+    lock = LockFile.LockFile(lockfile)
+    lock.lock()
+    try:
+        # Now base the filename on what's in the attachment, uniquifying it if
+        # necessary.
+        filename = msg.get_filename()
+        if not filename:
+            filename = 'attachment' + ext
+        else:
+            # Sanitize the filename given in the message headers
+            parts = pre.split(filename)
+            filename = parts[-1]
+            # Allow only alphanumerics, dash, underscore, and dot
+            filename = sre.sub('', filename)
+            # If the filename's extension doesn't match the type we guessed,
+            # which one should we go with?  Not sure.  Let's do this at least:
+            # if the filename /has/ no extension, then tack on the one we
+            # guessed.
+            if not os.path.splitext(filename)[1]:
+                filename += ext
+            # BAW: Anything else we need to be worried about?
+        counter = 0
+        extra = ''
+        while 1:
+            path = os.path.join(dir, msgdir, filename + extra)
+            # Generally it is not a good idea to test for file existance
+            # before just trying to create it, but the alternatives aren't
+            # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
+            # NFS-safe).  Besides, we have an exclusive lock now, so we're
+            # guaranteed that no other process will be racing with us.
+            if os.path.exists(path):
+                counter += 1
+                extra = '-%04d%s' % (counter, ext)
+            else:
+                break
+    finally:
+        lock.unlock()
+    # `path' now contains the unique filename for the attachment.  There's
+    # just one more step we need to do.  If the part is text/html and
+    # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
+    # here), then send the attachment through the filter program for
+    # sanitization
+    if msg.get_type() == 'text/html':
+        base, ext = os.path.splitext(path)
+        tmppath = base + '-tmp' + ext
+        fp = open(tmppath, 'w')
+        try:
+            fp.write(decodedpayload)
+            fp.close()
+            cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
+            progfp = os.popen(cmd, 'r')
+            decodedpayload = progfp.read()
+            status = progfp.close()
+            if status:
+                syslog('error',
+                       'HTML sanitizer exited with non-zero status: %s',
+                       status)
+        finally:
+            os.unlink(tmppath)
+        # BAW: Since we've now sanitized the document, it should be plain
+        # text.  Blarg, we really want the sanitizer to tell us what the type
+        # if the return data is. :(
+        path = base + '.txt'
+        filename = os.path.splitext(filename)[0] + '.txt'
+    fp = open(path, 'w')
     fp.write(decodedpayload)
     fp.close()
     # Now calculate the url
-    url = mlist.GetBaseArchiveURL() + '/attachments/' + file + ext
+    url = mlist.GetBaseArchiveURL() + '/attachments/%s/%s' % (msgdir, filename)
     return url
author	bwarsaw	2001-10-26 22:05:23 +0000
committer	bwarsaw	2001-10-26 22:05:23 +0000
commit	463431ea0dfbeb1c3d0df192a68d93d84ca65a9a (patch)
tree	01d0db9175b6624cb2c23af2cdc395c3e6710d85
parent	d0ad817a50ca2cb351a7f07e31717b507cd06b56 (diff)
download	mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.tar.gz mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.tar.zst mailman-463431ea0dfbeb1c3d0df192a68d93d84ca65a9a.zip