diff options
| -rw-r--r-- | Mailman/Handlers/Scrubber.py | 153 |
1 files changed, 111 insertions, 42 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index 911bee7e0..24a3c2b48 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -18,24 +18,30 @@ """ import os -import cgi +import re +import sha import errno -import cPickle import mimetypes +import tempfile from cStringIO import StringIO +from types import StringType -import email -import email.Errors from email.Parser import HeaderParser from email.Generator import Generator +from Mailman import mm_cfg +from Mailman import Utils from Mailman import LockFile from Mailman import Message from Mailman.Errors import DiscardMessage from Mailman.i18n import _ from Mailman.Logging.Syslog import syslog -ARCHIVE_FILE_VERSION = 1 +# Path characters for common platforms +pre = re.compile(r'[/\\:]') +# All other characters to strip out of Content-Disposition: filenames +# (essentially anything that isn't an alphanum, dot, slash, or underscore. +sre = re.compile(r'[^-\w.]') @@ -65,14 +71,15 @@ def process(mlist, msg, msgdata=None): # If the part is text/plain, we leave it alone if part.get_type('text/plain') == 'text/plain': pass - # I think it's generally a good idea to scrub out HTML. You never - # know what's in there -- web bugs, JavaScript nasties, etc. If the - # whole message is HTML, just discard the entire thing. Otherwise, - # just add an indication that the HTML part was removed. - elif part.get_type() == 'text/html': - if outer: - raise DiscardMessage - part.set_payload(_("An HTML attachment was scrubbed and removed")) + elif part.get_type() == 'text/html' and \ + not isinstance(mm_cfg.ARCHIVE_HTML_SANITIZER, StringType): + if mm_cfg.ARCHIVE_HTML_SANITIZER == 0: + if outer: + raise DiscardMessage + part.set_payload(_('HTML attachment scrubbed and removed')) + else: + # By leaving it alone, Pipermail will automatically escape it + pass # If the message isn't a multipart, then we'll strip it out as an # attachment that would have to be separately downloaded. Pipermail # will transform the url into a hyperlink. @@ -86,8 +93,10 @@ def process(mlist, msg, msgdata=None): finally: os.umask(omask) desc = part.get('content-description', _('not available')) + filename = part.get_filename(_('not available')) part.set_payload(_("""\ A non-text attachment was scrubbed... +Name: %(filename)s Type: %(ctype)s Size: %(size)d bytes Desc: %(desc)s @@ -118,34 +127,27 @@ Url : %(url)s def save_attachment(mlist, msg): # The directory to store the attachment in dir = os.path.join(mlist.archive_dir(), 'attachments') - # We need a lock to calculate the next attachment number - lock = LockFile.LockFile(os.path.join(mlist.archive_dir(), - 'attachments.lock')) - lock.lock() try: - try: - os.mkdir(dir, 02775) - except OSError, e: - if e.errno <> errno.EEXIST: raise - # Open the attachments database file - dbfile = os.path.join(dir, 'attachments.pck') - try: - fp = open(dbfile) - d = cPickle.load(fp) - fp.close() - except IOError, e: - if e.errno <> errno.ENOENT: raise - d = {'version': ARCHIVE_FILE_VERSION, - 'next' : 1, - } - # Calculate the attachment file name - file = 'attachment-%04d' % d['next'] - d['next'] += 1 - fp = open(dbfile, 'w') - cPickle.dump(d, fp, 1) - fp.close() - finally: - lock.unlock() + os.mkdir(dir, 02775) + except OSError, e: + if e.errno <> errno.EEXIST: raise + # We need a directory to contain this message's attachments. Base it + # on the Message-ID: so that all attachments for the same message end + # up in the same directory (we'll uniquify the filenames in that + # directory as needed). We use the first 2 and last 2 bytes of the + # SHA1 has of the message id as the basis of the directory name. + # Clashes here don't really matter too much, and that still gives us a + # 32-bit space to work with. + msgid = msg['message-id'] + if msgid is None: + msgid = msg['Message-ID'] = Utils.unique_message_id(mlist) + # We assume that the message id actually /is/ unique! + digest = sha.new(msgid).hexdigest() + msgdir = digest[:4] + digest[-4:] + try: + os.mkdir(os.path.join(dir, msgdir), 02775) + except OSError, e: + if e.errno <> errno.EEXIST: raise # Figure out the attachment type and get the decoded data decodedpayload = msg.get_payload(decode=1) # BAW: mimetypes ought to handle non-standard, but commonly found types, @@ -156,9 +158,76 @@ def save_attachment(mlist, msg): # We don't know what it is, so assume it's just a shapeless # application/octet-stream ext = '.bin' - fp = open(os.path.join(dir, file + ext), 'w') + path = None + # We need a lock to calculate the next attachment number + lockfile = os.path.join(dir, msgdir, 'attachments.lock') + lock = LockFile.LockFile(lockfile) + lock.lock() + try: + # Now base the filename on what's in the attachment, uniquifying it if + # necessary. + filename = msg.get_filename() + if not filename: + filename = 'attachment' + ext + else: + # Sanitize the filename given in the message headers + parts = pre.split(filename) + filename = parts[-1] + # Allow only alphanumerics, dash, underscore, and dot + filename = sre.sub('', filename) + # If the filename's extension doesn't match the type we guessed, + # which one should we go with? Not sure. Let's do this at least: + # if the filename /has/ no extension, then tack on the one we + # guessed. + if not os.path.splitext(filename)[1]: + filename += ext + # BAW: Anything else we need to be worried about? + counter = 0 + extra = '' + while 1: + path = os.path.join(dir, msgdir, filename + extra) + # Generally it is not a good idea to test for file existance + # before just trying to create it, but the alternatives aren't + # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't + # NFS-safe). Besides, we have an exclusive lock now, so we're + # guaranteed that no other process will be racing with us. + if os.path.exists(path): + counter += 1 + extra = '-%04d%s' % (counter, ext) + else: + break + finally: + lock.unlock() + # `path' now contains the unique filename for the attachment. There's + # just one more step we need to do. If the part is text/html and + # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be + # here), then send the attachment through the filter program for + # sanitization + if msg.get_type() == 'text/html': + base, ext = os.path.splitext(path) + tmppath = base + '-tmp' + ext + fp = open(tmppath, 'w') + try: + fp.write(decodedpayload) + fp.close() + cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath} + progfp = os.popen(cmd, 'r') + decodedpayload = progfp.read() + status = progfp.close() + if status: + syslog('error', + 'HTML sanitizer exited with non-zero status: %s', + status) + finally: + os.unlink(tmppath) + # BAW: Since we've now sanitized the document, it should be plain + # text. Blarg, we really want the sanitizer to tell us what the type + # if the return data is. :( + path = base + '.txt' + filename = os.path.splitext(filename)[0] + '.txt' + fp = open(path, 'w') fp.write(decodedpayload) fp.close() # Now calculate the url - url = mlist.GetBaseArchiveURL() + '/attachments/' + file + ext + url = mlist.GetBaseArchiveURL() + '/attachments/%s/%s' % (msgdir, filename) return url |
