diff options
| author | bwarsaw | 2002-08-13 14:33:32 +0000 |
|---|---|---|
| committer | bwarsaw | 2002-08-13 14:33:32 +0000 |
| commit | 5aa88aadb141f4df5860bd4c6a82951ced75a01b (patch) | |
| tree | 8dff19cf82d2a1849d848d610668d44aa8a55160 | |
| parent | 556c849e09d080fca8fc076703dc4a7a7fff7819 (diff) | |
| download | mailman-5aa88aadb141f4df5860bd4c6a82951ced75a01b.tar.gz mailman-5aa88aadb141f4df5860bd4c6a82951ced75a01b.tar.zst mailman-5aa88aadb141f4df5860bd4c6a82951ced75a01b.zip | |
| -rw-r--r-- | Mailman/Handlers/Scrubber.py | 103 |
1 files changed, 63 insertions, 40 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index 511187e8c..ddf51d796 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -20,12 +20,14 @@ import os import re import sha +import time import errno import mimetypes import tempfile from cStringIO import StringIO from types import IntType +from email.Utils import parsedate from email.Parser import HeaderParser from email.Generator import Generator @@ -67,9 +69,39 @@ class ScrubberGenerator(Generator): +def calculate_attachments_dir(mlist, msg, msgdata): + # Calculate the directory that attachments for this message will go + # under. To avoid inode limitations, the scheme will be: + # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files> + # Start by calculating the date-based and msgid-hash components. + msgdate = msg.get('Date') + if msgdate is None: + now = time.gmtime(msgdata.get('received_time', time.time())) + else: + now = parsedate(msgdate) + datedir = time.strftime('%Y%m%d', now) + # As for the msgid hash, we'll base this part on the Message-ID: so that + # all attachments for the same message end up in the same directory (we'll + # uniquify the filenames in that directory as needed). We use the first 2 + # and last 2 bytes of the SHA1 hash of the message id as the basis of the + # directory name. Clashes here don't really matter too much, and that + # still gives us a 32-bit space to work with. + msgid = msg['message-id'] + if msgid is None: + msgid = msg['Message-ID'] = Utils.unique_message_id(mlist) + # We assume that the message id actually /is/ unique! + digest = sha.new(msgid).hexdigest() + return os.path.join('attachments', datedir, digest[:4] + digest[-4:]) + + + def process(mlist, msg, msgdata=None): sanitize = mm_cfg.ARCHIVE_HTML_SANITIZER outer = 1 + if msgdata is None: + msgdata = {} + dir = calculate_attachments_dir(mlist, msg, msgdata) + # Now walk over all subparts of this message and scrub out various types for part in msg.walk(): ctype = part.get_type(part.get_default_type()) # If the part is text/plain, we leave it alone @@ -89,7 +121,7 @@ def process(mlist, msg, msgdata=None): # lists. omask = os.umask(002) try: - url = save_attachment(mlist, part, filter_html=0) + url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) part.set_payload(_("""\ @@ -111,7 +143,7 @@ URL: %(url)s part.set_payload(payload) omask = os.umask(002) try: - url = save_attachment(mlist, part, filter_html=0) + url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) part.set_payload(_("""\ @@ -123,7 +155,7 @@ URL: %(url)s submsg = part.get_payload(0) omask = os.umask(002) try: - url = save_attachment(mlist, part) + url = save_attachment(mlist, part, dir) finally: os.umask(omask) subject = submsg.get('subject', _('no subject')) @@ -151,7 +183,7 @@ Url: %(url)s size = len(payload) omask = os.umask(002) try: - url = save_attachment(mlist, part) + url = save_attachment(mlist, part, dir) finally: os.umask(omask) desc = part.get('content-description', _('not available')) @@ -186,33 +218,23 @@ Url : %(url)s -def save_attachment(mlist, msg, filter_html=1): - # The directory to store the attachment in - dir = os.path.join(mlist.archive_dir(), 'attachments') +def makedirs(dir): + # Create all the directories to store this attachment in try: - os.mkdir(dir, 02775) - except OSError, e: - if e.errno <> errno.EEXIST: raise - # We need a directory to contain this message's attachments. Base it - # on the Message-ID: so that all attachments for the same message end - # up in the same directory (we'll uniquify the filenames in that - # directory as needed). We use the first 2 and last 2 bytes of the - # SHA1 has of the message id as the basis of the directory name. - # Clashes here don't really matter too much, and that still gives us a - # 32-bit space to work with. - msgid = msg['message-id'] - if msgid is None: - msgid = msg['Message-ID'] = Utils.unique_message_id(mlist) - # We assume that the message id actually /is/ unique! - digest = sha.new(msgid).hexdigest() - msgdir = digest[:4] + digest[-4:] - try: - path = os.path.join(dir, msgdir) - os.mkdir(path, 02775) - # For FreeBSD, which is broken wrt the mode arg of mkdir() - os.chmod(path, 02775) + os.makedirs(dir, 02775) except OSError, e: if e.errno <> errno.EEXIST: raise + # Unfortunately, FreeBSD seems to be broken in that it doesn't honor the + # mode arg of mkdir(). + def twiddle(arg, dirname, names): + os.chmod(dirname, 02775) + os.path.walk(dir, twiddle, None) + + + +def save_attachment(mlist, msg, dir, filter_html=1): + fsdir = os.path.join(mlist.archive_dir(), dir) + makedirs(fsdir) # Figure out the attachment type and get the decoded data decodedpayload = msg.get_payload(decode=1) # BAW: mimetypes ought to handle non-standard, but commonly found types, @@ -230,7 +252,7 @@ def save_attachment(mlist, msg, filter_html=1): ext = '.bin' path = None # We need a lock to calculate the next attachment number - lockfile = os.path.join(dir, msgdir, 'attachments.lock') + lockfile = os.path.join(fsdir, 'attachments.lock') lock = LockFile.LockFile(lockfile) lock.lock() try: @@ -238,7 +260,7 @@ def save_attachment(mlist, msg, filter_html=1): # necessary. filename = msg.get_filename() if not filename: - filename = 'attachment' + ext + filebase = 'attachment' else: # Sanitize the filename given in the message headers parts = pre.split(filename) @@ -246,16 +268,17 @@ def save_attachment(mlist, msg, filter_html=1): # Allow only alphanumerics, dash, underscore, and dot filename = sre.sub('', filename) # If the filename's extension doesn't match the type we guessed, - # which one should we go with? Not sure. Let's do this at least: - # if the filename /has/ no extension, then tack on the one we - # guessed. - if not os.path.splitext(filename)[1]: - filename += ext - # BAW: Anything else we need to be worried about? + # which one should we go with? For now, let's go with the one we + # guessed so attachments can't lie about their type. Also, if the + # filename /has/ no extension, then tack on the one we guessed. + filebase, ignore = os.path.splitext(filename) + # Now we're looking for a unique name for this file on the file + # system. If msgdir/filebase.ext isn't unique, we'll add a counter + # after filebase, e.g. msgdir/filebase-cnt.ext counter = 0 extra = '' while 1: - path = os.path.join(dir, msgdir, filename + extra) + path = os.path.join(fsdir, filebase + extra + ext) # Generally it is not a good idea to test for file existance # before just trying to create it, but the alternatives aren't # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't @@ -263,7 +286,7 @@ def save_attachment(mlist, msg, filter_html=1): # guaranteed that no other process will be racing with us. if os.path.exists(path): counter += 1 - extra = '-%04d%s' % (counter, ext) + extra = '-%04d' % counter else: break finally: @@ -293,8 +316,8 @@ def save_attachment(mlist, msg, filter_html=1): # BAW: Since we've now sanitized the document, it should be plain # text. Blarg, we really want the sanitizer to tell us what the type # if the return data is. :( + ext = '.txt' path = base + '.txt' - filename = os.path.splitext(filename)[0] + '.txt' # Is it a message/rfc822 attachment? elif msg.get_type() == 'message/rfc822': submsg = msg.get_payload() @@ -308,5 +331,5 @@ def save_attachment(mlist, msg, filter_html=1): # Private archives will likely have a trailing slash. Normalize. if baseurl[-1] <> '/': baseurl += '/' - url = baseurl + 'attachments/%s/%s' % (msgdir, filename) + url = baseurl + '%s/%s%s%s' % (dir, filebase, extra, ext) return url |
