summaryrefslogtreecommitdiff
path: root/Mailman/Handlers/Scrubber.py
diff options
context:
space:
mode:
Diffstat (limited to 'Mailman/Handlers/Scrubber.py')
-rw-r--r--Mailman/Handlers/Scrubber.py103
1 files changed, 63 insertions, 40 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 511187e8c..ddf51d796 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -20,12 +20,14 @@
import os
import re
import sha
+import time
import errno
import mimetypes
import tempfile
from cStringIO import StringIO
from types import IntType
+from email.Utils import parsedate
from email.Parser import HeaderParser
from email.Generator import Generator
@@ -67,9 +69,39 @@ class ScrubberGenerator(Generator):
+def calculate_attachments_dir(mlist, msg, msgdata):
+ # Calculate the directory that attachments for this message will go
+ # under. To avoid inode limitations, the scheme will be:
+ # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
+ # Start by calculating the date-based and msgid-hash components.
+ msgdate = msg.get('Date')
+ if msgdate is None:
+ now = time.gmtime(msgdata.get('received_time', time.time()))
+ else:
+ now = parsedate(msgdate)
+ datedir = time.strftime('%Y%m%d', now)
+ # As for the msgid hash, we'll base this part on the Message-ID: so that
+ # all attachments for the same message end up in the same directory (we'll
+ # uniquify the filenames in that directory as needed). We use the first 2
+ # and last 2 bytes of the SHA1 hash of the message id as the basis of the
+ # directory name. Clashes here don't really matter too much, and that
+ # still gives us a 32-bit space to work with.
+ msgid = msg['message-id']
+ if msgid is None:
+ msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
+ # We assume that the message id actually /is/ unique!
+ digest = sha.new(msgid).hexdigest()
+ return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
+
+
+
def process(mlist, msg, msgdata=None):
sanitize = mm_cfg.ARCHIVE_HTML_SANITIZER
outer = 1
+ if msgdata is None:
+ msgdata = {}
+ dir = calculate_attachments_dir(mlist, msg, msgdata)
+ # Now walk over all subparts of this message and scrub out various types
for part in msg.walk():
ctype = part.get_type(part.get_default_type())
# If the part is text/plain, we leave it alone
@@ -89,7 +121,7 @@ def process(mlist, msg, msgdata=None):
# lists.
omask = os.umask(002)
try:
- url = save_attachment(mlist, part, filter_html=0)
+ url = save_attachment(mlist, part, dir, filter_html=0)
finally:
os.umask(omask)
part.set_payload(_("""\
@@ -111,7 +143,7 @@ URL: %(url)s
part.set_payload(payload)
omask = os.umask(002)
try:
- url = save_attachment(mlist, part, filter_html=0)
+ url = save_attachment(mlist, part, dir, filter_html=0)
finally:
os.umask(omask)
part.set_payload(_("""\
@@ -123,7 +155,7 @@ URL: %(url)s
submsg = part.get_payload(0)
omask = os.umask(002)
try:
- url = save_attachment(mlist, part)
+ url = save_attachment(mlist, part, dir)
finally:
os.umask(omask)
subject = submsg.get('subject', _('no subject'))
@@ -151,7 +183,7 @@ Url: %(url)s
size = len(payload)
omask = os.umask(002)
try:
- url = save_attachment(mlist, part)
+ url = save_attachment(mlist, part, dir)
finally:
os.umask(omask)
desc = part.get('content-description', _('not available'))
@@ -186,33 +218,23 @@ Url : %(url)s
-def save_attachment(mlist, msg, filter_html=1):
- # The directory to store the attachment in
- dir = os.path.join(mlist.archive_dir(), 'attachments')
+def makedirs(dir):
+ # Create all the directories to store this attachment in
try:
- os.mkdir(dir, 02775)
- except OSError, e:
- if e.errno <> errno.EEXIST: raise
- # We need a directory to contain this message's attachments. Base it
- # on the Message-ID: so that all attachments for the same message end
- # up in the same directory (we'll uniquify the filenames in that
- # directory as needed). We use the first 2 and last 2 bytes of the
- # SHA1 has of the message id as the basis of the directory name.
- # Clashes here don't really matter too much, and that still gives us a
- # 32-bit space to work with.
- msgid = msg['message-id']
- if msgid is None:
- msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
- # We assume that the message id actually /is/ unique!
- digest = sha.new(msgid).hexdigest()
- msgdir = digest[:4] + digest[-4:]
- try:
- path = os.path.join(dir, msgdir)
- os.mkdir(path, 02775)
- # For FreeBSD, which is broken wrt the mode arg of mkdir()
- os.chmod(path, 02775)
+ os.makedirs(dir, 02775)
except OSError, e:
if e.errno <> errno.EEXIST: raise
+ # Unfortunately, FreeBSD seems to be broken in that it doesn't honor the
+ # mode arg of mkdir().
+ def twiddle(arg, dirname, names):
+ os.chmod(dirname, 02775)
+ os.path.walk(dir, twiddle, None)
+
+
+
+def save_attachment(mlist, msg, dir, filter_html=1):
+ fsdir = os.path.join(mlist.archive_dir(), dir)
+ makedirs(fsdir)
# Figure out the attachment type and get the decoded data
decodedpayload = msg.get_payload(decode=1)
# BAW: mimetypes ought to handle non-standard, but commonly found types,
@@ -230,7 +252,7 @@ def save_attachment(mlist, msg, filter_html=1):
ext = '.bin'
path = None
# We need a lock to calculate the next attachment number
- lockfile = os.path.join(dir, msgdir, 'attachments.lock')
+ lockfile = os.path.join(fsdir, 'attachments.lock')
lock = LockFile.LockFile(lockfile)
lock.lock()
try:
@@ -238,7 +260,7 @@ def save_attachment(mlist, msg, filter_html=1):
# necessary.
filename = msg.get_filename()
if not filename:
- filename = 'attachment' + ext
+ filebase = 'attachment'
else:
# Sanitize the filename given in the message headers
parts = pre.split(filename)
@@ -246,16 +268,17 @@ def save_attachment(mlist, msg, filter_html=1):
# Allow only alphanumerics, dash, underscore, and dot
filename = sre.sub('', filename)
# If the filename's extension doesn't match the type we guessed,
- # which one should we go with? Not sure. Let's do this at least:
- # if the filename /has/ no extension, then tack on the one we
- # guessed.
- if not os.path.splitext(filename)[1]:
- filename += ext
- # BAW: Anything else we need to be worried about?
+ # which one should we go with? For now, let's go with the one we
+ # guessed so attachments can't lie about their type. Also, if the
+ # filename /has/ no extension, then tack on the one we guessed.
+ filebase, ignore = os.path.splitext(filename)
+ # Now we're looking for a unique name for this file on the file
+ # system. If msgdir/filebase.ext isn't unique, we'll add a counter
+ # after filebase, e.g. msgdir/filebase-cnt.ext
counter = 0
extra = ''
while 1:
- path = os.path.join(dir, msgdir, filename + extra)
+ path = os.path.join(fsdir, filebase + extra + ext)
# Generally it is not a good idea to test for file existance
# before just trying to create it, but the alternatives aren't
# wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
@@ -263,7 +286,7 @@ def save_attachment(mlist, msg, filter_html=1):
# guaranteed that no other process will be racing with us.
if os.path.exists(path):
counter += 1
- extra = '-%04d%s' % (counter, ext)
+ extra = '-%04d' % counter
else:
break
finally:
@@ -293,8 +316,8 @@ def save_attachment(mlist, msg, filter_html=1):
# BAW: Since we've now sanitized the document, it should be plain
# text. Blarg, we really want the sanitizer to tell us what the type
# if the return data is. :(
+ ext = '.txt'
path = base + '.txt'
- filename = os.path.splitext(filename)[0] + '.txt'
# Is it a message/rfc822 attachment?
elif msg.get_type() == 'message/rfc822':
submsg = msg.get_payload()
@@ -308,5 +331,5 @@ def save_attachment(mlist, msg, filter_html=1):
# Private archives will likely have a trailing slash. Normalize.
if baseurl[-1] <> '/':
baseurl += '/'
- url = baseurl + 'attachments/%s/%s' % (msgdir, filename)
+ url = baseurl + '%s/%s%s%s' % (dir, filebase, extra, ext)
return url