summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Mailman/Handlers/Scrubber.py153
1 files changed, 111 insertions, 42 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 911bee7e0..24a3c2b48 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -18,24 +18,30 @@
"""
import os
-import cgi
+import re
+import sha
import errno
-import cPickle
import mimetypes
+import tempfile
from cStringIO import StringIO
+from types import StringType
-import email
-import email.Errors
from email.Parser import HeaderParser
from email.Generator import Generator
+from Mailman import mm_cfg
+from Mailman import Utils
from Mailman import LockFile
from Mailman import Message
from Mailman.Errors import DiscardMessage
from Mailman.i18n import _
from Mailman.Logging.Syslog import syslog
-ARCHIVE_FILE_VERSION = 1
+# Path characters for common platforms
+pre = re.compile(r'[/\\:]')
+# All other characters to strip out of Content-Disposition: filenames
+# (essentially anything that isn't an alphanum, dot, dash, or underscore).
+sre = re.compile(r'[^-\w.]')
@@ -65,14 +71,15 @@ def process(mlist, msg, msgdata=None):
# If the part is text/plain, we leave it alone
if part.get_type('text/plain') == 'text/plain':
pass
- # I think it's generally a good idea to scrub out HTML. You never
- # know what's in there -- web bugs, JavaScript nasties, etc. If the
- # whole message is HTML, just discard the entire thing. Otherwise,
- # just add an indication that the HTML part was removed.
- elif part.get_type() == 'text/html':
- if outer:
- raise DiscardMessage
- part.set_payload(_("An HTML attachment was scrubbed and removed"))
+ elif part.get_type() == 'text/html' and \
+ not isinstance(mm_cfg.ARCHIVE_HTML_SANITIZER, StringType):
+ if mm_cfg.ARCHIVE_HTML_SANITIZER == 0:
+ if outer:
+ raise DiscardMessage
+ part.set_payload(_('HTML attachment scrubbed and removed'))
+ else:
+ # By leaving it alone, Pipermail will automatically escape it
+ pass
# If the message isn't a multipart, then we'll strip it out as an
# attachment that would have to be separately downloaded. Pipermail
# will transform the url into a hyperlink.
@@ -86,8 +93,10 @@ def process(mlist, msg, msgdata=None):
finally:
os.umask(omask)
desc = part.get('content-description', _('not available'))
+ filename = part.get_filename(_('not available'))
part.set_payload(_("""\
A non-text attachment was scrubbed...
+Name: %(filename)s
Type: %(ctype)s
Size: %(size)d bytes
Desc: %(desc)s
@@ -118,34 +127,27 @@ Url : %(url)s
def save_attachment(mlist, msg):
# The directory to store the attachment in
dir = os.path.join(mlist.archive_dir(), 'attachments')
- # We need a lock to calculate the next attachment number
- lock = LockFile.LockFile(os.path.join(mlist.archive_dir(),
- 'attachments.lock'))
- lock.lock()
try:
- try:
- os.mkdir(dir, 02775)
- except OSError, e:
- if e.errno <> errno.EEXIST: raise
- # Open the attachments database file
- dbfile = os.path.join(dir, 'attachments.pck')
- try:
- fp = open(dbfile)
- d = cPickle.load(fp)
- fp.close()
- except IOError, e:
- if e.errno <> errno.ENOENT: raise
- d = {'version': ARCHIVE_FILE_VERSION,
- 'next' : 1,
- }
- # Calculate the attachment file name
- file = 'attachment-%04d' % d['next']
- d['next'] += 1
- fp = open(dbfile, 'w')
- cPickle.dump(d, fp, 1)
- fp.close()
- finally:
- lock.unlock()
+ os.mkdir(dir, 02775)
+ except OSError, e:
+ if e.errno <> errno.EEXIST: raise
+ # We need a directory to contain this message's attachments. Base it
+ # on the Message-ID: so that all attachments for the same message end
+ # up in the same directory (we'll uniquify the filenames in that
+ # directory as needed). We use the first 2 and last 2 bytes of the
+ # SHA1 hash of the message id as the basis of the directory name.
+ # Clashes here don't really matter too much, and that still gives us a
+ # 32-bit space to work with.
+ msgid = msg['message-id']
+ if msgid is None:
+ msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
+ # We assume that the message id actually /is/ unique!
+ digest = sha.new(msgid).hexdigest()
+ msgdir = digest[:4] + digest[-4:]
+ try:
+ os.mkdir(os.path.join(dir, msgdir), 02775)
+ except OSError, e:
+ if e.errno <> errno.EEXIST: raise
# Figure out the attachment type and get the decoded data
decodedpayload = msg.get_payload(decode=1)
# BAW: mimetypes ought to handle non-standard, but commonly found types,
@@ -156,9 +158,76 @@ def save_attachment(mlist, msg):
# We don't know what it is, so assume it's just a shapeless
# application/octet-stream
ext = '.bin'
- fp = open(os.path.join(dir, file + ext), 'w')
+ path = None
+ # We need a lock to calculate the next attachment number
+ lockfile = os.path.join(dir, msgdir, 'attachments.lock')
+ lock = LockFile.LockFile(lockfile)
+ lock.lock()
+ try:
+ # Now base the filename on what's in the attachment, uniquifying it if
+ # necessary.
+ filename = msg.get_filename()
+ if not filename:
+ filename = 'attachment' + ext
+ else:
+ # Sanitize the filename given in the message headers
+ parts = pre.split(filename)
+ filename = parts[-1]
+ # Allow only alphanumerics, dash, underscore, and dot
+ filename = sre.sub('', filename)
+ # If the filename's extension doesn't match the type we guessed,
+ # which one should we go with? Not sure. Let's do this at least:
+ # if the filename /has/ no extension, then tack on the one we
+ # guessed.
+ if not os.path.splitext(filename)[1]:
+ filename += ext
+ # BAW: Anything else we need to be worried about?
+ counter = 0
+ extra = ''
+ while 1:
+ path = os.path.join(dir, msgdir, filename + extra)
+ # Generally it is not a good idea to test for file existence
+ # before just trying to create it, but the alternatives aren't
+ # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
+ # NFS-safe). Besides, we have an exclusive lock now, so we're
+ # guaranteed that no other process will be racing with us.
+ if os.path.exists(path):
+ counter += 1
+ extra = '-%04d%s' % (counter, ext)
+ else:
+ break
+ finally:
+ lock.unlock()
+ # `path' now contains the unique filename for the attachment. There's
+ # just one more step we need to do. If the part is text/html and
+ # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
+ # here), then send the attachment through the filter program for
+ # sanitization
+ if msg.get_type() == 'text/html':
+ base, ext = os.path.splitext(path)
+ tmppath = base + '-tmp' + ext
+ fp = open(tmppath, 'w')
+ try:
+ fp.write(decodedpayload)
+ fp.close()
+ cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
+ progfp = os.popen(cmd, 'r')
+ decodedpayload = progfp.read()
+ status = progfp.close()
+ if status:
+ syslog('error',
+ 'HTML sanitizer exited with non-zero status: %s',
+ status)
+ finally:
+ os.unlink(tmppath)
+ # BAW: Since we've now sanitized the document, it should be plain
+ # text. Blarg, we really want the sanitizer to tell us what the type
+ # of the return data is. :(
+ path = base + '.txt'
+ filename = os.path.splitext(filename)[0] + '.txt'
+ fp = open(path, 'w')
fp.write(decodedpayload)
fp.close()
# Now calculate the url
- url = mlist.GetBaseArchiveURL() + '/attachments/' + file + ext
+ url = mlist.GetBaseArchiveURL() + '/attachments/%s/%s' % (msgdir, filename)
return url