diff options
Diffstat (limited to 'src/mailman/pipeline/scrubber.py')
| -rw-r--r-- | src/mailman/pipeline/scrubber.py | 499 |
1 files changed, 0 insertions, 499 deletions
diff --git a/src/mailman/pipeline/scrubber.py b/src/mailman/pipeline/scrubber.py deleted file mode 100644 index 0584c0a2c..000000000 --- a/src/mailman/pipeline/scrubber.py +++ /dev/null @@ -1,499 +0,0 @@ -# Copyright (C) 2001-2012 by the Free Software Foundation, Inc. -# -# This file is part of GNU Mailman. -# -# GNU Mailman is free software: you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation, either version 3 of the License, or (at your option) -# any later version. -# -# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. -# -# You should have received a copy of the GNU General Public License along with -# GNU Mailman. If not, see <http://www.gnu.org/licenses/>. - -"""Cleanse a message for archiving.""" - -from __future__ import absolute_import, unicode_literals - -__metaclass__ = type -__all__ = [ - 'Scrubber', - ] - - -import os -import re -import time -import hashlib -import logging -import binascii - -from email.charset import Charset -from email.utils import make_msgid, parsedate -from flufl.lock import Lock -from lazr.config import as_boolean -from mimetypes import guess_all_extensions -from string import Template -from zope.interface import implements - -from mailman.config import config -from mailman.core.errors import DiscardMessage -from mailman.core.i18n import _ -from mailman.interfaces.handler import IHandler -from mailman.utilities.filesystem import makedirs -from mailman.utilities.modules import find_name -from mailman.utilities.string import oneline, websafe - - -# Path characters for common platforms -pre = re.compile(r'[/\\:]') -# All other characters to strip out of Content-Disposition: filenames -# (essentially anything that isn't an alphanum, dot, dash, or underscore). -sre = re.compile(r'[^-\w.]') -# Regexp to strip out leading dots -dre = re.compile(r'^\.*') - -BR = '<br>\n' -SPACE = ' ' - -log = logging.getLogger('mailman.error') - - - -def guess_extension(ctype, ext): - """Find the extension mapped to the given content-type. - - mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, and - .wiz are all mapped to application/msword. This sucks for finding the - best reverse mapping. If the extension is one of the giving mappings, - we'll trust that, otherwise we'll just guess. :/ - """ - all_extensions = guess_all_extensions(ctype, strict=False) - if ext in all_extensions: - return ext - return (all_extensions[0] if len(all_extensions) > 0 else None) - - - -def safe_strftime(fmt, t): - """A time.strftime() that eats exceptions, returning None instead.""" - try: - return time.strftime(fmt, t) - except (TypeError, ValueError, OverflowError): - return None - - -def calculate_attachments_dir(msg, msgdata): - """Calculate the directory for attachements. - - Calculate the directory that attachments for this message will go under. - To avoid inode limitations, the scheme will be: - archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files> - Start by calculating the date-based and msgid-hash components. - """ - fmt = '%Y%m%d' - datestr = msg.get('Date') - if datestr: - now = parsedate(datestr) - else: - now = time.gmtime(msgdata.get('received_time', time.time())) - datedir = safe_strftime(fmt, now) - if not datedir: - datestr = msgdata.get('X-List-Received-Date') - if datestr: - datedir = safe_strftime(fmt, datestr) - if not datedir: - # What next? Unixfrom, I guess. - parts = msg.get_unixfrom().split() - try: - month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, - 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12, - }.get(parts[3], 0) - day = int(parts[4]) - year = int(parts[6]) - except (IndexError, ValueError): - # Best we can do I think - month = day = year = 0 - datedir = '%04d%02d%02d' % (year, month, day) - assert datedir - # As for the msgid hash, we'll base this part on the Message-ID: so that - # all attachments for the same message end up in the same directory (we'll - # uniquify the filenames in that directory as needed). We use the first 2 - # and last 2 bytes of the SHA1 hash of the message id as the basis of the - # directory name. Clashes here don't really matter too much, and that - # still gives us a 32-bit space to work with. - msgid = msg['message-id'] - if msgid is None: - msgid = msg['Message-ID'] = make_msgid() - # We assume that the message id actually /is/ unique! - digest = hashlib.sha1(msgid).hexdigest() - return os.path.join('attachments', datedir, digest[:4] + digest[-4:]) - - -def replace_payload_by_text(msg, text, charset): - """Replace the payload of the message with some text.""" - # TK: This is a common function in replacing the attachment and the main - # message by a text (scrubbing). - del msg['content-type'] - del msg['content-transfer-encoding'] - if isinstance(text, unicode): - text = text.encode(charset) - if not isinstance(charset, str): - charset = str(charset) - msg.set_payload(text, charset) - - - -def process(mlist, msg, msgdata=None): - """Process the message through the scrubber.""" - sanitize = int(config.scrubber.archive_html_sanitizer) - outer = True - if msgdata is None: - msgdata = {} - if msgdata: - # msgdata is available if it is in GLOBAL_PIPELINE - # ie. not in digest or archiver - # check if the list owner want to scrub regular delivery - if not mlist.scrub_nondigest: - return - attachments_dir = calculate_attachments_dir(msg, msgdata) - charset = format_param = delsp = None - lcset = mlist.preferred_language.charset - lcset_out = Charset(lcset).output_charset or lcset - # Now walk over all subparts of this message and scrub out various types - for part in msg.walk(): - ctype = part.get_content_type() - # If the part is text/plain, we leave it alone - if ctype == 'text/plain': - # We need to choose a charset for the scrubbed message, so we'll - # arbitrarily pick the charset of the first text/plain part in the - # message. - # - # Also get the RFC 3676 stuff from this part. This seems to - # work okay for scrub_nondigest. It will also work as far as - # scrubbing messages for the archive is concerned, but Pipermail - # doesn't pay any attention to the RFC 3676 parameters. The plain - # format digest is going to be a disaster in any case as some of - # messages will be format="flowed" and some not. ToDigest creates - # its own Content-Type: header for the plain digest which won't - # have RFC 3676 parameters. If the message Content-Type: headers - # are retained for display in the digest, the parameters will be - # there for information, but not for the MUA. This is the best we - # can do without having get_payload() process the parameters. - if charset is None: - charset = part.get_content_charset(lcset) - format_param = part.get_param('format') - delsp = part.get_param('delsp') - # TK: if part is attached then check charset and scrub if none - if part.get('content-disposition') and \ - not part.get_content_charset(): - url = save_attachment(mlist, part, attachments_dir) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -An embedded and charset-unspecified text was scrubbed... -Name: $filename -URL: $url -"""), lcset) - elif ctype == 'text/html' and isinstance(sanitize, int): - if sanitize == 0: - if outer: - raise DiscardMessage - replace_payload_by_text(part, - _('HTML attachment scrubbed and removed'), - # Adding charset arg and removing content-type - # sets content-type to text/plain - lcset) - elif sanitize == 2: - # By leaving it alone, Pipermail will automatically escape it - pass - elif sanitize == 3: - # Pull it out as an attachment but leave it unescaped. This - # is dangerous, but perhaps useful for heavily moderated - # lists. - url = save_attachment(mlist, part, attachments_dir, - filter_html=False) - replace_payload_by_text(part, _("""\ -An HTML attachment was scrubbed... -URL: $url -"""), lcset) - else: - # HTML-escape it and store it as an attachment, but make it - # look a /little/ bit prettier. :( - payload = websafe(part.get_payload(decode=True)) - # For whitespace in the margin, change spaces into - # non-breaking spaces, and tabs into 8 of those. Then use a - # mono-space font. Still looks hideous to me, but then I'd - # just as soon discard them. - lines = [s.replace(' ', ' ').replace('\t', ' ' * 8) - for s in payload.split('\n')] - payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n' - part.set_payload(payload) - # We're replacing the payload with the decoded payload so this - # will just get in the way. - del part['content-transfer-encoding'] - url = save_attachment(mlist, part, attachments_dir, - filter_html=False) - replace_payload_by_text(part, _("""\ -An HTML attachment was scrubbed... -URL: $url -"""), lcset) - elif ctype == 'message/rfc822': - # This part contains a submessage, so it too needs scrubbing - submsg = part.get_payload(0) - url = save_attachment(mlist, part, attachments_dir) - subject = submsg.get('subject', _('no subject')) - date = submsg.get('date', _('no date')) - who = submsg.get('from', _('unknown sender')) - size = len(str(submsg)) - replace_payload_by_text(part, _("""\ -An embedded message was scrubbed... -From: $who -Subject: $subject -Date: $date -Size: $size -URL: $url -"""), lcset) - # If the message isn't a multipart, then we'll strip it out as an - # attachment that would have to be separately downloaded. Pipermail - # will transform the url into a hyperlink. - elif part._payload and not part.is_multipart(): - payload = part.get_payload(decode=True) - ctype = part.get_content_type() - # XXX Under email 2.5, it is possible that payload will be None. - # This can happen when you have a Content-Type: multipart/* with - # only one part and that part has two blank lines between the - # first boundary and the end boundary. In email 3.0 you end up - # with a string in the payload. I think in this case it's safe to - # ignore the part. - if payload is None: - continue - size = len(payload) - url = save_attachment(mlist, part, attachments_dir) - desc = part.get('content-description', _('not available')) - desc = oneline(desc, lcset) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -A non-text attachment was scrubbed... -Name: $filename -Type: $ctype -Size: $size bytes -Desc: $desc -URL: $url -"""), lcset) - outer = False - # We still have to sanitize multipart messages to flat text because - # Pipermail can't handle messages with list payloads. This is a kludge; - # def (n) clever hack ;). - if msg.is_multipart() and sanitize != 2: - # By default we take the charset of the first text/plain part in the - # message, but if there was none, we'll use the list's preferred - # language's charset. - if not charset or charset == 'us-ascii': - charset = lcset_out - else: - # normalize to the output charset if input/output are different - charset = Charset(charset).output_charset or charset - # We now want to concatenate all the parts which have been scrubbed to - # text/plain, into a single text/plain payload. We need to make sure - # all the characters in the concatenated string are in the same - # encoding, so we'll use the 'replace' key in the coercion call. - # BAW: Martin's original patch suggested we might want to try - # generalizing to utf-8, and that's probably a good idea (eventually). - text = [] - charsets = [] - for part in msg.walk(): - # TK: bug-id 1099138 and multipart - # MAS test payload - if part may fail if there are no headers. - if not part._payload or part.is_multipart(): - continue - # All parts should be scrubbed to text/plain by now. - partctype = part.get_content_type() - if partctype != 'text/plain': - text.append(_('Skipped content of type $partctype\n')) - continue - try: - t = part.get_payload(decode=True) or '' - # MAS: TypeError exception can occur if payload is None. This - # was observed with a message that contained an attached - # message/delivery-status part. Because of the special parsing - # of this type, this resulted in a text/plain sub-part with a - # null body. See bug 1430236. - except (binascii.Error, TypeError): - t = part.get_payload() or '' - # Email problem was solved by Mark Sapiro. (TK) - partcharset = part.get_content_charset('us-ascii') - try: - t = unicode(t, partcharset, 'replace') - except (UnicodeError, LookupError, ValueError, TypeError, - AssertionError): - # We can get here if partcharset is bogus in come way. - # Replace funny characters. We use errors='replace'. - t = unicode(t, 'ascii', 'replace') - # Separation is useful - if isinstance(t, basestring): - if not t.endswith('\n'): - t += '\n' - text.append(t) - if partcharset not in charsets: - charsets.append(partcharset) - # Now join the text and set the payload - sep = _('-------------- next part --------------\n') - assert isinstance(sep, unicode), ( - 'Expected a unicode separator, got %s' % type(sep)) - rept = sep.join(text) - # Replace entire message with text and scrubbed notice. - # Try with message charsets and utf-8 - if 'utf-8' not in charsets: - charsets.append('utf-8') - for charset in charsets: - try: - replace_payload_by_text(msg, rept, charset) - break - # Bogus charset can throw several exceptions - except (UnicodeError, LookupError, ValueError, TypeError, - AssertionError): - pass - if format_param: - msg.set_param('format', format_param) - if delsp: - msg.set_param('delsp', delsp) - return msg - - - -def save_attachment(mlist, msg, attachments_dir, filter_html=True): - fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR, - mlist.fqdn_listname, attachments_dir) - makedirs(fsdir) - # Figure out the attachment type and get the decoded data - decodedpayload = msg.get_payload(decode=True) - # BAW: mimetypes ought to handle non-standard, but commonly found types, - # e.g. image/jpg (should be image/jpeg). For now we just store such - # things as application/octet-streams since that seems the safest. - ctype = msg.get_content_type() - # i18n file name is encoded - lcset = mlist.preferred_language.charset - filename = oneline(msg.get_filename(''), lcset) - filename, fnext = os.path.splitext(filename) - # For safety, we should confirm this is valid ext for content-type - # but we can use fnext if we introduce fnext filtering - if as_boolean(config.scrubber.use_attachment_filename_extension): - # HTML message doesn't have filename :-( - ext = fnext or guess_extension(ctype, fnext) - else: - ext = guess_extension(ctype, fnext) - if not ext: - # We don't know what it is, so assume it's just a shapeless - # application/octet-stream, unless the Content-Type: is - # message/rfc822, in which case we know we'll coerce the type to - # text/plain below. - if ctype == 'message/rfc822': - ext = '.txt' - else: - ext = '.bin' - # Allow only alphanumerics, dash, underscore, and dot - ext = sre.sub('', ext) - path = None - # We need a lock to calculate the next attachment number - with Lock(os.path.join(fsdir, 'attachments.lock')): - # Now base the filename on what's in the attachment, uniquifying it if - # necessary. - if (not filename or - not as_boolean(config.scrubber.use_attachment_filename)): - filebase = 'attachment' - else: - # Sanitize the filename given in the message headers - parts = pre.split(filename) - filename = parts[-1] - # Strip off leading dots - filename = dre.sub('', filename) - # Allow only alphanumerics, dash, underscore, and dot - filename = sre.sub('', filename) - # If the filename's extension doesn't match the type we guessed, - # which one should we go with? For now, let's go with the one we - # guessed so attachments can't lie about their type. Also, if the - # filename /has/ no extension, then tack on the one we guessed. - # The extension was removed from the name above. - filebase = filename - # Now we're looking for a unique name for this file on the file - # system. If msgdir/filebase.ext isn't unique, we'll add a counter - # after filebase, e.g. msgdir/filebase-cnt.ext - counter = 0 - extra = '' - while True: - path = os.path.join(fsdir, filebase + extra + ext) - # Generally it is not a good idea to test for file existance - # before just trying to create it, but the alternatives aren't - # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't - # NFS-safe). Besides, we have an exclusive lock now, so we're - # guaranteed that no other process will be racing with us. - if os.path.exists(path): - counter += 1 - extra = '-%04d' % counter - else: - break - # `path' now contains the unique filename for the attachment. There's - # just one more step we need to do. If the part is text/html and - # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be - # here), then send the attachment through the filter program for - # sanitization - if filter_html and ctype == 'text/html': - base, ext = os.path.splitext(path) - tmppath = base + '-tmp' + ext - fp = open(tmppath, 'w') - try: - fp.write(decodedpayload) - fp.close() - cmd = Template(config.mta.archive_html_sanitizer).safe_substitue( - filename=tmppath) - progfp = os.popen(cmd, 'r') - decodedpayload = progfp.read() - status = progfp.close() - if status: - log.error('HTML sanitizer exited with non-zero status: %s', - status) - finally: - os.unlink(tmppath) - # BAW: Since we've now sanitized the document, it should be plain - # text. Blarg, we really want the sanitizer to tell us what the type - # if the return data is. :( - ext = '.txt' - path = base + '.txt' - # Is it a message/rfc822 attachment? - elif ctype == 'message/rfc822': - submsg = msg.get_payload() - # BAW: I'm sure we can eventually do better than this. :( - decodedpayload = websafe(str(submsg)) - fp = open(path, 'w') - fp.write(decodedpayload) - fp.close() - # Now calculate the url to the list's archive. - scrubber_path = config.scrubber.archive_scrubber - base_url = find_name(scrubber_path).list_url(mlist) - if not base_url.endswith('/'): - base_url += '/' - # Trailing space will definitely be a problem with format=flowed. - # Bracket the URL instead. - url = '<' + base_url + '%s/%s%s%s>' % ( - attachments_dir, filebase, extra, ext) - return url - - - -class Scrubber: - """Cleanse a message for archiving.""" - - implements(IHandler) - - name = 'scrubber' - description = _('Cleanse a message for archiving.') - - def process(self, mlist, msg, msgdata): - """See `IHandler`.""" - process(mlist, msg, msgdata) |
