diff options
| author | Barry Warsaw | 2012-03-16 15:58:17 -0700 |
|---|---|---|
| committer | Barry Warsaw | 2012-03-16 15:58:17 -0700 |
| commit | 24991d17919f2715a7f2e875d2fb7fe72e53efcf (patch) | |
| tree | 80624655bf271f103f76377d9b09ce2ff0562dbb /src/mailman/pipeline | |
| parent | 44ec37f890c7d4d35504d8f2e56c01abe8c60940 (diff) | |
| download | mailman-24991d17919f2715a7f2e875d2fb7fe72e53efcf.tar.gz mailman-24991d17919f2715a7f2e875d2fb7fe72e53efcf.tar.zst mailman-24991d17919f2715a7f2e875d2fb7fe72e53efcf.zip | |
* Pipermail has been eradicated.
* Configuration variable `[mailman]filtered_messages_are_preservable`
controls whether messages which have their top-level `Content-Type`
filtered out can be preserved in the `bad` queue by list owners.
* Configuration section `[scrubber]` has been removed, as has the scrubber handler.
This handler was essentially incompatible with Mailman 3 since it required
coordination with Pipermail to store attachments on disk.
* Schema additions:
- mailinglist.filter_action
Diffstat (limited to 'src/mailman/pipeline')
| -rw-r--r-- | src/mailman/pipeline/docs/filtering.rst | 2 | ||||
| -rw-r--r-- | src/mailman/pipeline/docs/scrubber.rst | 230 | ||||
| -rw-r--r-- | src/mailman/pipeline/mime_delete.py | 71 | ||||
| -rw-r--r-- | src/mailman/pipeline/scrubber.py | 502 | ||||
| -rw-r--r-- | src/mailman/pipeline/tests/test_mimedel.py | 213 | ||||
| -rw-r--r-- | src/mailman/pipeline/tests/test_scrubber.py | 45 |
6 files changed, 258 insertions, 805 deletions
diff --git a/src/mailman/pipeline/docs/filtering.rst b/src/mailman/pipeline/docs/filtering.rst index 5b54424e4..fd0b33d3b 100644 --- a/src/mailman/pipeline/docs/filtering.rst +++ b/src/mailman/pipeline/docs/filtering.rst @@ -45,7 +45,7 @@ content type matches the filter, the entire message will be discarded. >>> process(mlist, msg, {}) Traceback (most recent call last): ... - DiscardMessage + DiscardMessage: The message's content type was explicitly disallowed However, if we turn off content filtering altogether, then the handler short-circuits. diff --git a/src/mailman/pipeline/docs/scrubber.rst b/src/mailman/pipeline/docs/scrubber.rst deleted file mode 100644 index 86a8161a7..000000000 --- a/src/mailman/pipeline/docs/scrubber.rst +++ /dev/null @@ -1,230 +0,0 @@ -============ -The scrubber -============ - -The scrubber is an integral part of Mailman, both in the normal delivery of -messages and in components such as the archiver. Its primary purpose is to -scrub attachments from messages so that binary goop doesn't end up in an -archive message. - - >>> mlist = create_list('_xtest@example.com') - >>> mlist.preferred_language = 'en' - -Helper functions for getting the attachment data. -:: - - >>> import os, re - >>> def read_attachment(filename, remove=True): - ... path = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR, - ... mlist.fqdn_listname, filename) - ... fp = open(path) - ... try: - ... data = fp.read() - ... finally: - ... fp.close() - ... if remove: - ... os.unlink(path) - ... return data - - >>> from urlparse import urlparse - >>> def read_url_from_message(msg): - ... url = None - ... for line in msg.get_payload().splitlines(): - ... mo = re.match('URL: <(?P<url>[^>]+)>', line) - ... if mo: - ... url = mo.group('url') - ... break - ... path = '/'.join(urlparse(url).path.split('/')[3:]) - ... 
return read_attachment(path) - - -Saving attachments -================== - -The Scrubber handler exposes a function called ``save_attachment()`` which can -be used to strip various types of attachments and store them in the archive -directory. This is a public interface used by components outside the normal -processing pipeline. - -Site administrators can decide whether the scrubber should use the attachment -filename suggested in the message's ``Content-Disposition:`` header or not. -If enabled, the filename will be used when this header attribute is present -(yes, this is an unfortunate double negative). -:: - - >>> config.push('test config', """ - ... [scrubber] - ... use_attachment_filename: yes - ... """) - >>> msg = message_from_string("""\ - ... Content-Type: image/gif; name="xtest.gif" - ... Content-Transfer-Encoding: base64 - ... Content-Disposition: attachment; filename="xtest.gif" - ... - ... R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw== - ... """) - - >>> from mailman.pipeline.scrubber import save_attachment - >>> print save_attachment(mlist, msg, 'dir') - <http://www.example.com/pipermail/_xtest@example.com/dir/xtest.gif> - >>> data = read_attachment('dir/xtest.gif') - >>> print data[:6] - GIF87a - >>> len(data) - 34 - -Saving the attachment does not alter the original message. - - >>> print msg.as_string() - Content-Type: image/gif; name="xtest.gif" - Content-Transfer-Encoding: base64 - Content-Disposition: attachment; filename="xtest.gif" - <BLANKLINE> - R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw== - -The site administrator can also configure Mailman to ignore the -``Content-Disposition:`` filename. This is the default. - - >>> config.pop('test config') - >>> config.push('test config', """ - ... [scrubber] - ... use_attachment_filename: no - ... """) - >>> msg = message_from_string("""\ - ... Content-Type: image/gif; name="xtest.gif" - ... Content-Transfer-Encoding: base64 - ... Content-Disposition: attachment; filename="xtest.gif" - ... - ... 
R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw== - ... """) - >>> print save_attachment(mlist, msg, 'dir') - <http://www.example.com/pipermail/_xtest@example.com/dir/attachment.gif> - >>> data = read_attachment('dir/xtest.gif') - Traceback (most recent call last): - IOError: [Errno ...] No such file or directory: - u'.../archives/private/_xtest@example.com/dir/xtest.gif' - >>> data = read_attachment('dir/attachment.gif') - >>> print data[:6] - GIF87a - >>> len(data) - 34 - - -Scrubbing image attachments -=========================== - -When scrubbing image attachments, the original message is modified to include -a reference to the attachment file as available through the on-line archive. - - >>> msg = message_from_string("""\ - ... MIME-Version: 1.0 - ... Content-Type: multipart/mixed; boundary="BOUNDARY" - ... - ... --BOUNDARY - ... Content-type: text/plain; charset=us-ascii - ... - ... This is a message. - ... --BOUNDARY - ... Content-Type: image/gif; name="xtest.gif" - ... Content-Transfer-Encoding: base64 - ... Content-Disposition: attachment; filename="xtest.gif" - ... - ... R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw== - ... --BOUNDARY-- - ... """) - >>> msgdata = {} - -The ``Scrubber.process()`` function is different than other handler process -functions in that it returns the scrubbed message. - - >>> from mailman.pipeline.scrubber import process - >>> scrubbed_msg = process(mlist, msg, msgdata) - >>> scrubbed_msg is msg - True - >>> print scrubbed_msg.as_string() - MIME-Version: 1.0 - Message-ID: ... - Content-Type: text/plain; charset="us-ascii" - Content-Transfer-Encoding: 7bit - <BLANKLINE> - This is a message. - -------------- next part -------------- - A non-text attachment was scrubbed... - Name: xtest.gif - Type: image/gif - Size: 34 bytes - Desc: not available - URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.gif> - <BLANKLINE> - -This is the same as the transformed message originally passed in. 
- - >>> print msg.as_string() - MIME-Version: 1.0 - Message-ID: ... - Content-Type: text/plain; charset="us-ascii" - Content-Transfer-Encoding: 7bit - <BLANKLINE> - This is a message. - -------------- next part -------------- - A non-text attachment was scrubbed... - Name: xtest.gif - Type: image/gif - Size: 34 bytes - Desc: not available - URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.gif> - <BLANKLINE> - >>> msgdata - {} - -The URL will point to the attachment sitting in the archive. - - >>> data = read_url_from_message(msg) - >>> data[:6] - 'GIF87a' - >>> len(data) - 34 - - -Scrubbing text attachments -========================== - -Similar to image attachments, text attachments will also be scrubbed, but the -placeholder will be slightly different. - - >>> msg = message_from_string("""\ - ... MIME-Version: 1.0 - ... Content-Type: multipart/mixed; boundary="BOUNDARY" - ... - ... --BOUNDARY - ... Content-type: text/plain; charset=us-ascii; format=flowed; delsp=no - ... - ... This is a message. - ... --BOUNDARY - ... Content-type: text/plain; name="xtext.txt" - ... Content-Disposition: attachment; filename="xtext.txt" - ... - ... This is a text attachment. - ... --BOUNDARY-- - ... """) - >>> scrubbed_msg = process(mlist, msg, {}) - >>> print scrubbed_msg.as_string() - MIME-Version: 1.0 - Message-ID: ... - Content-Transfer-Encoding: 7bit - Content-Type: text/plain; charset="us-ascii"; format="flowed"; delsp="no" - <BLANKLINE> - This is a message. - -------------- next part -------------- - An embedded and charset-unspecified text was scrubbed... - Name: xtext.txt - URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.txt> - <BLANKLINE> - >>> read_url_from_message(msg) - 'This is a text attachment.' 
- - -Clean up -======== - - >>> config.pop('test config') diff --git a/src/mailman/pipeline/mime_delete.py b/src/mailman/pipeline/mime_delete.py index 402d13714..99fdc3ede 100644 --- a/src/mailman/pipeline/mime_delete.py +++ b/src/mailman/pipeline/mime_delete.py @@ -37,14 +37,18 @@ import errno import logging import tempfile -from email.Iterators import typed_subpart_iterator +from email.iterators import typed_subpart_iterator +from email.mime.message import MIMEMessage +from email.mime.text import MIMEText +from lazr.config import as_boolean from os.path import splitext from zope.interface import implements from mailman.config import config from mailman.core import errors from mailman.core.i18n import _ -from mailman.core.switchboard import Switchboard +from mailman.email.message import OwnerNotification +from mailman.interfaces.action import FilterAction from mailman.interfaces.handler import IHandler from mailman.utilities.string import oneline from mailman.version import VERSION @@ -54,6 +58,44 @@ log = logging.getLogger('mailman.error') +def dispose(mlist, msg, msgdata, why): + if mlist.filter_action is FilterAction.reject: + # Bounce the message to the original author. + raise errors.RejectMessage(why) + elif mlist.filter_action is FilterAction.forward: + # Forward it on to the list moderators. + # FIXME 2012-03-16 BAW: Trunk uses .display_name + text=_("""\ +The attached message matched the $mlist.real_name mailing list's content +filtering rules and was prevented from being forwarded on to the list +membership. You are receiving the only remaining copy of the discarded +message. + +""") + subject=_('Content filter message notification') + notice = OwnerNotification(mlist, subject, roster=mlist.moderators) + notice.set_type('multipart/mixed') + notice.attach(MIMEText(text)) + notice.attach(MIMEMessage(msg)) + notice.send(mlist) + # Let this fall through so the original message gets discarded. 
+ elif mlist.filter_action is FilterAction.preserve: + if as_boolean(config.mailman.filtered_messages_are_preservable): + # This is just like discarding the message except that a copy is + # placed in the 'bad' queue should the site administrator want to + # inspect the message. + filebase = config.switchboards['bad'].enqueue(msg, msgdata) + log.info('{0} preserved in file base {1}'.format( + msg.get('message-id', 'n/a'), filebase)) + else: + log.error( + '{1} invalid FilterAction: {0}. Treating as discard'.format( + mlist.fqdn_listname, mlist.filter_action.name)) + # Most cases also discard the message + raise errors.DiscardMessage(why) + + + def process(mlist, msg, msgdata): # We also don't care about our own digests or plaintext ctype = msg.get_content_type() @@ -227,31 +269,6 @@ def to_plaintext(msg): -def dispose(mlist, msg, msgdata, why): - # filter_action == 0 just discards, see below - if mlist.filter_action == 1: - # Bounce the message to the original author - raise errors.RejectMessage, why - if mlist.filter_action == 2: - # Forward it on to the list owner - listname = mlist.internal_name() - mlist.ForwardMessage( - msg, - text=_("""\ -The attached message matched the $listname mailing list's content filtering -rules and was prevented from being forwarded on to the list membership. You -are receiving the only remaining copy of the discarded message. - -"""), - subject=_('Content filtered message notification')) - if mlist.filter_action == 3 and \ - config.OWNERS_CAN_PRESERVE_FILTERED_MESSAGES: - badq = Switchboard('bad', config.BADQUEUE_DIR) - badq.enqueue(msg, msgdata) - # Most cases also discard the message - raise errors.DiscardMessage - - def get_file_ext(m): """ Get filename extension. 
Caution: some virus don't put filename diff --git a/src/mailman/pipeline/scrubber.py b/src/mailman/pipeline/scrubber.py deleted file mode 100644 index 76d10427e..000000000 --- a/src/mailman/pipeline/scrubber.py +++ /dev/null @@ -1,502 +0,0 @@ -# Copyright (C) 2001-2012 by the Free Software Foundation, Inc. -# -# This file is part of GNU Mailman. -# -# GNU Mailman is free software: you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation, either version 3 of the License, or (at your option) -# any later version. -# -# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. -# -# You should have received a copy of the GNU General Public License along with -# GNU Mailman. If not, see <http://www.gnu.org/licenses/>. - -"""Cleanse a message for archiving.""" - -from __future__ import absolute_import, unicode_literals - -__metaclass__ = type -__all__ = [ - 'Scrubber', - ] - - -import os -import re -import time -import hashlib -import logging -import binascii - -from email.charset import Charset -from email.utils import make_msgid, parsedate -from flufl.lock import Lock -from lazr.config import as_boolean -from mimetypes import guess_all_extensions -from string import Template -from zope.interface import implements - -from mailman.config import config -from mailman.core.errors import DiscardMessage -from mailman.core.i18n import _ -from mailman.interfaces.handler import IHandler -from mailman.utilities.filesystem import makedirs -from mailman.utilities.modules import find_name -from mailman.utilities.string import oneline, websafe - - -# Path characters for common platforms -pre = re.compile(r'[/\\:]') -# All other characters to strip out of Content-Disposition: filenames -# (essentially anything that isn't 
an alphanum, dot, dash, or underscore). -sre = re.compile(r'[^-\w.]') -# Regexp to strip out leading dots -dre = re.compile(r'^\.*') - -BR = '<br>\n' -SPACE = ' ' - -log = logging.getLogger('mailman.error') - - - -def guess_extension(ctype, ext): - """Find the extension mapped to the given content-type. - - mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, and - .wiz are all mapped to application/msword. This sucks for finding the - best reverse mapping. If the extension is one of the giving mappings, - we'll trust that, otherwise we'll just guess. :/ - """ - all_extensions = guess_all_extensions(ctype, strict=False) - if ext in all_extensions: - return ext - return (all_extensions[0] if len(all_extensions) > 0 else None) - - - -def safe_strftime(fmt, t): - """A time.strftime() that eats exceptions, returning None instead.""" - try: - return time.strftime(fmt, t) - except (TypeError, ValueError, OverflowError): - return None - - -def calculate_attachments_dir(msg, msgdata): - """Calculate the directory for attachements. - - Calculate the directory that attachments for this message will go under. - To avoid inode limitations, the scheme will be: - archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files> - Start by calculating the date-based and msgid-hash components. - """ - fmt = '%Y%m%d' - datestr = msg.get('Date') - if datestr: - now = parsedate(datestr) - else: - now = time.gmtime(msgdata.get('received_time', time.time())) - datedir = safe_strftime(fmt, now) - if not datedir: - datestr = msgdata.get('X-List-Received-Date') - if datestr: - datedir = safe_strftime(fmt, datestr) - if not datedir: - # What next? Unixfrom, I guess. 
- parts = msg.get_unixfrom().split() - try: - month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, - 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12, - }.get(parts[3], 0) - day = int(parts[4]) - year = int(parts[6]) - except (IndexError, ValueError): - # Best we can do I think - month = day = year = 0 - datedir = '%04d%02d%02d' % (year, month, day) - assert datedir - # As for the msgid hash, we'll base this part on the Message-ID: so that - # all attachments for the same message end up in the same directory (we'll - # uniquify the filenames in that directory as needed). We use the first 2 - # and last 2 bytes of the SHA1 hash of the message id as the basis of the - # directory name. Clashes here don't really matter too much, and that - # still gives us a 32-bit space to work with. - msgid = msg['message-id'] - if msgid is None: - msgid = msg['Message-ID'] = make_msgid() - # We assume that the message id actually /is/ unique! - digest = hashlib.sha1(msgid).hexdigest() - return os.path.join('attachments', datedir, digest[:4] + digest[-4:]) - - -def replace_payload_by_text(msg, text, charset): - """Replace the payload of the message with some text.""" - # TK: This is a common function in replacing the attachment and the main - # message by a text (scrubbing). - del msg['content-type'] - del msg['content-transfer-encoding'] - if isinstance(text, unicode): - text = text.encode(charset) - if not isinstance(charset, str): - charset = str(charset) - msg.set_payload(text, charset) - - - -def process(mlist, msg, msgdata=None): - """Process the message through the scrubber.""" - sanitize = int(config.scrubber.archive_html_sanitizer) - outer = True - if msgdata is None: - msgdata = {} - if msgdata: - # msgdata is available if it is in GLOBAL_PIPELINE - # ie. 
not in digest or archiver - # check if the list owner want to scrub regular delivery - if not mlist.scrub_nondigest: - return - attachments_dir = calculate_attachments_dir(msg, msgdata) - charset = format_param = delsp = None - lcset = mlist.preferred_language.charset - lcset_out = Charset(lcset).output_charset or lcset - # Now walk over all subparts of this message and scrub out various types - for part in msg.walk(): - ctype = part.get_content_type() - # If the part is text/plain, we leave it alone - if ctype == 'text/plain': - # We need to choose a charset for the scrubbed message, so we'll - # arbitrarily pick the charset of the first text/plain part in the - # message. - # - # Also get the RFC 3676 stuff from this part. This seems to - # work okay for scrub_nondigest. It will also work as far as - # scrubbing messages for the archive is concerned. The plain - # format digest is going to be a disaster in any case as some of - # messages will be format="flowed" and some not. ToDigest creates - # its own Content-Type: header for the plain digest which won't - # have RFC 3676 parameters. If the message Content-Type: headers - # are retained for display in the digest, the parameters will be - # there for information, but not for the MUA. This is the best we - # can do without having get_payload() process the parameters. - if charset is None: - charset = part.get_content_charset(lcset) - format_param = part.get_param('format') - delsp = part.get_param('delsp') - # TK: if part is attached then check charset and scrub if none - if part.get('content-disposition') and \ - not part.get_content_charset(): - url = save_attachment(mlist, part, attachments_dir) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -An embedded and charset-unspecified text was scrubbed... 
-Name: $filename -URL: $url -"""), lcset) - elif ctype == 'text/html' and isinstance(sanitize, int): - if sanitize == 0: - if outer: - raise DiscardMessage - replace_payload_by_text(part, - _('HTML attachment scrubbed and removed'), - # Adding charset arg and removing content-type - # sets content-type to text/plain - lcset) - elif sanitize == 2: - # By leaving it alone, Pipermail will automatically escape it. - # XXX 2012-03-13 BAW: Now that Pipermail has been removed, do - # we even need this? - pass - elif sanitize == 3: - # Pull it out as an attachment but leave it unescaped. This - # is dangerous, but perhaps useful for heavily moderated - # lists. - url = save_attachment(mlist, part, attachments_dir, - filter_html=False) - replace_payload_by_text(part, _("""\ -An HTML attachment was scrubbed... -URL: $url -"""), lcset) - else: - # HTML-escape it and store it as an attachment, but make it - # look a /little/ bit prettier. :( - payload = websafe(part.get_payload(decode=True)) - # For whitespace in the margin, change spaces into - # non-breaking spaces, and tabs into 8 of those. Then use a - # mono-space font. Still looks hideous to me, but then I'd - # just as soon discard them. - lines = [s.replace(' ', ' ').replace('\t', ' ' * 8) - for s in payload.split('\n')] - payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n' - part.set_payload(payload) - # We're replacing the payload with the decoded payload so this - # will just get in the way. - del part['content-transfer-encoding'] - url = save_attachment(mlist, part, attachments_dir, - filter_html=False) - replace_payload_by_text(part, _("""\ -An HTML attachment was scrubbed... 
-URL: $url -"""), lcset) - elif ctype == 'message/rfc822': - # This part contains a submessage, so it too needs scrubbing - submsg = part.get_payload(0) - url = save_attachment(mlist, part, attachments_dir) - subject = submsg.get('subject', _('no subject')) - date = submsg.get('date', _('no date')) - who = submsg.get('from', _('unknown sender')) - size = len(str(submsg)) - replace_payload_by_text(part, _("""\ -An embedded message was scrubbed... -From: $who -Subject: $subject -Date: $date -Size: $size -URL: $url -"""), lcset) - # If the message isn't a multipart, then we'll strip it out as an - # attachment that would have to be separately downloaded. - elif part._payload and not part.is_multipart(): - payload = part.get_payload(decode=True) - ctype = part.get_content_type() - # XXX Under email 2.5, it is possible that payload will be None. - # This can happen when you have a Content-Type: multipart/* with - # only one part and that part has two blank lines between the - # first boundary and the end boundary. In email 3.0 you end up - # with a string in the payload. I think in this case it's safe to - # ignore the part. - if payload is None: - continue - size = len(payload) - url = save_attachment(mlist, part, attachments_dir) - desc = part.get('content-description', _('not available')) - desc = oneline(desc, lcset) - filename = part.get_filename(_('not available')) - filename = oneline(filename, lcset) - replace_payload_by_text(part, _("""\ -A non-text attachment was scrubbed... -Name: $filename -Type: $ctype -Size: $size bytes -Desc: $desc -URL: $url -"""), lcset) - outer = False - # We still have to sanitize multipart messages to flat text because - # Pipermail can't handle messages with list payloads. This is a kludge; - # def (n) clever hack ;). - # - # XXX 2012-03-13 BAW: Now that Pipermail has been removed, do we even need - # this code? 
- if msg.is_multipart() and sanitize != 2: - # By default we take the charset of the first text/plain part in the - # message, but if there was none, we'll use the list's preferred - # language's charset. - if not charset or charset == 'us-ascii': - charset = lcset_out - else: - # normalize to the output charset if input/output are different - charset = Charset(charset).output_charset or charset - # We now want to concatenate all the parts which have been scrubbed to - # text/plain, into a single text/plain payload. We need to make sure - # all the characters in the concatenated string are in the same - # encoding, so we'll use the 'replace' key in the coercion call. - # BAW: Martin's original patch suggested we might want to try - # generalizing to utf-8, and that's probably a good idea (eventually). - text = [] - charsets = [] - for part in msg.walk(): - # TK: bug-id 1099138 and multipart - # MAS test payload - if part may fail if there are no headers. - if not part._payload or part.is_multipart(): - continue - # All parts should be scrubbed to text/plain by now. - partctype = part.get_content_type() - if partctype != 'text/plain': - text.append(_('Skipped content of type $partctype\n')) - continue - try: - t = part.get_payload(decode=True) or '' - # MAS: TypeError exception can occur if payload is None. This - # was observed with a message that contained an attached - # message/delivery-status part. Because of the special parsing - # of this type, this resulted in a text/plain sub-part with a - # null body. See bug 1430236. - except (binascii.Error, TypeError): - t = part.get_payload() or '' - # Email problem was solved by Mark Sapiro. (TK) - partcharset = part.get_content_charset('us-ascii') - try: - t = unicode(t, partcharset, 'replace') - except (UnicodeError, LookupError, ValueError, TypeError, - AssertionError): - # We can get here if partcharset is bogus in come way. - # Replace funny characters. We use errors='replace'. 
- t = unicode(t, 'ascii', 'replace') - # Separation is useful - if isinstance(t, basestring): - if not t.endswith('\n'): - t += '\n' - text.append(t) - if partcharset not in charsets: - charsets.append(partcharset) - # Now join the text and set the payload - sep = _('-------------- next part --------------\n') - assert isinstance(sep, unicode), ( - 'Expected a unicode separator, got %s' % type(sep)) - rept = sep.join(text) - # Replace entire message with text and scrubbed notice. - # Try with message charsets and utf-8 - if 'utf-8' not in charsets: - charsets.append('utf-8') - for charset in charsets: - try: - replace_payload_by_text(msg, rept, charset) - break - # Bogus charset can throw several exceptions - except (UnicodeError, LookupError, ValueError, TypeError, - AssertionError): - pass - if format_param: - msg.set_param('format', format_param) - if delsp: - msg.set_param('delsp', delsp) - return msg - - - -def save_attachment(mlist, msg, attachments_dir, filter_html=True): - fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR, - mlist.fqdn_listname, attachments_dir) - makedirs(fsdir) - # Figure out the attachment type and get the decoded data - decodedpayload = msg.get_payload(decode=True) - # BAW: mimetypes ought to handle non-standard, but commonly found types, - # e.g. image/jpg (should be image/jpeg). For now we just store such - # things as application/octet-streams since that seems the safest. 
- ctype = msg.get_content_type() - # i18n file name is encoded - lcset = mlist.preferred_language.charset - filename = oneline(msg.get_filename(''), lcset) - filename, fnext = os.path.splitext(filename) - # For safety, we should confirm this is valid ext for content-type - # but we can use fnext if we introduce fnext filtering - if as_boolean(config.scrubber.use_attachment_filename_extension): - # HTML message doesn't have filename :-( - ext = fnext or guess_extension(ctype, fnext) - else: - ext = guess_extension(ctype, fnext) - if not ext: - # We don't know what it is, so assume it's just a shapeless - # application/octet-stream, unless the Content-Type: is - # message/rfc822, in which case we know we'll coerce the type to - # text/plain below. - if ctype == 'message/rfc822': - ext = '.txt' - else: - ext = '.bin' - # Allow only alphanumerics, dash, underscore, and dot - ext = sre.sub('', ext) - path = None - # We need a lock to calculate the next attachment number - with Lock(os.path.join(fsdir, 'attachments.lock')): - # Now base the filename on what's in the attachment, uniquifying it if - # necessary. - if (not filename or - not as_boolean(config.scrubber.use_attachment_filename)): - filebase = 'attachment' - else: - # Sanitize the filename given in the message headers - parts = pre.split(filename) - filename = parts[-1] - # Strip off leading dots - filename = dre.sub('', filename) - # Allow only alphanumerics, dash, underscore, and dot - filename = sre.sub('', filename) - # If the filename's extension doesn't match the type we guessed, - # which one should we go with? For now, let's go with the one we - # guessed so attachments can't lie about their type. Also, if the - # filename /has/ no extension, then tack on the one we guessed. - # The extension was removed from the name above. - filebase = filename - # Now we're looking for a unique name for this file on the file - # system. If msgdir/filebase.ext isn't unique, we'll add a counter - # after filebase, e.g. 
msgdir/filebase-cnt.ext - counter = 0 - extra = '' - while True: - path = os.path.join(fsdir, filebase + extra + ext) - # Generally it is not a good idea to test for file existance - # before just trying to create it, but the alternatives aren't - # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't - # NFS-safe). Besides, we have an exclusive lock now, so we're - # guaranteed that no other process will be racing with us. - if os.path.exists(path): - counter += 1 - extra = '-%04d' % counter - else: - break - # `path' now contains the unique filename for the attachment. There's - # just one more step we need to do. If the part is text/html and - # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be - # here), then send the attachment through the filter program for - # sanitization - if filter_html and ctype == 'text/html': - base, ext = os.path.splitext(path) - tmppath = base + '-tmp' + ext - fp = open(tmppath, 'w') - try: - fp.write(decodedpayload) - fp.close() - cmd = Template(config.mta.archive_html_sanitizer).safe_substitue( - filename=tmppath) - progfp = os.popen(cmd, 'r') - decodedpayload = progfp.read() - status = progfp.close() - if status: - log.error('HTML sanitizer exited with non-zero status: %s', - status) - finally: - os.unlink(tmppath) - # BAW: Since we've now sanitized the document, it should be plain - # text. Blarg, we really want the sanitizer to tell us what the type - # if the return data is. :( - ext = '.txt' - path = base + '.txt' - # Is it a message/rfc822 attachment? - elif ctype == 'message/rfc822': - submsg = msg.get_payload() - # BAW: I'm sure we can eventually do better than this. :( - decodedpayload = websafe(str(submsg)) - fp = open(path, 'w') - fp.write(decodedpayload) - fp.close() - # Now calculate the url to the list's archive. 
- scrubber_path = config.scrubber.archive_scrubber - base_url = find_name(scrubber_path).list_url(mlist) - if not base_url.endswith('/'): - base_url += '/' - # Trailing space will definitely be a problem with format=flowed. - # Bracket the URL instead. - url = '<' + base_url + '%s/%s%s%s>' % ( - attachments_dir, filebase, extra, ext) - return url - - - -class Scrubber: - """Cleanse a message for archiving.""" - - implements(IHandler) - - name = 'scrubber' - description = _('Cleanse a message for archiving.') - - def process(self, mlist, msg, msgdata): - """See `IHandler`.""" - process(mlist, msg, msgdata) diff --git a/src/mailman/pipeline/tests/test_mimedel.py b/src/mailman/pipeline/tests/test_mimedel.py new file mode 100644 index 000000000..566c1a40c --- /dev/null +++ b/src/mailman/pipeline/tests/test_mimedel.py @@ -0,0 +1,213 @@ +# Copyright (C) 2012 by the Free Software Foundation, Inc. +# +# This file is part of GNU Mailman. +# +# GNU Mailman is free software: you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# GNU Mailman. If not, see <http://www.gnu.org/licenses/>. 
+ +"""Test the mime_delete handler.""" + +from __future__ import absolute_import, print_function, unicode_literals + +__metaclass__ = type +__all__ = [ + 'TestDispose', + ] + + +import unittest + +from zope.component import getUtility + +from mailman.app.lifecycle import create_list +from mailman.config import config +from mailman.core import errors +from mailman.interfaces.action import FilterAction +from mailman.interfaces.member import MemberRole +from mailman.interfaces.usermanager import IUserManager +from mailman.pipeline import mime_delete +from mailman.testing.helpers import ( + LogFileMark, + get_queue_messages, + specialized_message_from_string as mfs) +from mailman.testing.layers import ConfigLayer + + + +class TestDispose(unittest.TestCase): + """Test the mime_delete handler.""" + + layer = ConfigLayer + + def setUp(self): + self._mlist = create_list('test@example.com') + self._msg = mfs("""\ +From: anne@example.com +To: test@example.com +Subject: A disposable message +Message-ID: <ant> + +""") + # Python 2.7 has assertMultiLineEqual. Let this work without bounds. + self.maxDiff = None + self.eq = getattr(self, 'assertMultiLineEqual', self.assertEqual) + config.push('dispose', """ + [mailman] + site_owner: noreply@example.com + """) + + def tearDown(self): + config.pop('dispose') + + def test_dispose_discard(self): + self._mlist.filter_action = FilterAction.discard + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'discarding') + except errors.DiscardMessage as error: + pass + else: + raise AssertionError('DiscardMessage exception expected') + self.assertEqual(error.message, 'discarding') + # There should be no messages in the 'bad' queue. 
+ self.assertEqual(len(get_queue_messages('bad')), 0) + + def test_dispose_bounce(self): + self._mlist.filter_action = FilterAction.reject + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'rejecting') + except errors.RejectMessage as error: + pass + else: + raise AssertionError('RejectMessage exception expected') + self.assertEqual(error.message, 'rejecting') + # There should be no messages in the 'bad' queue. + self.assertEqual(len(get_queue_messages('bad')), 0) + + def test_dispose_forward(self): + # The disposed message gets forwarded to the list moderators. So + # first add some moderators. + user_manager = getUtility(IUserManager) + anne = user_manager.create_address('anne@example.com') + bart = user_manager.create_address('bart@example.com') + self._mlist.subscribe(anne, MemberRole.moderator) + self._mlist.subscribe(bart, MemberRole.moderator) + # Now set the filter action and dispose the message. + self._mlist.filter_action = FilterAction.forward + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'forwarding') + except errors.DiscardMessage as error: + pass + else: + raise AssertionError('DiscardMessage exception expected') + self.assertEqual(error.message, 'forwarding') + # There should now be a multipart message in the virgin queue destined + # for the mailing list owners. + messages = get_queue_messages('virgin') + self.assertEqual(len(messages), 1) + message = messages[0].msg + self.assertEqual(message.get_content_type(), 'multipart/mixed') + # Anne and Bart should be recipients of the message, but it will look + # like the message is going to the list owners. + self.assertEqual(message['to'], 'test-owner@example.com') + self.assertEqual(message.recipients, + set(['anne@example.com', 'bart@example.com'])) + # The list owner should be the sender. 
+ self.assertEqual(message['from'], 'noreply@example.com') + self.assertEqual(message['subject'], + 'Content filter message notification') + # The body of the first part provides the moderators some details. + part0 = message.get_payload(0) + self.assertEqual(part0.get_content_type(), 'text/plain') + self.eq(part0.get_payload(), """\ +The attached message matched the Test mailing list's content +filtering rules and was prevented from being forwarded on to the list +membership. You are receiving the only remaining copy of the discarded +message. + +""") + # The second part is the container for the original message. + part1 = message.get_payload(1) + self.assertEqual(part1.get_content_type(), 'message/rfc822') + # And the first part of *that* message will be the original message. + original = part1.get_payload(0) + self.assertEqual(original['subject'], 'A disposable message') + self.assertEqual(original['message-id'], '<ant>') + + def test_dispose_non_preservable(self): + # Two actions can happen here, depending on a site-wide setting. If + # the site owner has indicated that filtered messages cannot be + # preserved, then this is the same as discarding them. + self._mlist.filter_action = FilterAction.preserve + config.push('non-preservable', """ + [mailman] + filtered_messages_are_preservable: no + """) + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'not preserved') + except errors.DiscardMessage as error: + pass + else: + raise AssertionError('DiscardMessage exception expected') + finally: + config.pop('non-preservable') + self.assertEqual(error.message, 'not preserved') + # There should be no messages in the 'bad' queue. + self.assertEqual(len(get_queue_messages('bad')), 0) + + def test_dispose_preservable(self): + # Two actions can happen here, depending on a site-wide setting. 
If + # the site owner has indicated that filtered messages can be + # preserved, then this is similar to discarding the message except + # that a copy is preserved in the 'bad' queue. + self._mlist.filter_action = FilterAction.preserve + config.push('preservable', """ + [mailman] + filtered_messages_are_preservable: yes + """) + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'preserved') + except errors.DiscardMessage as error: + pass + else: + raise AssertionError('DiscardMessage exception expected') + finally: + config.pop('preservable') + self.assertEqual(error.message, 'preserved') + # There should be no messages in the 'bad' queue. + messages = get_queue_messages('bad') + self.assertEqual(len(messages), 1) + message = messages[0].msg + self.assertEqual(message['subject'], 'A disposable message') + self.assertEqual(message['message-id'], '<ant>') + + def test_bad_action(self): + # This should never happen, but what if it does? + # FilterAction.accept, FilterAction.hold, and FilterAction.defer are + # not valid. They are treated as discard actions, but the problem is + # also logged. + for action in (FilterAction.accept, + FilterAction.hold, + FilterAction.defer): + self._mlist.filter_action = action + mark = LogFileMark('mailman.error') + try: + mime_delete.dispose(self._mlist, self._msg, {}, 'bad action') + except errors.DiscardMessage as error: + pass + else: + raise AssertionError('DiscardMessage exception expected') + self.assertEqual(error.message, 'bad action') + line = mark.readline()[:-1] + self.assertTrue(line.endswith( + '{0} invalid FilterAction: test@example.com. ' + 'Treating as discard'.format(action.name))) diff --git a/src/mailman/pipeline/tests/test_scrubber.py b/src/mailman/pipeline/tests/test_scrubber.py deleted file mode 100644 index 7ac5eb855..000000000 --- a/src/mailman/pipeline/tests/test_scrubber.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (C) 2012 by the Free Software Foundation, Inc. -# -# This file is part of GNU Mailman. 
-# -# GNU Mailman is free software: you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation, either version 3 of the License, or (at your option) -# any later version. -# -# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. -# -# You should have received a copy of the GNU General Public License along with -# GNU Mailman. If not, see <http://www.gnu.org/licenses/>. - -"""Scrubber module tests.""" - -from __future__ import absolute_import, print_function, unicode_literals - -__metaclass__ = type -__all__ = [ - 'TestScrubber', - ] - - -import unittest - -from mailman.pipeline import scrubber - - - -class TestScrubber(unittest.TestCase): - """Scrubber module tests.""" - - def test_guess_extension(self): - # A known extension should be found. - extension = scrubber.guess_extension('application/msword', '.doc') - self.assertEqual(extension, '.doc') - - def test_guess_missing_extension(self): - # Maybe some other extension is better. - extension = scrubber.guess_extension('application/msword', '.xxx') - self.assertEqual(extension, '.doc') |
