# Copyright (C) 2001-2012 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman. If not, see <http://www.gnu.org/licenses/>.
"""Cleanse a message for archiving."""
from __future__ import absolute_import, unicode_literals
__metaclass__ = type
__all__ = [
'Scrubber',
]
import os
import re
import time
import hashlib
import logging
import binascii
from email.charset import Charset
from email.utils import make_msgid, parsedate
from flufl.lock import Lock
from lazr.config import as_boolean
from mimetypes import guess_all_extensions
from string import Template
from zope.interface import implements
from mailman.config import config
from mailman.core.errors import DiscardMessage
from mailman.core.i18n import _
from mailman.interfaces.handler import IHandler
from mailman.utilities.filesystem import makedirs
from mailman.utilities.modules import find_name
from mailman.utilities.string import oneline, websafe
# Path characters for common platforms
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
dre = re.compile(r'^\.*')
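# Taken together these filters reduce a hostile or decorative filename to a
# safe basename.  A rough illustration (hypothetical input, applied the way
# save_attachment() below applies them):
#
#     name = pre.split('../we use spaces!.pdf')[-1]   # 'we use spaces!.pdf'
#     name = dre.sub('', name)                        # unchanged; no leading dots
#     name = sre.sub('', name)                        # 'weusespaces.pdf'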
BR = '<br>\n'
SPACE = ' '
log = logging.getLogger('mailman.error')
def guess_extension(ctype, ext):
"""Find the extension mapped to the given content-type.
mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, and
.wiz are all mapped to application/msword. This sucks for finding the
    best reverse mapping.  If the extension is one of the given mappings,
we'll trust that, otherwise we'll just guess. :/
"""
all_extensions = guess_all_extensions(ctype, strict=False)
if ext in all_extensions:
return ext
return (all_extensions[0] if len(all_extensions) > 0 else None)
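# Illustrative behavior (the extension lists come from the stdlib mimetypes
# tables, so exact results can vary by platform and Python version):
#
#     guess_extension('application/msword', '.doc')    # '.doc', trusted as-is
#     guess_extension('application/msword', '')         # first guess, e.g. '.doc' or '.dot'
#     guess_extension('application/x-unknown-type', '') # None when mimetypes has no mapping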
def safe_strftime(fmt, t):
"""A time.strftime() that eats exceptions, returning None instead."""
try:
return time.strftime(fmt, t)
except (TypeError, ValueError, OverflowError):
return None
def calculate_attachments_dir(msg, msgdata):
"""Calculate the directory for attachements.
Calculate the directory that attachments for this message will go under.
To avoid inode limitations, the scheme will be:
    archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
Start by calculating the date-based and msgid-hash components.
"""
fmt = '%Y%m%d'
datestr = msg.get('Date')
if datestr:
now = parsedate(datestr)
else:
now = time.gmtime(msgdata.get('received_time', time.time()))
datedir = safe_strftime(fmt, now)
if not datedir:
datestr = msgdata.get('X-List-Received-Date')
if datestr:
datedir = safe_strftime(fmt, datestr)
if not datedir:
# What next? Unixfrom, I guess.
parts = msg.get_unixfrom().split()
try:
month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
}.get(parts[3], 0)
day = int(parts[4])
year = int(parts[6])
except (IndexError, ValueError):
# Best we can do I think
month = day = year = 0
datedir = '%04d%02d%02d' % (year, month, day)
assert datedir
# As for the msgid hash, we'll base this part on the Message-ID: so that
# all attachments for the same message end up in the same directory (we'll
# uniquify the filenames in that directory as needed). We use the first 2
# and last 2 bytes of the SHA1 hash of the message id as the basis of the
# directory name. Clashes here don't really matter too much, and that
# still gives us a 32-bit space to work with.
msgid = msg['message-id']
if msgid is None:
msgid = msg['Message-ID'] = make_msgid()
# We assume that the message id actually /is/ unique!
digest = hashlib.sha1(msgid).hexdigest()
return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
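# A rough example of the resulting path (the hash digits below are
# illustrative, not actually computed):
#
#     msg['Date'] = 'Tue, 03 Jan 2012 12:00:00 -0000'
#     msg['Message-ID'] = '<unique@example.com>'
#     calculate_attachments_dir(msg, {})
#     # -> 'attachments/20120103/ab12cd34'  (first 4 + last 4 hex digits of
#     #    the SHA1 of the Message-ID)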
def replace_payload_by_text(msg, text, charset):
"""Replace the payload of the message with some text."""
# TK: This is a common function in replacing the attachment and the main
# message by a text (scrubbing).
del msg['content-type']
del msg['content-transfer-encoding']
if isinstance(text, unicode):
text = text.encode(charset)
if not isinstance(charset, str):
charset = str(charset)
msg.set_payload(text, charset)
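# A minimal sketch of the effect on a part (assumes a simple text part):
#
#     replace_payload_by_text(part, 'An attachment was scrubbed...', 'us-ascii')
#     # The old Content-Type and Content-Transfer-Encoding headers are gone;
#     # set_payload() with a charset re-adds them as text/plain, us-ascii.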
def process(mlist, msg, msgdata=None):
"""Process the message through the scrubber."""
sanitize = int(config.scrubber.archive_html_sanitizer)
outer = True
if msgdata is None:
msgdata = {}
if msgdata:
        # msgdata is available if we are in the GLOBAL_PIPELINE, i.e. not in
        # the digest or archiver.  Check whether the list owner wants to
        # scrub regular delivery.
if not mlist.scrub_nondigest:
return
attachments_dir = calculate_attachments_dir(msg, msgdata)
charset = format_param = delsp = None
lcset = mlist.preferred_language.charset
lcset_out = Charset(lcset).output_charset or lcset
# Now walk over all subparts of this message and scrub out various types
for part in msg.walk():
ctype = part.get_content_type()
# If the part is text/plain, we leave it alone
if ctype == 'text/plain':
# We need to choose a charset for the scrubbed message, so we'll
# arbitrarily pick the charset of the first text/plain part in the
# message.
#
# Also get the RFC 3676 stuff from this part. This seems to
# work okay for scrub_nondigest. It will also work as far as
# scrubbing messages for the archive is concerned, but Pipermail
# doesn't pay any attention to the RFC 3676 parameters. The plain
# format digest is going to be a disaster in any case as some of
# messages will be format="flowed" and some not. ToDigest creates
# its own Content-Type: header for the plain digest which won't
# have RFC 3676 parameters. If the message Content-Type: headers
# are retained for display in the digest, the parameters will be
# there for information, but not for the MUA. This is the best we
# can do without having get_payload() process the parameters.
if charset is None:
charset = part.get_content_charset(lcset)
format_param = part.get_param('format')
delsp = part.get_param('delsp')
# TK: if part is attached then check charset and scrub if none
if part.get('content-disposition') and \
not part.get_content_charset():
url = save_attachment(mlist, part, attachments_dir)
filename = part.get_filename(_('not available'))
filename = oneline(filename, lcset)
replace_payload_by_text(part, _("""\
An embedded and charset-unspecified text was scrubbed...
Name: $filename
URL: $url
"""), lcset)
elif ctype == 'text/html' and isinstance(sanitize, int):
if sanitize == 0:
if outer:
raise DiscardMessage
replace_payload_by_text(part,
_('HTML attachment scrubbed and removed'),
# Adding charset arg and removing content-type
# sets content-type to text/plain
lcset)
elif sanitize == 2:
# By leaving it alone, Pipermail will automatically escape it
pass
elif sanitize == 3:
# Pull it out as an attachment but leave it unescaped. This
# is dangerous, but perhaps useful for heavily moderated
# lists.
url = save_attachment(mlist, part, attachments_dir,
filter_html=False)
replace_payload_by_text(part, _("""\
An HTML attachment was scrubbed...
URL: $url
"""), lcset)
else:
# HTML-escape it and store it as an attachment, but make it
# look a /little/ bit prettier. :(
payload = websafe(part.get_payload(decode=True))
# For whitespace in the margin, change spaces into
# non-breaking spaces, and tabs into 8 of those. Then use a
# mono-space font. Still looks hideous to me, but then I'd
# just as soon discard them.
                lines = [s.replace(' ', '&nbsp;').replace('\t', '&nbsp;' * 8)
                         for s in payload.split('\n')]
                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
part.set_payload(payload)
# We're replacing the payload with the decoded payload so this
# will just get in the way.
del part['content-transfer-encoding']
url = save_attachment(mlist, part, attachments_dir,
filter_html=False)
replace_payload_by_text(part, _("""\
An HTML attachment was scrubbed...
URL: $url
"""), lcset)
elif ctype == 'message/rfc822':
# This part contains a submessage, so it too needs scrubbing
submsg = part.get_payload(0)
url = save_attachment(mlist, part, attachments_dir)
subject = submsg.get('subject', _('no subject'))
date = submsg.get('date', _('no date'))
who = submsg.get('from', _('unknown sender'))
size = len(str(submsg))
replace_payload_by_text(part, _("""\
An embedded message was scrubbed...
From: $who
Subject: $subject
Date: $date
Size: $size
URL: $url
"""), lcset)
# If the message isn't a multipart, then we'll strip it out as an
# attachment that would have to be separately downloaded. Pipermail
# will transform the url into a hyperlink.
elif part._payload and not part.is_multipart():
payload = part.get_payload(decode=True)
ctype = part.get_content_type()
# XXX Under email 2.5, it is possible that payload will be None.
# This can happen when you have a Content-Type: multipart/* with
# only one part and that part has two blank lines between the
# first boundary and the end boundary. In email 3.0 you end up
# with a string in the payload. I think in this case it's safe to
# ignore the part.
if payload is None:
continue
size = len(payload)
url = save_attachment(mlist, part, attachments_dir)
desc = part.get('content-description', _('not available'))
desc = oneline(desc, lcset)
filename = part.get_filename(_('not available'))
filename = oneline(filename, lcset)
replace_payload_by_text(part, _("""\
A non-text attachment was scrubbed...
Name: $filename
Type: $ctype
Size: $size bytes
Desc: $desc
URL: $url
"""), lcset)
outer = False
# We still have to sanitize multipart messages to flat text because
# Pipermail can't handle messages with list payloads. This is a kludge;
# def (n) clever hack ;).
if msg.is_multipart() and sanitize != 2:
# By default we take the charset of the first text/plain part in the
# message, but if there was none, we'll use the list's preferred
# language's charset.
if not charset or charset == 'us-ascii':
charset = lcset_out
else:
# normalize to the output charset if input/output are different
charset = Charset(charset).output_charset or charset
# We now want to concatenate all the parts which have been scrubbed to
# text/plain, into a single text/plain payload. We need to make sure
# all the characters in the concatenated string are in the same
# encoding, so we'll use the 'replace' key in the coercion call.
# BAW: Martin's original patch suggested we might want to try
# generalizing to utf-8, and that's probably a good idea (eventually).
text = []
charsets = []
for part in msg.walk():
# TK: bug-id 1099138 and multipart
            # MAS: test part._payload because 'if part:' may fail if there
            # are no headers.
if not part._payload or part.is_multipart():
continue
# All parts should be scrubbed to text/plain by now.
partctype = part.get_content_type()
if partctype != 'text/plain':
text.append(_('Skipped content of type $partctype\n'))
continue
try:
t = part.get_payload(decode=True) or ''
# MAS: TypeError exception can occur if payload is None. This
# was observed with a message that contained an attached
# message/delivery-status part. Because of the special parsing
# of this type, this resulted in a text/plain sub-part with a
# null body. See bug 1430236.
except (binascii.Error, TypeError):
t = part.get_payload() or ''
# Email problem was solved by Mark Sapiro. (TK)
partcharset = part.get_content_charset('us-ascii')
try:
t = unicode(t, partcharset, 'replace')
except (UnicodeError, LookupError, ValueError, TypeError,
AssertionError):
                # We can get here if partcharset is bogus in some way.
# Replace funny characters. We use errors='replace'.
t = unicode(t, 'ascii', 'replace')
# Separation is useful
if isinstance(t, basestring):
if not t.endswith('\n'):
t += '\n'
text.append(t)
if partcharset not in charsets:
charsets.append(partcharset)
# Now join the text and set the payload
sep = _('-------------- next part --------------\n')
assert isinstance(sep, unicode), (
'Expected a unicode separator, got %s' % type(sep))
rept = sep.join(text)
# Replace entire message with text and scrubbed notice.
# Try with message charsets and utf-8
if 'utf-8' not in charsets:
charsets.append('utf-8')
for charset in charsets:
try:
replace_payload_by_text(msg, rept, charset)
break
# Bogus charset can throw several exceptions
except (UnicodeError, LookupError, ValueError, TypeError,
AssertionError):
pass
if format_param:
msg.set_param('format', format_param)
if delsp:
msg.set_param('delsp', delsp)
return msg
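# Rough usage sketch.  In normal operation the Scrubber handler (below) is
# invoked from a pipeline, but the module-level function can be called
# directly; 'mlist' and 'msg' are assumed to be a real mailing list and
# message from a configured Mailman instance:
#
#     scrubbed = process(mlist, msg, msgdata={})
#     # Attachments are now stored under the list's private archive, and the
#     # body is typically flattened to a single text/plain payload containing
#     # "... was scrubbed ..." notices with their URLs.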
def save_attachment(mlist, msg, attachments_dir, filter_html=True):
fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR,
mlist.fqdn_listname, attachments_dir)
makedirs(fsdir)
# Figure out the attachment type and get the decoded data
decodedpayload = msg.get_payload(decode=True)
# BAW: mimetypes ought to handle non-standard, but commonly found types,
# e.g. image/jpg (should be image/jpeg). For now we just store such
# things as application/octet-streams since that seems the safest.
ctype = msg.get_content_type()
# i18n file name is encoded
lcset = mlist.preferred_language.charset
filename = oneline(msg.get_filename(''), lcset)
filename, fnext = os.path.splitext(filename)
    # For safety, we should confirm that this is a valid extension for the
    # content-type, but we can use fnext if we introduce fnext filtering.
if as_boolean(config.scrubber.use_attachment_filename_extension):
# HTML message doesn't have filename :-(
ext = fnext or guess_extension(ctype, fnext)
else:
ext = guess_extension(ctype, fnext)
if not ext:
# We don't know what it is, so assume it's just a shapeless
# application/octet-stream, unless the Content-Type: is
# message/rfc822, in which case we know we'll coerce the type to
# text/plain below.
if ctype == 'message/rfc822':
ext = '.txt'
else:
ext = '.bin'
# Allow only alphanumerics, dash, underscore, and dot
ext = sre.sub('', ext)
path = None
# We need a lock to calculate the next attachment number
with Lock(os.path.join(fsdir, 'attachments.lock')):
# Now base the filename on what's in the attachment, uniquifying it if
# necessary.
if (not filename or
not as_boolean(config.scrubber.use_attachment_filename)):
filebase = 'attachment'
else:
# Sanitize the filename given in the message headers
parts = pre.split(filename)
filename = parts[-1]
# Strip off leading dots
filename = dre.sub('', filename)
# Allow only alphanumerics, dash, underscore, and dot
filename = sre.sub('', filename)
# If the filename's extension doesn't match the type we guessed,
# which one should we go with? For now, let's go with the one we
# guessed so attachments can't lie about their type. Also, if the
# filename /has/ no extension, then tack on the one we guessed.
# The extension was removed from the name above.
filebase = filename
# Now we're looking for a unique name for this file on the file
# system. If msgdir/filebase.ext isn't unique, we'll add a counter
# after filebase, e.g. msgdir/filebase-cnt.ext
counter = 0
extra = ''
while True:
path = os.path.join(fsdir, filebase + extra + ext)
            # Generally it is not a good idea to test for file existence
# before just trying to create it, but the alternatives aren't
# wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
# NFS-safe). Besides, we have an exclusive lock now, so we're
# guaranteed that no other process will be racing with us.
if os.path.exists(path):
counter += 1
extra = '-%04d' % counter
else:
break
# `path' now contains the unique filename for the attachment. There's
# just one more step we need to do. If the part is text/html and
# ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
# here), then send the attachment through the filter program for
# sanitization
if filter_html and ctype == 'text/html':
base, ext = os.path.splitext(path)
tmppath = base + '-tmp' + ext
fp = open(tmppath, 'w')
try:
fp.write(decodedpayload)
fp.close()
            cmd = Template(config.mta.archive_html_sanitizer).safe_substitute(
                filename=tmppath)
progfp = os.popen(cmd, 'r')
decodedpayload = progfp.read()
status = progfp.close()
if status:
log.error('HTML sanitizer exited with non-zero status: %s',
status)
finally:
os.unlink(tmppath)
# BAW: Since we've now sanitized the document, it should be plain
        # text.  Blarg, we really want the sanitizer to tell us what the type
        # of the returned data is. :(
ext = '.txt'
path = base + '.txt'
# Is it a message/rfc822 attachment?
elif ctype == 'message/rfc822':
submsg = msg.get_payload()
# BAW: I'm sure we can eventually do better than this. :(
decodedpayload = websafe(str(submsg))
fp = open(path, 'w')
fp.write(decodedpayload)
fp.close()
# Now calculate the url to the list's archive.
scrubber_path = config.scrubber.archive_scrubber
base_url = find_name(scrubber_path).list_url(mlist)
if not base_url.endswith('/'):
base_url += '/'
# Trailing space will definitely be a problem with format=flowed.
# Bracket the URL instead.
url = '<' + base_url + '%s/%s%s%s>' % (
attachments_dir, filebase, extra, ext)
return url
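# The returned URL is bracketed to avoid trailing-space problems with
# format=flowed.  Its exact shape depends on the configured archiver's
# list_url(); a purely illustrative value:
#
#     <http://lists.example.com/archives/mylist@example.com/attachments/20120103/ab12cd34/attachment.bin>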
class Scrubber:
"""Cleanse a message for archiving."""
implements(IHandler)
name = 'scrubber'
description = _('Cleanse a message for archiving.')
def process(self, mlist, msg, msgdata):
"""See `IHandler`."""
process(mlist, msg, msgdata)
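# A minimal sketch of how the handler is exercised.  A real pipeline
# configuration resolves the handler by its 'scrubber' name; the objects
# below are assumed to come from a configured Mailman instance:
#
#     handler = Scrubber()
#     handler.process(mlist, msg, msgdata)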