summaryrefslogtreecommitdiff
path: root/mailman/pipeline/docs/scrubber.txt
diff options
context:
space:
mode:
Diffstat (limited to 'mailman/pipeline/docs/scrubber.txt')
-rw-r--r--mailman/pipeline/docs/scrubber.txt214
1 files changed, 214 insertions, 0 deletions
diff --git a/mailman/pipeline/docs/scrubber.txt b/mailman/pipeline/docs/scrubber.txt
new file mode 100644
index 000000000..744925f34
--- /dev/null
+++ b/mailman/pipeline/docs/scrubber.txt
@@ -0,0 +1,214 @@
+The scrubber
+============
+
+The scrubber is an integral part of Mailman, both in the normal delivery of
+messages and in components such as the archiver. Its primary purpose is to
+scrub attachments from messages so that binary goop doesn't end up in an
+archive message.
+
+ >>> from mailman.pipeline.scrubber import process, save_attachment
+ >>> from mailman.configuration import config
+ >>> mlist = config.db.list_manager.create(u'_xtest@example.com')
+ >>> mlist.preferred_language = u'en'
+
+Helper functions for getting the attachment data.
+
+ >>> import os, re
+ >>> def read_attachment(filename, remove=True):
+ ... path = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR,
+ ... mlist.fqdn_listname, filename)
+ ... fp = open(path)
+ ... try:
+ ... data = fp.read()
+ ... finally:
+ ... fp.close()
+ ... if remove:
+ ... os.unlink(path)
+ ... return data
+
+ >>> from urlparse import urlparse
+ >>> def read_url_from_message(msg):
+ ... url = None
+ ... for line in msg.get_payload().splitlines():
+ ... mo = re.match('URL: <(?P<url>[^>]+)>', line)
+ ... if mo:
+ ... url = mo.group('url')
+ ... break
+ ... path = '/'.join(urlparse(url).path.split('/')[3:])
+ ... return read_attachment(path)
+
+
+Saving attachments
+------------------
+
+The Scrubber handler exposes a function called save_attachments() which can be
+used to strip various types of attachments and store them in the archive
+directory. This is a public interface used by components outside the normal
+processing pipeline.
+
+Site administrators can decide whether the scrubber should use the attachment
+filename suggested in the message's Content-Disposition: header or not. If
+enabled, the filename will be used when this header attribute is present (yes,
+this is an unfortunate double negative).
+
+ >>> config.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME = False
+ >>> msg = message_from_string("""\
+ ... Content-Type: image/gif; name="xtest.gif"
+ ... Content-Transfer-Encoding: base64
+ ... Content-Disposition: attachment; filename="xtest.gif"
+ ...
+ ... R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw==
+ ... """)
+ >>> save_attachment(mlist, msg, 'dir')
+ u'<http://www.example.com/pipermail/_xtest@example.com/dir/xtest.gif>'
+ >>> data = read_attachment('dir/xtest.gif')
+ >>> data[:6]
+ 'GIF87a'
+ >>> len(data)
+ 34
+
+Saving the attachment does not alter the original message.
+
+ >>> print msg.as_string()
+ Content-Type: image/gif; name="xtest.gif"
+ Content-Transfer-Encoding: base64
+ Content-Disposition: attachment; filename="xtest.gif"
+ <BLANKLINE>
+ R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw==
+
+The site administrator can also configure Mailman to ignore the
+Content-Disposition: filename. This is the default for reasons described in
+the Defaults.py.in file.
+
+ >>> config.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME = True
+ >>> msg = message_from_string("""\
+ ... Content-Type: image/gif; name="xtest.gif"
+ ... Content-Transfer-Encoding: base64
+ ... Content-Disposition: attachment; filename="xtest.gif"
+ ...
+ ... R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw==
+ ... """)
+ >>> save_attachment(mlist, msg, 'dir')
+ u'<http://www.example.com/pipermail/_xtest@example.com/dir/attachment.gif>'
+ >>> data = read_attachment('dir/xtest.gif')
+ Traceback (most recent call last):
+ IOError: [Errno ...] No such file or directory:
+ u'.../archives/private/_xtest@example.com/dir/xtest.gif'
+ >>> data = read_attachment('dir/attachment.gif')
+ >>> data[:6]
+ 'GIF87a'
+ >>> len(data)
+ 34
+
+
+Scrubbing image attachments
+---------------------------
+
+When scrubbing image attachments, the original message is modified to include
+a reference to the attachment file as available through the on-line archive.
+
+ >>> msg = message_from_string("""\
+ ... MIME-Version: 1.0
+ ... Content-Type: multipart/mixed; boundary="BOUNDARY"
+ ...
+ ... --BOUNDARY
+ ... Content-type: text/plain; charset=us-ascii
+ ...
+ ... This is a message.
+ ... --BOUNDARY
+ ... Content-Type: image/gif; name="xtest.gif"
+ ... Content-Transfer-Encoding: base64
+ ... Content-Disposition: attachment; filename="xtest.gif"
+ ...
+ ... R0lGODdhAQABAIAAAAAAAAAAACwAAAAAAQABAAACAQUAOw==
+ ... --BOUNDARY--
+ ... """)
+ >>> msgdata = {}
+
+The Scrubber.process() function is different than other handler process
+functions in that it returns the scrubbed message.
+
+ >>> scrubbed_msg = process(mlist, msg, msgdata)
+ >>> scrubbed_msg is msg
+ True
+ >>> print scrubbed_msg.as_string()
+ MIME-Version: 1.0
+ Message-ID: ...
+ Content-Type: text/plain; charset="us-ascii"
+ Content-Transfer-Encoding: 7bit
+ <BLANKLINE>
+ This is a message.
+ -------------- next part --------------
+ A non-text attachment was scrubbed...
+ Name: xtest.gif
+ Type: image/gif
+ Size: 34 bytes
+ Desc: not available
+ URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.gif>
+ <BLANKLINE>
+
+This is the same as the transformed message originally passed in.
+
+ >>> print msg.as_string()
+ MIME-Version: 1.0
+ Message-ID: ...
+ Content-Type: text/plain; charset="us-ascii"
+ Content-Transfer-Encoding: 7bit
+ <BLANKLINE>
+ This is a message.
+ -------------- next part --------------
+ A non-text attachment was scrubbed...
+ Name: xtest.gif
+ Type: image/gif
+ Size: 34 bytes
+ Desc: not available
+ URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.gif>
+ <BLANKLINE>
+ >>> msgdata
+ {}
+
+The URL will point to the attachment sitting in the archive.
+
+ >>> data = read_url_from_message(msg)
+ >>> data[:6]
+ 'GIF87a'
+ >>> len(data)
+ 34
+
+
+Scrubbing text attachments
+--------------------------
+
+Similar to image attachments, text attachments will also be scrubbed, but the
+placeholder will be slightly different.
+
+ >>> msg = message_from_string("""\
+ ... MIME-Version: 1.0
+ ... Content-Type: multipart/mixed; boundary="BOUNDARY"
+ ...
+ ... --BOUNDARY
+ ... Content-type: text/plain; charset=us-ascii; format=flowed; delsp=no
+ ...
+ ... This is a message.
+ ... --BOUNDARY
+ ... Content-type: text/plain; name="xtext.txt"
+ ... Content-Disposition: attachment; filename="xtext.txt"
+ ...
+ ... This is a text attachment.
+ ... --BOUNDARY--
+ ... """)
+ >>> scrubbed_msg = process(mlist, msg, {})
+ >>> print scrubbed_msg.as_string()
+ MIME-Version: 1.0
+ Message-ID: ...
+ Content-Transfer-Encoding: 7bit
+ Content-Type: text/plain; charset="us-ascii"; format="flowed"; delsp="no"
+ <BLANKLINE>
+ This is a message.
+ -------------- next part --------------
+ An embedded and charset-unspecified text was scrubbed...
+ Name: xtext.txt
+ URL: <http://www.example.com/pipermail/_xtest@example.com/attachments/.../attachment.txt>
+ <BLANKLINE>
+ >>> read_url_from_message(msg)
+ 'This is a text attachment.'