process(): Implement ARCHIVE_HTML_SANITIZER == 2, meaning "leave it

inline but HTML-escape it". Also, expand on the == 1 value (HTML-escape an attachment) a bit so the output looks a little nicer. Pipermail actually does a better job here, but we can't use it. save_attachment(): Grows a filter_html option which says whether to filter text/html parts or not. Default is 1, but if ARCHIVE_HTML_SANITIZER == 1 above, we don't want to filter it through the program.
author: bwarsaw 2001-10-27 03:32:56 +0000
committer: bwarsaw 2001-10-27 03:32:56 +0000
commit: d420c4b18a4e2e15ecb46ab56e752d6453077fac (patch)
tree: 7a6e9bffbbb38899268b3d38b34897944f2ffd29
parent: ac79a2601e268a91ec76837527f2014b9759447a (diff)
download: mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.tar.gz
mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.tar.zst
mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.zip
1 files changed, 31 insertions, 6 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 24a3c2b48..9dd37b55b 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -20,6 +20,7 @@
 import os
 import re
 import sha
+import cgi
 import errno
 import mimetypes
 import tempfile
@@ -43,6 +44,8 @@ pre = re.compile(r'[/\\:]')
 # (essentially anything that isn't an alphanum, dot, slash, or underscore.
 sre = re.compile(r'[^-\w.]')
 
+BR = '<br>\n'
+
 
 
 # We're using a subclass of the standard Generator because we want to suppress
@@ -66,20 +69,42 @@ class ScrubberGenerator(Generator):
 
 
 def process(mlist, msg, msgdata=None):
+    sanitize = mm_cfg.ARCHIVE_HTML_SANITIZER
     outer = 1
     for part in msg.walk():
         # If the part is text/plain, we leave it alone
         if part.get_type('text/plain') == 'text/plain':
             pass
-        elif part.get_type() == 'text/html' and \
-             not isinstance(mm_cfg.ARCHIVE_HTML_SANITIZER, StringType):
-            if mm_cfg.ARCHIVE_HTML_SANITIZER == 0:
+        elif part.get_type() == 'text/html' and sanitize in (0, 1, 2):
+            if sanitize == 0:
                 if outer:
                     raise DiscardMessage
                 part.set_payload(_('HTML attachment scrubbed and removed'))
-            else:
+            elif sanitize == 2:
                 # By leaving it alone, Pipermail will automatically escape it
                 pass
+            else:
+                # HTML-escape it and store it as an attachment, but make it
+                # look a /little/ bit prettier. :(
+                payload = cgi.escape(part.get_payload())
+                # For whitespace in the margin, change spaces into
+                # non-breaking spaces, and tabs into 8 of those.  Then use a
+                # mono-space font.  Still looks hideous to me, but then I'd
+                # just as soon discard them.
+                def doreplace(s):
+                    return s.replace(' ', '&nbsp;').replace('\t', '&nbsp'*8)
+                lines = [doreplace(s) for s in payload.split('\n')]
+                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+                part.set_payload(payload)
+                omask = os.umask(002)
+                try:
+                    url = save_attachment(mlist, part, filter_html=0)
+                finally:
+                    os.umask(omask)
+                part.set_payload(_("""\
+An HTML attachment was scrubbed.
+URL: %(url)s
+"""))
         # If the message isn't a multipart, then we'll strip it out as an
         # attachment that would have to be separately downloaded.  Pipermail
         # will transform the url into a hyperlink.
@@ -124,7 +149,7 @@ Url : %(url)s
 
 
 
-def save_attachment(mlist, msg):
+def save_attachment(mlist, msg, filter_html=1):
     # The directory to store the attachment in
     dir = os.path.join(mlist.archive_dir(), 'attachments')
     try:
@@ -203,7 +228,7 @@ def save_attachment(mlist, msg):
     # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
     # here), then send the attachment through the filter program for
     # sanitization
-    if msg.get_type() == 'text/html':
+    if filter_html and msg.get_type() == 'text/html':
         base, ext = os.path.splitext(path)
         tmppath = base + '-tmp' + ext
         fp = open(tmppath, 'w')
author	bwarsaw	2001-10-27 03:32:56 +0000
committer	bwarsaw	2001-10-27 03:32:56 +0000
commit	d420c4b18a4e2e15ecb46ab56e752d6453077fac (patch)
tree	7a6e9bffbbb38899268b3d38b34897944f2ffd29
parent	ac79a2601e268a91ec76837527f2014b9759447a (diff)
download	mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.tar.gz mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.tar.zst mailman-d420c4b18a4e2e15ecb46ab56e752d6453077fac.zip