author: Barry Warsaw 2008-01-26 18:35:18 -0500
committer: Barry Warsaw 2008-01-26 18:35:18 -0500
commit: 3952c95a23a74b8686b55a3a4f1873238e6d6610 (patch)
tree: c994bb588b94da2eac9d851a7da4400861dee25a
parent: df637148d8fa2d5c101a990ee6766ea8547f000a (diff)
download: mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.gz
mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.zst
mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.zip
9 files changed, 281 insertions, 132 deletions
diff --git a/Mailman/Defaults.py b/Mailman/Defaults.py
index 75e1c768f..82fb6886f 100644
--- a/Mailman/Defaults.py
+++ b/Mailman/Defaults.py
@@ -164,16 +164,21 @@ DEFAULT_URL_HOST    = '@URLHOST@'
 # Spam avoidance defaults
 #####
 
-# This variable contains a list of 2-tuple of the format (header, regex) which
-# the Mailman/Handlers/SpamDetect.py module uses to match against the current
-# message.  If the regex matches the given header in the current message, then
-# it is flagged as spam.  header is case-insensitive and should not include
-# the trailing colon.  regex is always matched with re.IGNORECASE.
+# This variable contains a list of tuples of the format:
 #
-# Note that the more searching done, the slower the whole process gets.  Spam
-# detection is run against all messages coming to either the list, or the
-# -owners address, unless the message is explicitly approved.
-KNOWN_SPAMMERS = []
+#   (header, pattern[, chain])
+#
+# which is used to match against the current message's headers.  If the
+# pattern matches the given header in the current message, then the named
+# chain is jumped to.  header is case-insensitive and should not include the
+# trailing colon.  pattern is always matched with re.IGNORECASE.  chain is
+# optional; if not given the 'hold' chain is used, but if given it may be any
+# existing chain, such as 'discard', 'reject', or 'accept'.
+#
+# Note that the more searching done, the slower the whole process gets.
+# Header matching is run against all messages coming to either the list, or
+# the -owners address, unless the message is explicitly approved.
+HEADER_MATCHES = []
 
 
 
diff --git a/Mailman/Handlers/SpamDetect.py b/Mailman/Handlers/SpamDetect.py
index b7e687284..f45b52a32 100644
--- a/Mailman/Handlers/SpamDetect.py
+++ b/Mailman/Handlers/SpamDetect.py
@@ -88,14 +88,6 @@ class HeaderGenerator(Generator):
 def process(mlist, msg, msgdata):
     if msgdata.get('approved'):
         return
-    # First do site hard coded header spam checks
-    for header, regex in config.KNOWN_SPAMMERS:
-        cre = re.compile(regex, re.IGNORECASE)
-        for value in msg.get_all(header, []):
-            mo = cre.search(value)
-            if mo:
-                # we've detected spam, so throw the message away
-                raise SpamDetected
     # Now do header_filter_rules
     # TK: Collect headers in sub-parts because attachment filename
     # extension may be a clue to possible virus/spam.
diff --git a/Mailman/app/chains.py b/Mailman/app/chains.py
index 38c7325ab..c327f8234 100644
--- a/Mailman/app/chains.py
+++ b/Mailman/app/chains.py
@@ -33,6 +33,7 @@ __metaclass__ = type
 __i18n_templates__ = True
 
 
+import re
 import logging
 
 from email.mime.message import MIMEMessage
@@ -49,7 +50,7 @@ from Mailman.app.replybot import autorespond_to_sender, can_acknowledge
 from Mailman.configuration import config
 from Mailman.i18n import _
 from Mailman.interfaces import (
-    IChain, IChainLink, IMutableChain, IPendable, LinkAction)
+    IChain, IChainLink, IMutableChain, IPendable, IRule, LinkAction)
 from Mailman.queue import Switchboard
 
 log = logging.getLogger('mailman.vette')
@@ -91,8 +92,19 @@ class TerminalChainBase:
         yield Link('truth', LinkAction.stop)
 
     def process(self, mlist, msg, msgdata):
+        """Process the message for the given mailing list.
+
+        This must be overridden by subclasses.
+        """
         raise NotImplementedError
 
+    def get_rule(self, name):
+        """See `IChain`.
+
+        This always returns the globally registered named rule.
+        """
+        return config.rules[name]
+
 
 class DiscardChain(TerminalChainBase):
     """Discard a message."""
@@ -283,7 +295,7 @@ class AcceptChain(TerminalChainBase):
 
 
 class Chain:
-    """Default built-in moderation chain."""
+    """Generic chain base class."""
     implements(IMutableChain)
 
     def __init__(self, name, description):
@@ -307,9 +319,124 @@ class Chain:
         for link in self._links:
             yield link
 
+    def get_rule(self, name):
+        """See `IChain`.
+
+        This always returns the globally registered named rule.
+        """
+        return config.rules[name]
+
+
+
+class BuiltInChain(Chain):
+    """Default built-in chain."""
+
+    def __init__(self):
+        super(BuiltInChain, self).__init__(
+            'built-in', _('The built-in moderation chain.'))
+        self.append_link(Link('approved', LinkAction.jump, 'accept'))
+        self.append_link(Link('emergency', LinkAction.jump, 'hold'))
+        self.append_link(Link('loop', LinkAction.jump, 'discard'))
+        # Do all of the following before deciding whether to hold the message
+        # for moderation.
+        self.append_link(Link('administrivia', LinkAction.defer))
+        self.append_link(Link('implicit-dest', LinkAction.defer))
+        self.append_link(Link('max-recipients', LinkAction.defer))
+        self.append_link(Link('max-size', LinkAction.defer))
+        self.append_link(Link('news-moderation', LinkAction.defer))
+        self.append_link(Link('no-subject', LinkAction.defer))
+        self.append_link(Link('suspicious-header', LinkAction.defer))
+        # Now if any of the above hit, jump to the hold chain.
+        self.append_link(Link('any', LinkAction.jump, 'hold'))
+        # Take a detour through the default header matching chain, which
+        # we'll create later.
+        self.append_link(Link('truth', LinkAction.detour, 'header-match'))
+        # Finally, the builtin chain defaults to acceptance.
+        self.append_link(Link('truth', LinkAction.jump, 'accept'))
+
+
+
+class HeaderMatchRule:
+    """Header matching rule used by header-match chain."""
+    implements(IRule)
+
+    # Sequential rule counter.
+    _count = 1
+
+    def __init__(self, header, pattern):
+        self._header = header
+        self._pattern = pattern
+        self.name = 'header-match-%002d' % HeaderMatchRule._count
+        HeaderMatchRule._count += 1
+        self.description = u'%s: %s' % (header, pattern)
+        # XXX I think we should do better here, somehow recording that a
+        # particular header matched a particular pattern, but that gets ugly
+        # with RFC 2822 headers.  It also doesn't match well with the rule
+        # name concept.  For now, we just record the rather useless numeric
+        # rule name.  I suppose we could do the better hit recording in the
+        # check() method, and set self.record = False.
+        self.record = True
+
+    def check(self, mlist, msg, msgdata):
+        """See `IRule`."""
+        for value in msg.get_all(self._header, []):
+            if re.search(self._pattern, value, re.IGNORECASE):
+                return True
+        return False
+
+
+class HeaderMatchChain(Chain):
+    """Default header matching chain.
+
+    This could be extended by header match rules in the database.
+    """
+
+    def __init__(self):
+        super(HeaderMatchChain, self).__init__(
+            'header-match', _('The built-in header matching chain'))
+        # The header match rules are not global, so don't register them.
+        # These are the only rules that the header match chain can execute.
+        self._links = []
+        self._rules = {}
+        # Initialize header check rules with those from the global
+        # HEADER_MATCHES variable.
+        for entry in config.HEADER_MATCHES:
+            if len(entry) == 2:
+                header, pattern = entry
+                chain = 'hold'
+            elif len(entry) == 3:
+                header, pattern, chain = entry
+                # We don't assert that the chain exists here because the jump
+                # chain may not yet have been created.
+            else:
+                raise AssertionError(
+                    'Bad entry for HEADER_MATCHES: %s' % entry)
+            self.extend(header, pattern, chain)
+
+    def extend(self, header, pattern, chain='hold'):
+        """Extend the existing header matches.
+
+        :param header: The case-insensitive header field name.
+        :param pattern: The pattern to match the header's value against.  The
+            match is not anchored and is done case-insensitively.
+        :param chain: Optional chain to jump to if the pattern matches any of
+            the named header values.  If not given, the 'hold' chain is used.
+        """
+        rule = HeaderMatchRule(header, pattern)
+        self._rules[rule.name] = rule
+        link = Link(rule.name, LinkAction.jump, chain)
+        self._links.append(link)
+
+    def get_rule(self, name):
+        """See `IChain`.
+
+        Only local rules are findable by this chain.
+        """
+        return self._rules[name]
+
 
 
-def process(start_chain, mlist, msg, msgdata):
+def process(mlist, msg, msgdata, start_chain='built-in'):
     """Process the message through a chain.
 
     :param start_chain: The name of the chain to start the processing with.
@@ -317,34 +444,45 @@ def process(start_chain, mlist, msg, msgdata):
     :param msg: The Message object.
     :param msgdata: The message metadata dictionary.
     """
-    # Find the starting chain.
-    current_chain = iter(config.chains[start_chain])
+    # Set up some bookkeeping.
     chain_stack = []
     msgdata['rule_hits'] = hits = []
     msgdata['rule_misses'] = misses = []
-    while current_chain:
+    # Find the starting chain and begin iterating through its links.
+    chain = config.chains[start_chain]
+    chain_iter = iter(chain)
+    # Loop until we've reached the end of all processing chains.
+    while chain:
+        # Iterate over all links in the chain.  Do this outside a for-loop so
+        # we can capture a chain's link iterator in mid-flight.  This supports
+        # the 'detour' link action
         try:
-            link = current_chain.next()
+            link = chain_iter.next()
         except StopIteration:
             # This chain is exhausted.  Pop the last chain on the stack and
-            # continue.
+            # continue iterating through it.  If there's nothing left on the
+            # chain stack then we're completely finished processing.
             if len(chain_stack) == 0:
                 return
-            current_chain = chain_stack.pop()
+            chain, chain_iter = chain_stack.pop()
             continue
         # Process this link.
-        rule = config.rules[link.rule]
+        rule = chain.get_rule(link.rule)
         if rule.check(mlist, msg, msgdata):
             if rule.record:
                 hits.append(link.rule)
             # The rule matched so run its action.
             if link.action is LinkAction.jump:
-                current_chain = iter(config.chains[link.chain])
+                chain = config.chains[link.chain]
+                chain_iter = iter(chain)
+                continue
             elif link.action is LinkAction.detour:
-                # Push the current chain so that we can return to it when the
-                # next chain is finished.
-                chain_stack.append(current_chain)
-                current_chain = iter(config.chains[link.chain])
+                # Push the current chain so that we can return to it when
+                # the next chain is finished.
+                chain_stack.append((chain, chain_iter))
+                chain = config.chains[link.chain]
+                chain_iter = iter(chain)
+                continue
             elif link.action is LinkAction.stop:
                 # Stop all processing.
                 return
@@ -354,7 +492,7 @@ def process(start_chain, mlist, msg, msgdata):
             elif link.action is LinkAction.run:
                 link.function(mlist, msg, msgdata)
             else:
-                raise AssertionError('Unknown link action: %s' % link.action)
+                raise AssertionError('Bad link action: %s' % link.action)
         else:
             # The rule did not match; keep going.
             if rule.record:
@@ -370,21 +508,10 @@ def initialize():
             'Duplicate chain name: %s' % chain.name)
         config.chains[chain.name] = chain
     # Set up a couple of other default chains.
-    default = Chain('built-in', _('The built-in moderation chain.'))
-    default.append_link(Link('approved', LinkAction.jump, 'accept'))
-    default.append_link(Link('emergency', LinkAction.jump, 'hold'))
-    default.append_link(Link('loop', LinkAction.jump, 'discard'))
-    # Do all these before deciding whether to hold the message for moderation.
-    default.append_link(Link('administrivia', LinkAction.defer))
-    default.append_link(Link('implicit-dest', LinkAction.defer))
-    default.append_link(Link('max-recipients', LinkAction.defer))
-    default.append_link(Link('max-size', LinkAction.defer))
-    default.append_link(Link('news-moderation', LinkAction.defer))
-    default.append_link(Link('no-subject', LinkAction.defer))
-    default.append_link(Link('suspicious-header', LinkAction.defer))
-    # Now if any of the above hit, jump to the hold chain.
-    default.append_link(Link('any', LinkAction.jump, 'hold'))
-    # Finally, the builtin chain defaults to acceptance.
-    default.append_link(Link('truth', LinkAction.jump, 'accept'))
+    chain = BuiltInChain()
+    config.chains[chain.name] = chain
+    # Create and initialize the header matching chain.
+    chain = HeaderMatchChain()
+    config.chains[chain.name] = chain
     # XXX Read chains from the database and initialize them.
     pass
diff --git a/Mailman/docs/antispam.txt b/Mailman/docs/antispam.txt
deleted file mode 100644
index 3ad5e982e..000000000
--- a/Mailman/docs/antispam.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Anti-spam defences
-==================
-
-By design, Mailman does not have very sophisticated anti-spam measures because
-this type of filtering is done much more efficiently at the MTA level.  For
-example, if Mailman were to do spam detection, it could not reject the message
-at SMTP time.
-
-Still, Mailman does employ a small number of rather ham-handed anti-spam
-measures.
-
-    >>> from Mailman.Handlers.SpamDetect import process
-    >>> from Mailman.queue import Switchboard
-    >>> from Mailman.configuration import config
-    >>> mlist = config.db.list_manager.create(u'_xtest@example.com')
-
-
-Short circuiting
-----------------
-
-If a message is pre-approved, this handler does nothing.
-
-    >>> msg = message_from_string("""\
-    ... From: aperson@example.com
-    ...
-    ... An important message.
-    ... """)
-    >>> msgdata = {'approved': True}
-    >>> process(mlist, msg, msgdata)
-    >>> print msg.as_string()
-    From: aperson@example.com
-    <BLANKLINE>
-    An important message.
-    <BLANKLINE>
-    >>> msgdata
-    {'approved': True}
-
-
-Header matching
----------------
-
-There is a global configuration variable that can be set to a list of header
-matches.  Each item in that list is a 2-tuple of the header to match and a
-regular expression.  For example, if we wanted to block all message that come
-from 'aperson' regardless of the domain, we'd do something like the following
-in our mailman.cfg file:
-
-    >>> old_value = config.KNOWN_SPAMMERS[:]
-    >>> config.KNOWN_SPAMMERS.append(('from', 'aperson'))
-
-Now if the same message is posted to the mailing list, and that message is not
-pre-approved.  The handler will throw an exception that signals the message is
-spam.
-
-    >>> msgdata = {}
-    >>> process(mlist, msg, msgdata)
-    Traceback (most recent call last):
-    ...
-    SpamDetected
-    >>> print msg.as_string()
-    From: aperson@example.com
-    <BLANKLINE>
-    An important message.
-    <BLANKLINE>
-    >>> msgdata
-    {}
-
-    # Restore global state
-    config.KNOWN_SPAMMERS = old_value
-
-
-Header filter rules
--------------------
-
-XXX Need tests.
diff --git a/Mailman/docs/chains.txt b/Mailman/docs/chains.txt
index 0a93683f7..e676957d8 100644
--- a/Mailman/docs/chains.txt
+++ b/Mailman/docs/chains.txt
@@ -36,12 +36,14 @@ The Discard chain simply throws the message away.
     ... An important message.
     ... """)
 
+    >>> from Mailman.app.chains import process
+
     # XXX This checks the vette log file because there is no other evidence
     # that this chain has done anything.
     >>> import os
     >>> fp = open(os.path.join(config.LOG_DIR, 'vette'))
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'discard')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... DISCARD: <first>
@@ -62,7 +64,7 @@ this action.
     >>> chain.description
     u'Reject/bounce a message and stop processing.'
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'reject')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... REJECT: <first>
@@ -111,7 +113,7 @@ sender and the list moderators.
     u'Hold a message and stop processing.'
 
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'hold')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... HOLD: _xtest@example.com post from aperson@example.com held,
@@ -262,7 +264,7 @@ processed and sent on to the list membership.
     >>> chain.description
     u'Accept a message.'
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'accept')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... ACCEPT: <first>
@@ -311,7 +313,7 @@ all default rules.  This message will end up in the prep queue.
 
     >>> file_pos = fp.tell()
     >>> from Mailman.app.chains import process
-    >>> process('built-in', mlist, msg, {})
+    >>> process(mlist, msg, {})
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... ACCEPT: <first>
diff --git a/Mailman/interfaces/chain.py b/Mailman/interfaces/chain.py
index 8c0837820..eca663b30 100644
--- a/Mailman/interfaces/chain.py
+++ b/Mailman/interfaces/chain.py
@@ -69,6 +69,16 @@ class IChain(Interface):
         :return: an IChainLink.
         """
 
+    def get_rule(name):
+        """Lookup and return the named rule.
+
+        :param name: The name of the rule to return.  This may be a globally
+            registered rule name, in which case it must be unique, or it may
+            be a rule defined locally to the chain.
+        :return: The named `IRule`.
+        :raises: KeyError if the named rule cannot be found.
+        """
+
 
 
 class IMutableChain(IChain):
diff --git a/Mailman/rules/docs/emergency.txt b/Mailman/rules/docs/emergency.txt
index 1375c3bc9..aecbcb90d 100644
--- a/Mailman/rules/docs/emergency.txt
+++ b/Mailman/rules/docs/emergency.txt
@@ -21,7 +21,7 @@ rule matches if the flag is set on the mailing list.
 
     >>> from Mailman.app.chains import process
     >>> mlist.emergency = True
-    >>> process('built-in', mlist, msg, {})
+    >>> process(mlist, msg, {}, 'built-in')
 
 There are two messages in the virgin queue.  The one addressed to the original
 sender will contain a token we can use to grab the held message out of the
@@ -69,6 +69,6 @@ However, if the message metadata has a 'moderator_approved' key set, then even
 if the mailing list has its emergency flag set, the message still goes through
 to the membership.
 
-    >>> process('built-in', mlist, msg, dict(moderator_approved=True))
+    >>> process(mlist, msg, dict(moderator_approved=True), 'built-in')
     >>> len(virginq.files)
     0
diff --git a/Mailman/rules/docs/header-matching.txt b/Mailman/rules/docs/header-matching.txt
new file mode 100644
index 000000000..b32feabe5
--- /dev/null
+++ b/Mailman/rules/docs/header-matching.txt
@@ -0,0 +1,89 @@
+Header matching
+===============
+
+Mailman can do pattern based header matching during its normal rule
+processing.  There is a set of site-wide default header matches specified in
+the configuration file under the HEADER_MATCHES variable.
+
+    >>> from Mailman.app.lifecycle import create_list
+    >>> mlist = create_list(u'_xtest@example.com')
+
+Because the default HEADER_MATCHES variable is empty when the configuration
+file is read, we'll just extend the current header matching chain with a
+pattern that matches 4 or more stars, discarding the message if it hits.
+
+    >>> from Mailman.configuration import config
+    >>> chain = config.chains['header-match']
+    >>> chain.extend('x-spam-score', '[*]{4,}', 'discard')
+
+First, if the message has no X-Spam-Score header, the message passes through
+the chain untouched (i.e. no disposition).
+
+    >>> msg = message_from_string("""\
+    ... From: aperson@example.com
+    ... To: _xtest@example.com
+    ... Subject: Not spam
+    ... Message-ID: <one>
+    ...
+    ... This is a message.
+    ... """)
+
+    >>> from Mailman.app.chains import process
+
+Pass through is seen as nothing being in the log file after processing.
+
+    # XXX This checks the vette log file because there is no other evidence
+    # that this chain has done anything.
+    >>> import os
+    >>> fp = open(os.path.join(config.LOG_DIR, 'vette'))
+    >>> fp.seek(0, 2)
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG:
+    <BLANKLINE>
+
+Now, if the header exists but does not match, then it also passes through
+untouched.
+
+    >>> msg['X-Spam-Score'] = '***'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is almost spam'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<two>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG:
+    <BLANKLINE>
+
+But now if the header matches, then the message gets discarded.
+
+    >>> msg['X-Spam-Score'] = '****'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is spam, but barely'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<three>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG: ... DISCARD: <three>
+    <BLANKLINE>
+
+For kicks, let's show a message that's really spammy.
+
+    >>> msg['X-Spam-Score'] = '**********'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is really spammy'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<four>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG: ... DISCARD: <four>
+    <BLANKLINE>
+
diff --git a/Mailman/tests/test_documentation.py b/Mailman/tests/test_documentation.py
index 9faf1d588..ad00ba19c 100644
--- a/Mailman/tests/test_documentation.py
+++ b/Mailman/tests/test_documentation.py
@@ -18,7 +18,6 @@
 """Harness for testing Mailman's documentation."""
 
 import os
-import pdb
 import doctest
 import unittest
author	Barry Warsaw	2008-01-26 18:35:18 -0500
committer	Barry Warsaw	2008-01-26 18:35:18 -0500
commit	3952c95a23a74b8686b55a3a4f1873238e6d6610 (patch)
tree	c994bb588b94da2eac9d851a7da4400861dee25a
parent	df637148d8fa2d5c101a990ee6766ea8547f000a (diff)
download	mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.gz mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.zst mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.zip