Port header matching previously described by the misnamed KNOWN_SPAMMERS

variable to the new chain processing system. Create a header-matching chain which handles global site header matching rules defined by HEADER_MATCHES. Rip this logic out of the SpamDetect handler; really this entire handler needs to go away but the rest of it isn't ported yet. IChains now have a get_rule() method which allows them to return private (i.e. not globally registered) rules. This is used by the header matching chain. Mailman.app.chains.process() has had its parameter list reordered to be more like all other function signatures that take a mailing list, message, and message metadata.
author: Barry Warsaw 2008-01-26 18:35:18 -0500
committer: Barry Warsaw 2008-01-26 18:35:18 -0500
commit: 3952c95a23a74b8686b55a3a4f1873238e6d6610 (patch)
tree: c994bb588b94da2eac9d851a7da4400861dee25a
parent: df637148d8fa2d5c101a990ee6766ea8547f000a (diff)
download: mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.gz
mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.zst
mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.zip
9 files changed, 281 insertions, 132 deletions
diff --git a/Mailman/Defaults.py b/Mailman/Defaults.py
index 75e1c768f..82fb6886f 100644
--- a/Mailman/Defaults.py
+++ b/Mailman/Defaults.py
@@ -164,16 +164,21 @@ DEFAULT_URL_HOST    = '@URLHOST@'
 # Spam avoidance defaults
 #####
 
-# This variable contains a list of 2-tuple of the format (header, regex) which
-# the Mailman/Handlers/SpamDetect.py module uses to match against the current
-# message.  If the regex matches the given header in the current message, then
-# it is flagged as spam.  header is case-insensitive and should not include
-# the trailing colon.  regex is always matched with re.IGNORECASE.
+# This variable contains a list of tuples of the format:
 #
-# Note that the more searching done, the slower the whole process gets.  Spam
-# detection is run against all messages coming to either the list, or the
-# -owners address, unless the message is explicitly approved.
-KNOWN_SPAMMERS = []
+#   (header, pattern[, chain])
+#
+# which is used to match against the current message's headers.  If the
+# pattern matches the given header in the current message, then the named
+# chain is jumped to.  header is case-insensitive and should not include the
+# trailing colon.  pattern is always matched with re.IGNORECASE.  chain is
+# optional; if not given the 'hold' chain is used, but if given it may be any
+# existing chain, such as 'discard', 'reject', or 'accept'.
+#
+# Note that the more searching done, the slower the whole process gets.
+# Header matching is run against all messages coming to either the list, or
+# the -owners address, unless the message is explicitly approved.
+HEADER_MATCHES = []
 
 
 
diff --git a/Mailman/Handlers/SpamDetect.py b/Mailman/Handlers/SpamDetect.py
index b7e687284..f45b52a32 100644
--- a/Mailman/Handlers/SpamDetect.py
+++ b/Mailman/Handlers/SpamDetect.py
@@ -88,14 +88,6 @@ class HeaderGenerator(Generator):
 def process(mlist, msg, msgdata):
     if msgdata.get('approved'):
         return
-    # First do site hard coded header spam checks
-    for header, regex in config.KNOWN_SPAMMERS:
-        cre = re.compile(regex, re.IGNORECASE)
-        for value in msg.get_all(header, []):
-            mo = cre.search(value)
-            if mo:
-                # we've detected spam, so throw the message away
-                raise SpamDetected
     # Now do header_filter_rules
     # TK: Collect headers in sub-parts because attachment filename
     # extension may be a clue to possible virus/spam.
diff --git a/Mailman/app/chains.py b/Mailman/app/chains.py
index 38c7325ab..c327f8234 100644
--- a/Mailman/app/chains.py
+++ b/Mailman/app/chains.py
@@ -33,6 +33,7 @@ __metaclass__ = type
 __i18n_templates__ = True
 
 
+import re
 import logging
 
 from email.mime.message import MIMEMessage
@@ -49,7 +50,7 @@ from Mailman.app.replybot import autorespond_to_sender, can_acknowledge
 from Mailman.configuration import config
 from Mailman.i18n import _
 from Mailman.interfaces import (
-    IChain, IChainLink, IMutableChain, IPendable, LinkAction)
+    IChain, IChainLink, IMutableChain, IPendable, IRule, LinkAction)
 from Mailman.queue import Switchboard
 
 log = logging.getLogger('mailman.vette')
@@ -91,8 +92,19 @@ class TerminalChainBase:
         yield Link('truth', LinkAction.stop)
 
     def process(self, mlist, msg, msgdata):
+        """Process the message for the given mailing list.
+
+        This must be overridden by subclasses.
+        """
         raise NotImplementedError
 
+    def get_rule(self, name):
+        """See `IChain`.
+
+        This always returns the globally registered named rule.
+        """
+        return config.rules[name]
+
 
 class DiscardChain(TerminalChainBase):
     """Discard a message."""
@@ -283,7 +295,7 @@ class AcceptChain(TerminalChainBase):
 
 
 class Chain:
-    """Default built-in moderation chain."""
+    """Generic chain base class."""
     implements(IMutableChain)
 
     def __init__(self, name, description):
@@ -307,9 +319,124 @@ class Chain:
         for link in self._links:
             yield link
 
+    def get_rule(self, name):
+        """See `IChain`.
+
+        This always returns the globally registered named rule.
+        """
+        return config.rules[name]
+
+
+
+class BuiltInChain(Chain):
+    """Default built-in chain."""
+
+    def __init__(self):
+        super(BuiltInChain, self).__init__(
+            'built-in', _('The built-in moderation chain.'))
+        self.append_link(Link('approved', LinkAction.jump, 'accept'))
+        self.append_link(Link('emergency', LinkAction.jump, 'hold'))
+        self.append_link(Link('loop', LinkAction.jump, 'discard'))
+        # Do all of the following before deciding whether to hold the message
+        # for moderation.
+        self.append_link(Link('administrivia', LinkAction.defer))
+        self.append_link(Link('implicit-dest', LinkAction.defer))
+        self.append_link(Link('max-recipients', LinkAction.defer))
+        self.append_link(Link('max-size', LinkAction.defer))
+        self.append_link(Link('news-moderation', LinkAction.defer))
+        self.append_link(Link('no-subject', LinkAction.defer))
+        self.append_link(Link('suspicious-header', LinkAction.defer))
+        # Now if any of the above hit, jump to the hold chain.
+        self.append_link(Link('any', LinkAction.jump, 'hold'))
+        # Take a detour through the default header matching chain, which we'll
+        # create later.
+        self.append_link(Link('truth', LinkAction.detour, 'header-match'))
+        # Finally, the builtin chain defaults to acceptance.
+        self.append_link(Link('truth', LinkAction.jump, 'accept'))
+
+
+
+class HeaderMatchRule:
+    """Header matching rule used by header-match chain."""
+    implements(IRule)
+
+    # Sequential rule counter.
+    _count = 1
+
+    def __init__(self, header, pattern):
+        self._header = header
+        self._pattern = pattern
+        self.name = 'header-match-%002d' % HeaderMatchRule._count
+        HeaderMatchRule._count += 1
+        self.description = u'%s: %s' % (header, pattern)
+        # XXX I think we should do better here, somehow recording that a
+        # particular header matched a particular pattern, but that gets ugly
+        # with RFC 2822 headers.  It also doesn't match well with the rule
+        # name concept.  For now, we just record the rather useless numeric
+        # rule name.  I suppose we could do the better hit recording in the
+        # check() method, and set self.record = False.
+        self.record = True
+
+    def check(self, mlist, msg, msgdata):
+        """See `IRule`."""
+        for value in msg.get_all(self._header, []):
+            if re.search(self._pattern, value, re.IGNORECASE):
+                return True
+        return False
+
+
+class HeaderMatchChain(Chain):
+    """Default header matching chain.
+
+    This could be extended by header match rules in the database.
+    """
+
+    def __init__(self):
+        super(HeaderMatchChain, self).__init__(
+            'header-match', _('The built-in header matching chain'))
+        # The header match rules are not global, so don't register them.
+        # These are the only rules that the header match chain can execute.
+        self._links = []
+        self._rules = {}
+        # Initialize header check rules with those from the global
+        # HEADER_MATCHES variable.
+        for entry in config.HEADER_MATCHES:
+            if len(entry) == 2:
+                header, pattern = entry
+                chain = 'hold'
+            elif len(entry) == 3:
+                header, pattern, chain = entry
+                # We don't assert that the chain exists here because the jump
+                # chain may not yet have been created.
+            else:
+                raise AssertionError(
+                    'Bad entry for HEADER_MATCHES: %s' % entry)
+            self.extend(header, pattern, chain)
+
+    def extend(self, header, pattern, chain='hold'):
+        """Extend the existing header matches.
+
+        :param header: The case-insensitive header field name.
+        :param pattern: The pattern to match the header's value against.  The
+            match is not anchored and is done case-insensitively.
+        :param chain: Optional chain to jump to if the pattern matches any of
+            the named header values.  If not given, the 'hold' chain is used.
+        """
+        rule = HeaderMatchRule(header, pattern)
+        self._rules[rule.name] = rule
+        link = Link(rule.name, LinkAction.jump, chain)
+        self._links.append(link)
+
+    def get_rule(self, name):
+        """See `IChain`.
+
+        Only local rules are findable by this chain.
+        """
+        return self._rules[name]
+
 
 
-def process(start_chain, mlist, msg, msgdata):
+def process(mlist, msg, msgdata, start_chain='built-in'):
     """Process the message through a chain.
 
     :param start_chain: The name of the chain to start the processing with.
@@ -317,34 +444,45 @@ def process(start_chain, mlist, msg, msgdata):
     :param msg: The Message object.
     :param msgdata: The message metadata dictionary.
     """
-    # Find the starting chain.
-    current_chain = iter(config.chains[start_chain])
+    # Set up some bookkeeping.
     chain_stack = []
     msgdata['rule_hits'] = hits = []
     msgdata['rule_misses'] = misses = []
-    while current_chain:
+    # Find the starting chain and begin iterating through its links.
+    chain = config.chains[start_chain]
+    chain_iter = iter(chain)
+    # Loop until we've reached the end of all processing chains.
+    while chain:
+        # Iterate over all links in the chain.  Do this outside a for-loop so
+        # we can capture a chain's link iterator in mid-flight.  This supports
+        # the 'detour' link action
         try:
-            link = current_chain.next()
+            link = chain_iter.next()
         except StopIteration:
             # This chain is exhausted.  Pop the last chain on the stack and
-            # continue.
+            # continue iterating through it.  If there's nothing left on the
+            # chain stack then we're completely finished processing.
             if len(chain_stack) == 0:
                 return
-            current_chain = chain_stack.pop()
+            chain, chain_iter = chain_stack.pop()
             continue
         # Process this link.
-        rule = config.rules[link.rule]
+        rule = chain.get_rule(link.rule)
         if rule.check(mlist, msg, msgdata):
             if rule.record:
                 hits.append(link.rule)
             # The rule matched so run its action.
             if link.action is LinkAction.jump:
-                current_chain = iter(config.chains[link.chain])
+                chain = config.chains[link.chain]
+                chain_iter = iter(chain)
+                continue
             elif link.action is LinkAction.detour:
-                # Push the current chain so that we can return to it when the
-                # next chain is finished.
-                chain_stack.append(current_chain)
-                current_chain = iter(config.chains[link.chain])
+                # Push the current chain so that we can return to it when
+                # the next chain is finished.
+                chain_stack.append((chain, chain_iter))
+                chain = config.chains[link.chain]
+                chain_iter = iter(chain)
+                continue
             elif link.action is LinkAction.stop:
                 # Stop all processing.
                 return
@@ -354,7 +492,7 @@ def process(start_chain, mlist, msg, msgdata):
             elif link.action is LinkAction.run:
                 link.function(mlist, msg, msgdata)
             else:
-                raise AssertionError('Unknown link action: %s' % link.action)
+                raise AssertionError('Bad link action: %s' % link.action)
         else:
             # The rule did not match; keep going.
             if rule.record:
@@ -370,21 +508,10 @@ def initialize():
             'Duplicate chain name: %s' % chain.name)
         config.chains[chain.name] = chain
     # Set up a couple of other default chains.
-    default = Chain('built-in', _('The built-in moderation chain.'))
-    default.append_link(Link('approved', LinkAction.jump, 'accept'))
-    default.append_link(Link('emergency', LinkAction.jump, 'hold'))
-    default.append_link(Link('loop', LinkAction.jump, 'discard'))
-    # Do all these before deciding whether to hold the message for moderation.
-    default.append_link(Link('administrivia', LinkAction.defer))
-    default.append_link(Link('implicit-dest', LinkAction.defer))
-    default.append_link(Link('max-recipients', LinkAction.defer))
-    default.append_link(Link('max-size', LinkAction.defer))
-    default.append_link(Link('news-moderation', LinkAction.defer))
-    default.append_link(Link('no-subject', LinkAction.defer))
-    default.append_link(Link('suspicious-header', LinkAction.defer))
-    # Now if any of the above hit, jump to the hold chain.
-    default.append_link(Link('any', LinkAction.jump, 'hold'))
-    # Finally, the builtin chain defaults to acceptance.
-    default.append_link(Link('truth', LinkAction.jump, 'accept'))
+    chain = BuiltInChain()
+    config.chains[chain.name] = chain
+    # Create and initialize the header matching chain.
+    chain = HeaderMatchChain()
+    config.chains[chain.name] = chain
     # XXX Read chains from the database and initialize them.
     pass
diff --git a/Mailman/docs/antispam.txt b/Mailman/docs/antispam.txt
deleted file mode 100644
index 3ad5e982e..000000000
--- a/Mailman/docs/antispam.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Anti-spam defences
-==================
-
-By design, Mailman does not have very sophisticated anti-spam measures because
-this type of filtering is done much more efficiently at the MTA level.  For
-example, if Mailman were to do spam detection, it could not reject the message
-at SMTP time.
-
-Still, Mailman does employ a small number of rather ham-handed anti-spam
-measures.
-
-    >>> from Mailman.Handlers.SpamDetect import process
-    >>> from Mailman.queue import Switchboard
-    >>> from Mailman.configuration import config
-    >>> mlist = config.db.list_manager.create(u'_xtest@example.com')
-
-
-Short circuiting
-----------------
-
-If a message is pre-approved, this handler does nothing.
-
-    >>> msg = message_from_string("""\
-    ... From: aperson@example.com
-    ...
-    ... An important message.
-    ... """)
-    >>> msgdata = {'approved': True}
-    >>> process(mlist, msg, msgdata)
-    >>> print msg.as_string()
-    From: aperson@example.com
-    <BLANKLINE>
-    An important message.
-    <BLANKLINE>
-    >>> msgdata
-    {'approved': True}
-
-
-Header matching
----------------
-
-There is a global configuration variable that can be set to a list of header
-matches.  Each item in that list is a 2-tuple of the header to match and a
-regular expression.  For example, if we wanted to block all message that come
-from 'aperson' regardless of the domain, we'd do something like the following
-in our mailman.cfg file:
-
-    >>> old_value = config.KNOWN_SPAMMERS[:]
-    >>> config.KNOWN_SPAMMERS.append(('from', 'aperson'))
-
-Now if the same message is posted to the mailing list, and that message is not
-pre-approved.  The handler will throw an exception that signals the message is
-spam.
-
-    >>> msgdata = {}
-    >>> process(mlist, msg, msgdata)
-    Traceback (most recent call last):
-    ...
-    SpamDetected
-    >>> print msg.as_string()
-    From: aperson@example.com
-    <BLANKLINE>
-    An important message.
-    <BLANKLINE>
-    >>> msgdata
-    {}
-
-    # Restore global state
-    config.KNOWN_SPAMMERS = old_value
-
-
-Header filter rules
--------------------
-
-XXX Need tests.
diff --git a/Mailman/docs/chains.txt b/Mailman/docs/chains.txt
index 0a93683f7..e676957d8 100644
--- a/Mailman/docs/chains.txt
+++ b/Mailman/docs/chains.txt
@@ -36,12 +36,14 @@ The Discard chain simply throws the message away.
     ... An important message.
     ... """)
 
+    >>> from Mailman.app.chains import process
+
     # XXX This checks the vette log file because there is no other evidence
     # that this chain has done anything.
     >>> import os
     >>> fp = open(os.path.join(config.LOG_DIR, 'vette'))
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'discard')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... DISCARD: <first>
@@ -62,7 +64,7 @@ this action.
     >>> chain.description
     u'Reject/bounce a message and stop processing.'
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'reject')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... REJECT: <first>
@@ -111,7 +113,7 @@ sender and the list moderators.
     u'Hold a message and stop processing.'
 
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'hold')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... HOLD: _xtest@example.com post from aperson@example.com held,
@@ -262,7 +264,7 @@ processed and sent on to the list membership.
     >>> chain.description
     u'Accept a message.'
     >>> file_pos = fp.tell()
-    >>> chain.process(mlist, msg, {})
+    >>> process(mlist, msg, {}, 'accept')
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... ACCEPT: <first>
@@ -311,7 +313,7 @@ all default rules.  This message will end up in the prep queue.
 
     >>> file_pos = fp.tell()
     >>> from Mailman.app.chains import process
-    >>> process('built-in', mlist, msg, {})
+    >>> process(mlist, msg, {})
     >>> fp.seek(file_pos)
     >>> print 'LOG:', fp.read()
     LOG: ... ACCEPT: <first>
diff --git a/Mailman/interfaces/chain.py b/Mailman/interfaces/chain.py
index 8c0837820..eca663b30 100644
--- a/Mailman/interfaces/chain.py
+++ b/Mailman/interfaces/chain.py
@@ -69,6 +69,16 @@ class IChain(Interface):
         :return: an IChainLink.
         """
 
+    def get_rule(name):
+        """Lookup and return the named rule.
+
+        :param name: The name of the rule to return.  This may be a globally
+            registered rule name, in which case it must be unique, or it may
+            be a rule defined locally to the chain.
+        :return: The named `IRule`.
+        :raises: KeyError if the named rule cannot be found.
+        """
+
 
 
 class IMutableChain(IChain):
diff --git a/Mailman/rules/docs/emergency.txt b/Mailman/rules/docs/emergency.txt
index 1375c3bc9..aecbcb90d 100644
--- a/Mailman/rules/docs/emergency.txt
+++ b/Mailman/rules/docs/emergency.txt
@@ -21,7 +21,7 @@ rule matches if the flag is set on the mailing list.
 
     >>> from Mailman.app.chains import process
     >>> mlist.emergency = True
-    >>> process('built-in', mlist, msg, {})
+    >>> process(mlist, msg, {}, 'built-in')
 
 There are two messages in the virgin queue.  The one addressed to the original
 sender will contain a token we can use to grab the held message out of the
@@ -69,6 +69,6 @@ However, if the message metadata has a 'moderator_approved' key set, then even
 if the mailing list has its emergency flag set, the message still goes through
 to the membership.
 
-    >>> process('built-in', mlist, msg, dict(moderator_approved=True))
+    >>> process(mlist, msg, dict(moderator_approved=True), 'built-in')
     >>> len(virginq.files)
     0
diff --git a/Mailman/rules/docs/header-matching.txt b/Mailman/rules/docs/header-matching.txt
new file mode 100644
index 000000000..b32feabe5
--- /dev/null
+++ b/Mailman/rules/docs/header-matching.txt
@@ -0,0 +1,89 @@
+Header matching
+===============
+
+Mailman can do pattern based header matching during its normal rule
+processing.  There is a set of site-wide default header matches specified in
+the configuration file under the HEADER_MATCHES variable.
+
+    >>> from Mailman.app.lifecycle import create_list
+    >>> mlist = create_list(u'_xtest@example.com')
+
+Because the default HEADER_MATCHES variable is empty when the configuration
+file is read, we'll just extend the current header matching chain with a
+pattern that matches 4 or more stars, discarding the message if it hits.
+
+    >>> from Mailman.configuration import config
+    >>> chain = config.chains['header-match']
+    >>> chain.extend('x-spam-score', '[*]{4,}', 'discard')
+
+First, if the message has no X-Spam-Score header, the message passes through
+the chain untouched (i.e. no disposition).
+
+    >>> msg = message_from_string("""\
+    ... From: aperson@example.com
+    ... To: _xtest@example.com
+    ... Subject: Not spam
+    ... Message-ID: <one>
+    ...
+    ... This is a message.
+    ... """)
+
+    >>> from Mailman.app.chains import process
+
+Pass through is seen as nothing being in the log file after processing.
+
+    # XXX This checks the vette log file because there is no other evidence
+    # that this chain has done anything.
+    >>> import os
+    >>> fp = open(os.path.join(config.LOG_DIR, 'vette'))
+    >>> fp.seek(0, 2)
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG:
+    <BLANKLINE>
+
+Now, if the header exists but does not match, then it also passes through
+untouched.
+
+    >>> msg['X-Spam-Score'] = '***'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is almost spam'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<two>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG:
+    <BLANKLINE>
+
+But now if the header matches, then the message gets discarded.
+
+    >>> msg['X-Spam-Score'] = '****'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is spam, but barely'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<three>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG: ... DISCARD: <three>
+    <BLANKLINE>
+
+For kicks, let's show a message that's really spammy.
+
+    >>> msg['X-Spam-Score'] = '**********'
+    >>> del msg['subject']
+    >>> msg['Subject'] = 'This is really spammy'
+    >>> del msg['message-id']
+    >>> msg['Message-ID'] = '<four>'
+    >>> file_pos = fp.tell()
+    >>> process(mlist, msg, {}, 'header-match')
+    >>> fp.seek(file_pos)
+    >>> print 'LOG:', fp.read()
+    LOG: ... DISCARD: <four>
+    <BLANKLINE>
+
diff --git a/Mailman/tests/test_documentation.py b/Mailman/tests/test_documentation.py
index 9faf1d588..ad00ba19c 100644
--- a/Mailman/tests/test_documentation.py
+++ b/Mailman/tests/test_documentation.py
@@ -18,7 +18,6 @@
 """Harness for testing Mailman's documentation."""
 
 import os
-import pdb
 import doctest
 import unittest
author	Barry Warsaw	2008-01-26 18:35:18 -0500
committer	Barry Warsaw	2008-01-26 18:35:18 -0500
commit	3952c95a23a74b8686b55a3a4f1873238e6d6610 (patch)
tree	c994bb588b94da2eac9d851a7da4400861dee25a
parent	df637148d8fa2d5c101a990ee6766ea8547f000a (diff)
download	mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.gz mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.tar.zst mailman-3952c95a23a74b8686b55a3a4f1873238e6d6610.zip