author: Barry Warsaw 2012-03-23 19:25:27 -0400
committer: Barry Warsaw 2012-03-23 19:25:27 -0400
commit: 25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc (patch)
tree: f51fe7931545e23058cdb65595c7caed9b43bb0e /src/mailman/handlers/tagger.py
parent: aa2d0ad067adfd2515ed3c256cd0bca296058479 (diff)
parent: e005e1b12fa0bd82d2e126df476b5505b440ce36 (diff)
download: mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.tar.gz
mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.tar.zst
mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.zip
1 files changed, 191 insertions, 0 deletions
diff --git a/src/mailman/handlers/tagger.py b/src/mailman/handlers/tagger.py
new file mode 100644
index 000000000..49e004a12
--- /dev/null
+++ b/src/mailman/handlers/tagger.py
@@ -0,0 +1,191 @@
+# Copyright (C) 2001-2012 by the Free Software Foundation, Inc.
+#
+# This file is part of GNU Mailman.
+#
+# GNU Mailman is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Extract topics from the original mail message."""
+
+from __future__ import absolute_import, unicode_literals
+
+__metaclass__ = type
+__all__ = [
+    'Tagger',
+    ]
+
+
+import re
+import email.iterators
+import email.parser
+
+from zope.interface import implements
+
+from mailman.core.i18n import _
+from mailman.interfaces.handler import IHandler
+
+
+OR = '|'  # joins the lines of a topic pattern into regexp alternatives
+CRNL = '\r\n'  # apparently unused within this module
+EMPTYBYTES = b''  # joins the body lines handed to the header parser
+NLTAB = '\n\t'  # joins multiple values into one continued header value
+
+
+
+def process(mlist, msg, msgdata):
+    """Record which of the mailing list's topics this message matches.
+
+    The Subject: and Keywords: headers, plus header-like lines from the body
+    (subject to `mlist.topics_bodylines_limit`), are searched with each
+    topic's regular expression.  Matching topic names are stored, sorted,
+    in `msgdata['topichits']` and in the X-Topics: header.
+    """
+    if not mlist.topics_enabled:
+        return
+    # Start with the two headers of interest; a missing one yields None.
+    candidates = [msg.get('subject', None), msg.get('keywords', None)]
+    limit = mlist.topics_bodylines_limit
+    if limit < 0:
+        # A negative limit means scan all body lines.
+        candidates.extend(scanbody(msg))
+    elif limit != 0:
+        # A positive limit scans just some of the body lines; a limit of
+        # zero means don't scan any body lines.
+        candidates.extend(scanbody(msg, limit))
+    # Filter out any 'false' items.
+    candidates = [text for text in candidates if text]
+    # A topic is hit as soon as one candidate matches its regexp; every line
+    # of a multi-line pattern counts as an alternative.
+    matched = set()
+    for name, pattern, desc, emptyflag in mlist.topics:
+        cre = re.compile(OR.join(pattern.splitlines()), re.IGNORECASE)
+        if any(cre.search(text) for text in candidates):
+            matched.add(name)
+    if not matched:
+        return
+    # Publish the sorted names in the message metadata and in a header.
+    topics = sorted(matched)
+    msgdata['topichits'] = topics
+    msg['X-Topics'] = NLTAB.join(topics)
+
+
+
+def scanbody(msg, numlines=None):
+    """Return Subject:/Keywords: values found in header-like body lines.
+
+    Only a text/plain message, or the first text/plain part of a
+    multipart/alternative message, is scanned; otherwise [] is returned.
+    At most `numlines` non-blank lines are read (None means no limit).
+    """
+    found = None
+    if msg.get_content_type() == 'text/plain':
+        found = msg
+    elif msg.is_multipart()\
+         and msg.get_content_type() == 'multipart/alternative':
+        for found in msg.get_payload():
+            if found.get_content_type() == 'text/plain':
+                break
+        else:
+            found = None
+    # Compare against None explicitly: a Message with no headers has a
+    # length of zero and so tests false, yet it is still a usable part.
+    if found is None:
+        return []
+    # Extract the first numlines non-blank lines from the part that met the
+    # criteria above, rather than from every text part of the whole message.
+    # Iterating lazily also avoids reading more of the body than needed.
+    lines = []
+    for line in email.iterators.body_line_iterator(found):
+        if numlines is not None and len(lines) >= numlines:
+            break
+        line = bytes(line)
+        # Blank lines don't count
+        if line.strip():
+            lines.append(line)
+    # Join the collected lines and parse them as though they were a block of
+    # message headers; only Subject: and Keywords: values are returned.
+    p = _ForgivingParser()
+    msg = p.parsestr(EMPTYBYTES.join(lines))
+    return msg.get_all('subject', []) + msg.get_all('keywords', [])
+
+
+
+class _ForgivingParser(email.parser.HeaderParser):
+    """A lax header parser for header-like text taken from a message body.
+
+    Instead of complaining about a line that is neither a header nor a
+    continuation, it stops there and keeps the headers read so far.
+    """
+    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
+    # specialize the way it returns?
+    def _parseheaders(self, container, fp):
+        """See `email.parser.HeaderParser`."""
+        # NOTE(review): confirm the base class's parse() still calls this
+        # hook; a feed-parser based implementation may bypass it entirely.
+        def store(header, values):
+            # Set a completed header on the container, if there is one.
+            if header:
+                container[header] = NLTAB.join(values)
+        current = ''
+        pieces = []
+        count = 0
+        while True:
+            # Test for end of input before stripping, because whitespace-only
+            # header lines are RFC compliant continuation lines.
+            raw = fp.readline()
+            if not raw:
+                break
+            # Ignore the trailing newline; an empty line ends the headers.
+            text = raw.splitlines()[0]
+            if not text:
+                break
+            count += 1
+            if text.startswith('From '):
+                # A Unix From_ line is only accepted as the very first line.
+                if count != 1:
+                    break
+                container.set_unixfrom(text)
+                continue
+            if text[0] in ' \t':
+                # A continuation line needs a preceding header to extend.
+                if not current:
+                    break
+                pieces.append(text)
+                continue
+            # Normal, non-continuation header; without a colon, stop parsing.
+            # BAW: this should check to make sure it's a legal header, e.g.
+            # doesn't contain spaces.  Also, we should expose the header
+            # matching algorithm in the API, and allow for a non-strict
+            # parsing mode (that ignores the line instead of raising).
+            colon = text.find(':')
+            if colon < 0:
+                break
+            store(current, pieces)
+            current = text[:colon]
+            pieces = [text[colon+1:].lstrip()]
+        # Make sure we retain the last header
+        store(current, pieces)
+
+
+
+class Tagger:
+    """Handler that tags messages with the topics they match."""
+
+    implements(IHandler)
+
+    name = 'tagger'
+    description = _('Tag messages with topic matches.')
+
+    def process(self, mlist, msg, msgdata):
+        """See `IHandler`; delegates to the module-level `process()`."""
+        process(mlist, msg, msgdata)
author	Barry Warsaw	2012-03-23 19:25:27 -0400
committer	Barry Warsaw	2012-03-23 19:25:27 -0400
commit	25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc (patch)
tree	f51fe7931545e23058cdb65595c7caed9b43bb0e /src/mailman/handlers/tagger.py
parent	aa2d0ad067adfd2515ed3c256cd0bca296058479 (diff)
parent	e005e1b12fa0bd82d2e126df476b5505b440ce36 (diff)
download	mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.tar.gz mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.tar.zst mailman-25a392bf5c1a8d4d2bc63d51697350fc7dbd48bc.zip