# Copyright (C) 2001-2010 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.

"""Extract topics from the original mail message."""

from __future__ import absolute_import, unicode_literals

__metaclass__ = type
__all__ = [
    'Tagger',
    ]


import re
import email.iterators
import email.parser

from zope.interface import implements

from mailman.core.i18n import _
from mailman.interfaces.handler import IHandler


OR = '|'
CRNL = '\r\n'
EMPTYBYTES = b''
NLTAB = '\n\t'



def process(mlist, msg, msgdata):
    """Tag the message for topics.

    When `mlist.topics_enabled` is false this is a no-op.  Otherwise the
    Subject: and Keywords: header values, plus any header-like lines found
    at the top of the body by `scanbody()`, are matched against every topic
    pattern of the list.

    :param mlist: The mailing list.  The attributes read here are
        `topics_enabled`, `topics_bodylines_limit` (0 means scan no body
        lines, a negative value means scan all of them, a positive value is
        the maximum number of non-blank body lines to scan) and `topics`,
        an iterable of `(name, pattern, description, emptyflag)` 4-tuples.
    :param msg: The message being processed.  On a match it gains an
        X-Topics: header listing the matching topic names.
    :param msgdata: The message metadata dictionary.  On a match its
        'topichits' key is set to the sorted list of matching topic names.
    """
    if not mlist.topics_enabled:
        return
    # Extract the Subject:, Keywords:, and possibly body text.
    matchlines = [msg.get('subject', None), msg.get('keywords', None)]
    limit = mlist.topics_bodylines_limit
    if limit < 0:
        # Scan all body lines.
        matchlines.extend(scanbody(msg))
    elif limit > 0:
        # Scan just some of the body lines.
        matchlines.extend(scanbody(msg, limit))
    # A limit of zero means don't scan any body lines at all.
    #
    # Filter out any 'false' items, e.g. missing headers.
    matchlines = [item for item in matchlines if item]
    # For each regular expression in the topics list, see if any of the lines
    # of interest from the message match the regexp.  If so, the message gets
    # added to the specific topics bucket.
    hits = set()
    # pylint: disable-msg=W0612
    for name, pattern, desc, emptyflag in mlist.topics:
        # Each line of a multi-line pattern is an alternative.
        pattern = OR.join(pattern.splitlines())
        cre = re.compile(pattern, re.IGNORECASE)
        if any(cre.search(line) for line in matchlines):
            hits.add(name)
    if hits:
        # Sort the names and make them available both in the message metadata
        # and in a message header.
        topic_names = sorted(hits)
        msgdata['topichits'] = topic_names
        msg['X-Topics'] = NLTAB.join(topic_names)



def scanbody(msg, numlines=None):
    """Scan the body for keywords.

    Only a text/plain body is considered: either the message itself, or the
    first text/plain part directly inside a multipart/alternative message.
    The leading non-blank lines of that body are parsed as if they were
    message headers.

    :param msg: The message whose body should be scanned.
    :param numlines: The maximum number of non-blank body lines to look at,
        or None (the default) to look at all of them.
    :return: The list of Subject: values followed by the Keywords: values
        found at the top of the body.  The list is empty when there is no
        suitable text/plain body.
    """
    # We only scan the body of the message if it is of MIME type text/plain,
    # or if the outer type is multipart/alternative and there is a text/plain
    # part.  Anything else, and the body is ignored for header-scan purposes.
    found = None
    if msg.get_content_type() == 'text/plain':
        found = msg
    elif (msg.is_multipart()
          and msg.get_content_type() == 'multipart/alternative'):
        for part in msg.get_payload():
            if part.get_content_type() == 'text/plain':
                found = part
                break
    # Compare against None explicitly.  Message objects define __len__() as
    # their number of headers, so a text/plain part without any headers is
    # false in a boolean context and would otherwise be skipped.
    if found is None:
        return []
    # Now that we have a Message object that meets our criteria, let's extract
    # the first numlines of body text.  Iterate over the selected part only,
    # so that sibling parts (e.g. text/html) are neither scanned nor counted
    # against the limit.
    lines = []
    for line in email.iterators.body_line_iterator(found):
        if numlines is not None and len(lines) >= numlines:
            break
        # NOTE(review): bytes() is presumably relied on as an alias for the
        # Python 2 `str` type here -- confirm before porting to Python 3.
        line = bytes(line)
        # Blank lines don't count.
        if not line.strip():
            continue
        lines.append(line)
    # Concatenate those body text lines (they still carry their own line
    # endings), and then create a new message object from them.
    parser = _ForgivingParser()
    parsed = parser.parsestr(EMPTYBYTES.join(lines))
    return parsed.get_all('subject', []) + parsed.get_all('keywords', [])



class _ForgivingParser(email.parser.HeaderParser):
    """A lax email parser.

    Be a little more forgiving about non-header/continuation lines, since
    we'll just read as much as we can from 'header-like' lines in the body.
    """
    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
    # specialize the way it returns?
    #
    # NOTE(review): confirm that the email.parser implementation of the
    # supported Python versions still dispatches to _parseheaders();
    # FeedParser-based implementations may never call this override.

    def _parseheaders(self, container, fp):
        """See `email.parser.HeaderParser`.

        Read header-like lines from `fp` and store them on `container`.
        Instead of raising an exception, parsing simply stops at the first
        line that does not look like a header or a continuation line.

        :param container: The message object receiving the headers.
        :param fp: A file-like object providing `readline()`.
        """
        # Name of the header currently being collected, and the physical
        # lines that make up its value.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Ignore the trailing newline.
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line.  It is only accepted as the
            # very first line; anywhere else it ends the header block.
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    break
            # Header continuation line.  Without a preceding header there is
            # nothing to continue, so stop parsing.
            if line[0] in ' \t':
                if not lastheader:
                    break
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                # Not header-like; this is where we are forgiving.
                break
            # A new header starts, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NLTAB.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header.
        if lastheader:
            container[lastheader] = NLTAB.join(lastvalue)



class Tagger:
    """Tag messages with topic matches."""

    implements(IHandler)

    name = 'tagger'
    description = _('Tag messages with topic matches.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`."""
        # Delegates to the module-level process() function.
        process(mlist, msg, msgdata)