src/mailman/handlers/tagger.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

# Copyright (C) 2001-2016 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.

"""Extract topics from the original mail message."""

__all__ = [
    'Tagger',
    ]


import re
import email.iterators
import email.parser

from mailman.core.i18n import _
from mailman.interfaces.handler import IHandler
from zope.interface import implementer


OR = '|'
CRNL = '\r\n'
EMPTYSTRING = ''
NLTAB = '\n\t'


def process(mlist, msg, msgdata):
    """Tag the message for topics."""
    if not mlist.topics_enabled:
        return
    # Extract the Subject:, Keywords:, and possibly body text
    matchlines = []
    matchlines.append(msg.get('subject', None))
    matchlines.append(msg.get('keywords', None))
    if mlist.topics_bodylines_limit == 0:
        # Don't scan any body lines
        pass
    elif mlist.topics_bodylines_limit < 0:
        # Scan all body lines
        matchlines.extend(scanbody(msg))
    else:
        # Scan just some of the body lines
        matchlines.extend(scanbody(msg, mlist.topics_bodylines_limit))
    # Filter out any 'false' items.
    matchlines = [item for item in matchlines if item]
    # For each regular expression in the topics list, see if any of the lines
    # of interest from the message match the regexp.  If so, the message gets
    # added to the specific topics bucket.
    hits = {}
    for name, pattern, desc, emptyflag in mlist.topics:
        pattern = OR.join(pattern.splitlines())
        cre = re.compile(pattern, re.IGNORECASE)
        for line in matchlines:
            if cre.search(line):
                hits[name] = 1
                break
    if hits:
        # Sort the keys and make them available both in the message metadata
        # and in a message header.
        msgdata['topichits'] = sorted(hits)
        msg['X-Topics'] = NLTAB.join(sorted(hits))


def scanbody(msg, numlines=None):
    """Scan the body for keywords."""
    # We only scan the body of the message if it is of MIME type text/plain,
    # or if the outer type is multipart/alternative and there is a text/plain
    # part.  Anything else, and the body is ignored for header-scan purposes.
    found = None
    if msg.get_content_type() == 'text/plain':
        found = msg
    elif msg.is_multipart()\
         and msg.get_content_type() == 'multipart/alternative':
        for found in msg.get_payload():
            if found.get_content_type() == 'text/plain':
                break
        else:
            found = None
    if not found:
        return []
    # Now that we have a Message object that meets our criteria, let's extract
    # the first numlines of body text.
    lines = []
    lineno = 0
    reader = list(email.iterators.body_line_iterator(msg))
    while numlines is None or lineno < numlines:
        try:
            line = reader.pop(0)
        except IndexError:
            break
        # Blank lines don't count
        if not line.strip():
            continue
        lineno += 1
        lines.append(line)
    # Concatenate those body text lines with newlines, and then create a new
    # message object from those lines.
    p = _ForgivingParser()
    msg = p.parsestr(EMPTYSTRING.join(lines))
    return msg.get_all('subject', []) + msg.get_all('keywords', [])


class _ForgivingParser(email.parser.HeaderParser):
    """An lax email parser.

    Be a little more forgiving about non-header/continuation lines, since
    we'll just read as much as we can from 'header-like' lines in the body.
    """
    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
    # specialize the way it returns?
    def _parseheaders(self, container, fp):
        """See `email.parser.HeaderParser`."""
        # Parse the headers, returning a list of header/value pairs.  None as
        # the header means the Unix-From header.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            line = line.splitlines()[0]
            if not line:
                break
            # Ignore the trailing newline
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    break
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    break
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                break
            if lastheader:
                container[lastheader] = NLTAB.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NLTAB.join(lastvalue)


@implementer(IHandler)
class Tagger:
    """Tag messages with topic matches."""

    name = 'tagger'
    description = _('Tag messages with topic matches.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`."""
        process(mlist, msg, msgdata)