| author | Barry Warsaw | 2008-01-24 05:18:02 -0500 |
|---|---|---|
| committer | Barry Warsaw | 2008-01-24 05:18:02 -0500 |
| commit | 36ebacf60b8fe78e20b9c18f6c08d73271922fc4 (patch) | |
| tree | ad8f58134915af75a1dfff73fd801904750173a2 /Mailman/Handlers/Tagger.py | |
| parent | a76cbbcac84319245a0afb4a4dee32d4d4c79622 (diff) | |
| download | mailman-36ebacf60b8fe78e20b9c18f6c08d73271922fc4.tar.gz mailman-36ebacf60b8fe78e20b9c18f6c08d73271922fc4.tar.zst mailman-36ebacf60b8fe78e20b9c18f6c08d73271922fc4.zip | |
Diffstat (limited to 'Mailman/Handlers/Tagger.py')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | Mailman/Handlers/Tagger.py | 157 |
1 file changed, 0 insertions, 157 deletions
```diff
diff --git a/Mailman/Handlers/Tagger.py b/Mailman/Handlers/Tagger.py
deleted file mode 100644
index 023148fd7..000000000
--- a/Mailman/Handlers/Tagger.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (C) 2001-2007 by the Free Software Foundation, Inc.
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
-# USA.
-
-"""Extract topics from the original mail message."""
-
-import re
-import email
-import email.Errors
-import email.Iterators
-import email.Parser
-
-OR = '|'
-CRNL = '\r\n'
-EMPTYSTRING = ''
-NLTAB = '\n\t'
-
-
-
-def process(mlist, msg, msgdata):
-    if not mlist.topics_enabled:
-        return
-    # Extract the Subject:, Keywords:, and possibly body text
-    matchlines = []
-    matchlines.append(msg.get('subject', None))
-    matchlines.append(msg.get('keywords', None))
-    if mlist.topics_bodylines_limit == 0:
-        # Don't scan any body lines
-        pass
-    elif mlist.topics_bodylines_limit < 0:
-        # Scan all body lines
-        matchlines.extend(scanbody(msg))
-    else:
-        # Scan just some of the body lines
-        matchlines.extend(scanbody(msg, mlist.topics_bodylines_limit))
-    matchlines = filter(None, matchlines)
-    # For each regular expression in the topics list, see if any of the lines
-    # of interest from the message match the regexp.  If so, the message gets
-    # added to the specific topics bucket.
-    hits = {}
-    for name, pattern, desc, emptyflag in mlist.topics:
-        pattern = OR.join(pattern.splitlines())
-        cre = re.compile(pattern, re.IGNORECASE)
-        for line in matchlines:
-            if cre.search(line):
-                hits[name] = 1
-                break
-    if hits:
-        msgdata['topichits'] = hits.keys()
-        msg['X-Topics'] = NLTAB.join(hits.keys())
-
-
-
-def scanbody(msg, numlines=None):
-    # We only scan the body of the message if it is of MIME type text/plain,
-    # or if the outer type is multipart/alternative and there is a text/plain
-    # part.  Anything else, and the body is ignored for header-scan purposes.
-    found = None
-    if msg.get_content_type() == 'text/plain':
-        found = msg
-    elif msg.is_multipart() \
-            and msg.get_content_type() == 'multipart/alternative':
-        for found in msg.get_payload():
-            if found.get_content_type() == 'text/plain':
-                break
-        else:
-            found = None
-    if not found:
-        return []
-    # Now that we have a Message object that meets our criteria, let's extract
-    # the first numlines of body text.
-    lines = []
-    lineno = 0
-    reader = list(email.Iterators.body_line_iterator(msg))
-    while numlines is None or lineno < numlines:
-        try:
-            line = reader.pop(0)
-        except IndexError:
-            break
-        # Blank lines don't count
-        if not line.strip():
-            continue
-        lineno += 1
-        lines.append(line)
-    # Concatenate those body text lines with newlines, and then create a new
-    # message object from those lines.
-    p = _ForgivingParser()
-    msg = p.parsestr(EMPTYSTRING.join(lines))
-    return msg.get_all('subject', []) + msg.get_all('keywords', [])
-
-
-
-class _ForgivingParser(email.Parser.HeaderParser):
-    # Be a little more forgiving about non-header/continuation lines, since
-    # we'll just read as much as we can from "header-like" lines in the body.
-    #
-    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
-    # specialize the way it returns?
-    def _parseheaders(self, container, fp):
-        # Parse the headers, returning a list of header/value pairs.  None as
-        # the header means the Unix-From header.
-        lastheader = ''
-        lastvalue = []
-        lineno = 0
-        while 1:
-            # Don't strip the line before we test for the end condition,
-            # because whitespace-only header lines are RFC compliant
-            # continuation lines.
-            line = fp.readline()
-            if not line:
-                break
-            line = line.splitlines()[0]
-            if not line:
-                break
-            # Ignore the trailing newline
-            lineno += 1
-            # Check for initial Unix From_ line
-            if line.startswith('From '):
-                if lineno == 1:
-                    container.set_unixfrom(line)
-                    continue
-                else:
-                    break
-            # Header continuation line
-            if line[0] in ' \t':
-                if not lastheader:
-                    break
-                lastvalue.append(line)
-                continue
-            # Normal, non-continuation header.  BAW: this should check to make
-            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
-            # should expose the header matching algorithm in the API, and
-            # allow for a non-strict parsing mode (that ignores the line
-            # instead of raising the exception).
-            i = line.find(':')
-            if i < 0:
-                break
-            if lastheader:
-                container[lastheader] = NLTAB.join(lastvalue)
-            lastheader = line[:i]
-            lastvalue = [line[i+1:].lstrip()]
-        # Make sure we retain the last header
-        if lastheader:
-            container[lastheader] = NLTAB.join(lastvalue)
```
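For readers looking at this removal for context: the handler's core technique was straightforward. Each entry in `mlist.topics` is a `(name, pattern, description, emptyflag)` tuple whose pattern may be stored as several lines; `process()` OR-joined those lines into a single case-insensitive regular expression and searched it against the `Subject:` and `Keywords:` headers (plus, optionally, some body lines). The sketch below is a minimal, standalone reconstruction of that matching step in modern Python; `match_topics` and its arguments are illustrative names, not part of Mailman's API.

```python
import re

def match_topics(topics, matchlines):
    """Return the names of topics whose pattern hits any candidate line.

    `topics` mirrors the shape of mlist.topics: (name, pattern,
    description, emptyflag) tuples.  A pattern saved as several lines is
    OR-joined into one alternation, as the deleted process() did.
    """
    hits = []
    for name, pattern, _desc, _emptyflag in topics:
        cre = re.compile('|'.join(pattern.splitlines()), re.IGNORECASE)
        if any(cre.search(line) for line in matchlines if line):
            hits.append(name)
    return hits

# Example: Subject:/Keywords: lines checked against two topic definitions.
topics = [('python', r'\bpython\b', 'Python talk', False),
          ('admin', 'admin\nmoderation', 'List administration', False)]
lines = ['Re: new moderation queue', 'mailman, listadmin']
print(match_topics(topics, lines))   # ['admin']
```

In the deleted handler, the hit names were recorded both in `msgdata['topichits']` and in an `X-Topics:` header added to the message.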
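The `scanbody()`/`_ForgivingParser` pair handled the optional body scan: the first `topics_bodylines_limit` non-blank lines of a text/plain part were re-parsed as if they were headers, so a `Subject:` or `Keywords:` pseudo-header typed at the top of the body also fed the matcher. A much-simplified, hypothetical stand-in for that idea (it skips the continuation-line handling the forgiving parser provided) might look like:

```python
import re

# Header-like body lines such as "Subject: gardening" or
# "Keywords: tomatoes, compost".
PSEUDO_HEADER = re.compile(r'^(subject|keywords):\s*(?P<value>.*)$',
                           re.IGNORECASE)

def scan_pseudo_headers(body_text, numlines=5):
    """Collect Subject:/Keywords: values from the first `numlines`
    non-blank body lines; blank lines do not count toward the limit."""
    values = []
    seen = 0
    for line in body_text.splitlines():
        if not line.strip():
            continue
        seen += 1
        if seen > numlines:
            break
        m = PSEUDO_HEADER.match(line)
        if m:
            values.append(m.group('value'))
    return values

print(scan_pseudo_headers('Subject: gardening tips\n\nHi all, ...'))
# prints: ['gardening tips']
```

Values recovered this way were then fed back into the same regular-expression matching shown above.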
