summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbwarsaw2002-04-11 18:19:03 +0000
committerbwarsaw2002-04-11 18:19:03 +0000
commitc9cb90d532eb7290483e7f9137ed406a0082f095 (patch)
tree9337b917c0de135fcd84318c739c6801e96768e0
parent6883a47f152f61b933237d47d72b6c8fe010c793 (diff)
downloadmailman-c9cb90d532eb7290483e7f9137ed406a0082f095.tar.gz
mailman-c9cb90d532eb7290483e7f9137ed406a0082f095.tar.zst
mailman-c9cb90d532eb7290483e7f9137ed406a0082f095.zip
-rw-r--r--Mailman/Handlers/Tagger.py68
1 files changed, 62 insertions, 6 deletions
diff --git a/Mailman/Handlers/Tagger.py b/Mailman/Handlers/Tagger.py
index f88a857a2..e419901b9 100644
--- a/Mailman/Handlers/Tagger.py
+++ b/Mailman/Handlers/Tagger.py
@@ -19,7 +19,9 @@
import re
import email
+import email.Errors
import email.Iterators
+import email.Parser
from Mailman.Logging.Syslog import syslog
@@ -30,7 +32,7 @@ NLTAB = '\n\t'
def process(mlist, msg, msgdata):
- if not mlist.topics:
+ if not mlist.topics_enabled:
return
# Extract the Subject:, Keywords:, and possibly body text
matchlines = []
@@ -91,12 +93,66 @@ def scanbody(msg, numlines=None):
if not line.strip():
continue
lineno += 1
- # Stop scanning if we find a line that would not be recognized as
- # either a header or a continuation line
- if line[0] not in ' \t' and line.find(':') < 0:
- break
lines.append(line)
# Concatenate those body text lines with newlines, and then create a new
# message object from those lines.
- msg = email.message_from_string(EMPTYSTRING.join(lines))
+ p = _ForgivingParser()
+ msg = p.parsestr(EMPTYSTRING.join(lines))
+ syslog('debug', 'lines: %s', ''.join(lines))
+ syslog('debug', 'msg: %s', msg.as_string())
return msg.get_all('subject', []) + msg.get_all('keywords', [])
+
+
+
+class _ForgivingParser(email.Parser.HeaderParser):
+ # Be a little more forgiving about non-header/continuation lines, since
+ # we'll just read as much as we can from "header-like" lines in the body.
+ #
+ # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
+ # specialize the way it returns?
+ def _parseheaders(self, container, fp):
+ # Parse the headers, returning a list of header/value pairs. None as
+ # the header means the Unix-From header.
+ lastheader = ''
+ lastvalue = []
+ lineno = 0
+ while 1:
+ # Don't strip the line before we test for the end condition,
+ # because whitespace-only header lines are RFC compliant
+ # continuation lines.
+ line = fp.readline()
+ if not line:
+ break
+ line = line.splitlines()[0]
+ if not line:
+ break
+ # Ignore the trailing newline
+ lineno += 1
+ # Check for initial Unix From_ line
+ if line.startswith('From '):
+ if lineno == 1:
+ container.set_unixfrom(line)
+ continue
+ else:
+ break
+ # Header continuation line
+ if line[0] in ' \t':
+ if not lastheader:
+ break
+ lastvalue.append(line)
+ continue
+ # Normal, non-continuation header. BAW: this should check to make
+ # sure it's a legal header, e.g. doesn't contain spaces. Also, we
+ # should expose the header matching algorithm in the API, and
+ # allow for a non-strict parsing mode (that ignores the line
+ # instead of raising the exception).
+ i = line.find(':')
+ if i < 0:
+ break
+ if lastheader:
+ container[lastheader] = NLTAB.join(lastvalue)
+ lastheader = line[:i]
+ lastvalue = [line[i+1:].lstrip()]
+ # Make sure we retain the last header
+ if lastheader:
+ container[lastheader] = NLTAB.join(lastvalue)