summaryrefslogtreecommitdiff
path: root/mailman/pipeline/docs/tagger.txt
diff options
context:
space:
mode:
Diffstat (limited to 'mailman/pipeline/docs/tagger.txt')
-rw-r--r--mailman/pipeline/docs/tagger.txt237
1 files changed, 237 insertions, 0 deletions
diff --git a/mailman/pipeline/docs/tagger.txt b/mailman/pipeline/docs/tagger.txt
new file mode 100644
index 000000000..778f7cc73
--- /dev/null
+++ b/mailman/pipeline/docs/tagger.txt
@@ -0,0 +1,237 @@
+Message tagger
+==============
+
+Mailman has a topics system which works like this: a mailing list
+administrator sets up one or more topics, which is essentially a named regular
+expression. The topic name can be any arbitrary string, and the name serves
+double duty as the 'topic tag'. Each message that flows the mailing list has
+its Subject: and Keywords: headers compared against these regular
+expressions. The message then gets tagged with the topic names of each hit.
+
+ >>> from mailman.pipeline.tagger import process
+ >>> from mailman.queue import Switchboard
+ >>> from mailman.configuration import config
+ >>> mlist = config.db.list_manager.create(u'_xtest@example.com')
+
+Topics must be enabled for Mailman to do any topic matching, even if topics
+are defined.
+
+ >>> mlist.topics = [('bar fight', '.*bar.*', 'catch any bars', False)]
+ >>> mlist.topics_enabled = False
+ >>> mlist.topics_bodylines_limit = 0
+
+ >>> msg = message_from_string("""\
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ...
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ Subject: foobar
+ Keywords: barbaz
+ <BLANKLINE>
+ <BLANKLINE>
+ >>> msgdata
+ {}
+
+However, once topics are enabled, message will be tagged. There are two
+artifacts of tagging; an X-Topics: header is added with the topic name, and
+the message metadata gets a key with a list of matching topic names.
+
+ >>> mlist.topics_enabled = True
+ >>> msg = message_from_string("""\
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ...
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ Subject: foobar
+ Keywords: barbaz
+ X-Topics: bar fight
+ <BLANKLINE>
+ <BLANKLINE>
+ >>> msgdata['topichits']
+ ['bar fight']
+
+
+Scanning body lines
+-------------------
+
+The tagger can also look at a certain number of body lines, but only for
+Subject: and Keyword: header-like lines. When set to zero, no body lines are
+scanned.
+
+ >>> msg = message_from_string("""\
+ ... From: aperson@example.com
+ ... Subject: nothing
+ ... Keywords: at all
+ ...
+ ... X-Ignore: something else
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ From: aperson@example.com
+ Subject: nothing
+ Keywords: at all
+ <BLANKLINE>
+ X-Ignore: something else
+ Subject: foobar
+ Keywords: barbaz
+ <BLANKLINE>
+ >>> msgdata
+ {}
+
+But let the tagger scan a few body lines and the matching headers will be
+found.
+
+ >>> mlist.topics_bodylines_limit = 5
+ >>> msg = message_from_string("""\
+ ... From: aperson@example.com
+ ... Subject: nothing
+ ... Keywords: at all
+ ...
+ ... X-Ignore: something else
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ From: aperson@example.com
+ Subject: nothing
+ Keywords: at all
+ X-Topics: bar fight
+ <BLANKLINE>
+ X-Ignore: something else
+ Subject: foobar
+ Keywords: barbaz
+ <BLANKLINE>
+ >>> msgdata['topichits']
+ ['bar fight']
+
+However, scanning stops at the first body line that doesn't look like a
+header.
+
+ >>> msg = message_from_string("""\
+ ... From: aperson@example.com
+ ... Subject: nothing
+ ... Keywords: at all
+ ...
+ ... This is not a header
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ From: aperson@example.com
+ Subject: nothing
+ Keywords: at all
+ <BLANKLINE>
+ This is not a header
+ Subject: foobar
+ Keywords: barbaz
+ >>> msgdata
+ {}
+
+When set to a negative number, all body lines will be scanned.
+
+ >>> mlist.topics_bodylines_limit = -1
+ >>> lots_of_headers = '\n'.join(['X-Ignore: zip'] * 100)
+ >>> msg = message_from_string("""\
+ ... From: aperson@example.com
+ ... Subject: nothing
+ ... Keywords: at all
+ ...
+ ... %s
+ ... Subject: foobar
+ ... Keywords: barbaz
+ ... """ % lots_of_headers)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> # Rather than print out 100 X-Ignore: headers, let's just prove that
+ >>> # the X-Topics: header exists, meaning that the tagger did its job.
+ >>> msg['x-topics']
+ u'bar fight'
+ >>> msgdata['topichits']
+ ['bar fight']
+
+
+Scanning sub-parts
+------------------
+
+The tagger will also scan the body lines of text subparts in a multipart
+message, using the same rules as if all those body lines lived in a single
+text payload.
+
+ >>> msg = message_from_string("""\
+ ... Subject: Was
+ ... Keywords: Raw
+ ... Content-Type: multipart/alternative; boundary="BOUNDARY"
+ ...
+ ... --BOUNDARY
+ ... From: sabo
+ ... To: obas
+ ...
+ ... Subject: farbaw
+ ... Keywords: barbaz
+ ...
+ ... --BOUNDARY--
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg.as_string()
+ Subject: Was
+ Keywords: Raw
+ Content-Type: multipart/alternative; boundary="BOUNDARY"
+ X-Topics: bar fight
+ <BLANKLINE>
+ --BOUNDARY
+ From: sabo
+ To: obas
+ <BLANKLINE>
+ Subject: farbaw
+ Keywords: barbaz
+ <BLANKLINE>
+ --BOUNDARY--
+ <BLANKLINE>
+ >>> msgdata['topichits']
+ ['bar fight']
+
+But the tagger will not descend into non-text parts.
+
+ >>> msg = message_from_string("""\
+ ... Subject: Was
+ ... Keywords: Raw
+ ... Content-Type: multipart/alternative; boundary=BOUNDARY
+ ...
+ ... --BOUNDARY
+ ... From: sabo
+ ... To: obas
+ ... Content-Type: message/rfc822
+ ...
+ ... Subject: farbaw
+ ... Keywords: barbaz
+ ...
+ ... --BOUNDARY
+ ... From: sabo
+ ... To: obas
+ ... Content-Type: message/rfc822
+ ...
+ ... Subject: farbaw
+ ... Keywords: barbaz
+ ...
+ ... --BOUNDARY--
+ ... """)
+ >>> msgdata = {}
+ >>> process(mlist, msg, msgdata)
+ >>> print msg['x-topics']
+ None
+ >>> msgdata
+ {}