diff options
| author | cotton | 1998-10-09 13:04:35 +0000 |
|---|---|---|
| committer | cotton | 1998-10-09 13:04:35 +0000 |
| commit | 13ca1d9ac610544b62b5b578676dc5ba9de32985 (patch) | |
| tree | 6f66d5279b44c79c6b9449c1f58c399f92f2986a | |
| parent | 03cb1006a1b4060c046ae5db24ab34bcb1cd8870 (diff) | |
| download | mailman-13ca1d9ac610544b62b5b578676dc5ba9de32985.tar.gz mailman-13ca1d9ac610544b62b5b578676dc5ba9de32985.tar.zst mailman-13ca1d9ac610544b62b5b578676dc5ba9de32985.zip | |
update pipermail to the current version for web archiving.
scott
| -rw-r--r-- | Mailman/pipermail.py | 1111 |
1 files changed, 589 insertions, 522 deletions
diff --git a/Mailman/pipermail.py b/Mailman/pipermail.py index 33a41b120..c4bdec1cb 100644 --- a/Mailman/pipermail.py +++ b/Mailman/pipermail.py @@ -1,559 +1,626 @@ -#! /usr/bin/env python -# -# Pipermail 0.0.2-mm -# -# **NOTE** -# -# This internal version of pipermail has been deprecated in favor of use of -# an external version of pipermail, available from: -# http://starship.skyport.net/crew/amk/maintained/pipermail.html -# The external version should be pointed at the created archive files. -# -# -# Some minor mods have been made for use with the Mailman mailing list manager. -# All changes will have JV by them. -# -# (C) Copyright 1996, A.M. Kuchling (amk@magnet.com) -# Home page at http://amarok.magnet.com/python/pipermail.html -# -# HTML code for frames courtesy of Scott Hassan (hassan@cs.stanford.edu) -# -# TODO: -# * Prev. article, next. article pointers in each article -# * I suspect there may be problems with rfc822.py's getdate() method; -# take a look at the threads "Greenaway and the net (fwd)" or -# "Pillow Book pictures". To be looked into... -# * Anything else Hypermail can do that we can't? -# * General code cleanups -# * Profiling & optimization -# * Should there be an option to enable/disable frames? -# * Like any truly useful program, Pipermail should have an ILU interface. -# * There's now an option to keep from preserving line breaks, -# so paragraphs in messages would be reflowed by the browser. -# Unfortunately, this mangles .sigs horribly, and pipermail doesn't yet -# put in paragraph breaks. Putting in the breaks will only require a -# half hour or so; I have no clue as to how to preserve .sigs. -# * Outside URLs shouldn't appear in the display frame. How to fix? +#!/usr/local/bin/python + +import os, sys, pickle, string, re + +__version__='0.05' +VERSION=__version__ +CACHESIZE=100 # Number of slots in the cache + +msgid_pat=re.compile(r'(<.*>)') +def strip_separators(s): + "Remove quotes or parenthesization from a Message-ID string" + if s==None or s=="": return "" + if s[0] in '"<([' and s[-1] in '">)]': s=s[1:-1] + return s + +smallNameParts = ['van', 'von', 'der', 'de'] + +def fixAuthor(author): + "Canonicalize a name into Last, First format" + # If there's a comma, guess that it's already in "Last, First" format + if ',' in author: return author + L=string.split(author) + i=len(L)-1 + if i==0: return author # The string's one word--forget it + if string.upper(author)==author or string.lower(author)==author: + # Damn, the name is all upper- or lower-case. + while i>0 and string.lower(L[i-1]) in smallNameParts: i=i-1 + else: + # Mixed case; assume that small parts of the last name will be + # in lowercase, and check them against the list. + while i>0 and (L[i-1][0] in string.lowercase or + string.lower(L[i-1]) in smallNameParts): + i=i-1 + author=string.join(L[-1:]+L[i:-1], ' ')+', '+string.join(L[:i], ' ') + return author + +# Abstract class for databases + +class Database: + def __init__(self): pass + def close(self): pass + def getArticle(self, archive, msgid): pass + def hasArticle(self, archive, msgid): pass + def addArticle(self, archive, article, subjectkey, authorkey): pass + def firstdate(self, archive): pass + def lastdate(self, archive): pass + def first(self, archive, index): pass + def next(self, archive, index): pass + def numArticles(self, archive): pass + def newArchive(self, archive): pass + def setThreadKey(self, archive, key, msgid): pass + def getOldestArticle(self, subject): pass + +# The Article class encapsulates a single posting. The attributes +# are: # +# sequence : Sequence number, unique for each article in a set of archives +# subject : Subject +# datestr : The posting date, in human-readable format +# date : The posting date, in purely numeric format +# headers : Any other headers of interest +# author : The author's name (and possibly organization) +# email : The author's e-mail address +# msgid : A unique message ID +# in_reply_to : If !="", this is the msgid of the article being replied to +# references: A (possibly empty) list of msgid's of earlier articles in the thread +# body : A list of strings making up the message body -VERSION = "0.0.2.mm" +class Article: + import time + __last_article_time=time.time() + def __init__(self, message=None, sequence=0, keepHeaders=[]): + import time + if message==None: return + self.sequence=sequence -import posixpath, time, os, string, sys, rfc822 + self.parentID = None ; self.threadKey = None + # otherwise the current sequence number is used. + id=strip_separators(message.getheader('Message-Id')) + if id=="": self.msgid=str(self.sequence) + else: self.msgid=id -# JV -- to get HOME_PAGE -import mm_cfg + if message.has_key('Subject'): self.subject=str(message['Subject']) + else: self.subject='No subject' + if self.subject=="": self.subject='No subject' -class ListDict: - def __init__(self): self.dict={} - def keys(self): return self.dict.keys() - def __setitem__(self, key, value): - "Add the value to a list for the key, creating the list if needed." - if not self.dict.has_key(key): self.dict[key]=[value] - else: self.dict[key].append(value) - def __getitem__(self, key): - "Return the list matching a key" - return self.dict[key] - -def PrintUsage(): - print """Pipermail %s -usage: pipermail [options] -options: -a URL : URL to other archives - -b URL : URL to archive information - -c file : name of configuration file (default: ~/.pmrc) - -d dir : directory where the output files will be placed - (default: archive/) - -l name : name of the output archive - -m file : name of input file - -s file : name where the archive state is stored - (default: <input file>+'.pipermail' - -u : Select 'update' mode - -v : verbose mode of operation - """ % (VERSION,) - sys.exit(0) + if message.has_key('Date'): + self.datestr=str(message['Date']) + date=message.getdate_tz('Date') + else: + self.datestr='None' + date=None + if date!=None: + date, tzoffset=date[:9], date[-1] + date=time.mktime(date)-tzoffset + else: + date=self.__last_article_time+1 ; print 'Article without date:', self.msgid + + self.__last_article_time=date + self.date='%011i' % (date,) + + # Figure out the e-mail address and poster's name + self.author, self.email=message.getaddr('From') + e=message.getheader('Reply-To') + if e!=None: self.email=e + self.email=strip_separators(self.email) + self.author=strip_separators(self.author) -# Compile various important regexp patterns -import regex, regsub -# Starting <html> directive -htmlpat=regex.compile('^[ \t]*<html>[ \t]*$') -# Ending </html> directive -nohtmlpat=regex.compile('^[ \t]*</html>[ \t]*$') -# Match quoted text -quotedpat=regex.compile('^[>|:]+') -# Parenthesized human name -paren_name_pat=regex.compile('.*\([(].*[)]\).*') -# Subject lines preceded with 'Re:' -REpat=regex.compile('[ \t]*[Rr][Ee][ \t]*:[ \t]*') -# Lines in the configuration file: set pm_XXX = <something> -cfg_line_pat=regex.compile('^[ \t]*[sS][eE][tT][ \t]*[Pp][Mm]_\([a-zA-Z0-9]*\)' - '[ \t]*=[ \t]*\(.*\)[ \t\n]*$') -# E-mail addresses and URLs in text -emailpat=regex.compile('\([-+,.a-zA-Z0-9]*@[-+.a-zA-Z0-9]*\)') -urlpat=regex.compile('\([a-zA-Z0-9]+://[^ \t\n]+\)') # URLs in text -# Blank lines -blankpat=regex.compile('^[ \t\n]*$') + if self.author=="": self.author=self.email -def ReadCfgFile(prefs): - import posixpath - try: - f=open(posixpath.expanduser(prefs['CONFIGFILE']), 'r') - except IOError, (num, msg): - if num==2: return - else: raise IOError, (num, msg) - line=0 - while(1): - L=f.readline() ; line=line+1 - if L=="": break - if string.strip(L)=="": continue # Skip blank lines - match=cfg_line_pat.match(L) - if match==-1: - print "Syntax error in line %i of %s" %(line, prefs['CONFIGFILE']) - print L + # Save the 'In-Reply-To:' and 'References:' lines + i_r_t=message.getheader('In-Reply-To') + if i_r_t==None: self.in_reply_to='' else: - varname, value=cfg_line_pat.group(1,2) - varname=string.upper(varname) - if not prefs.has_key(varname): - print ("Unknown variable name %s in line %i of %s" - %(varname, line, prefs['CONFIGFILE'])) - print L - else: - prefs[varname]=eval(value) - f.close() + match=msgid_pat.search(i_r_t) + if match==None: self.in_reply_to='' + else: self.in_reply_to=strip_separators(match.group(1)) + + references=message.getheader('References') + if references==None: self.references=[] + else: self.references=map(strip_separators, string.split(references)) + + # Save any other interesting headers + self.headers={} + for i in keepHeaders: + if message.has_key(i): self.headers[i]=message[i] + + # Read the message body + self.body=[] + message.rewindbody() + while (1): + line=message.fp.readline() + if line=="": break + self.body.append(line) + def __repr__(self): + return '<Article ID='+repr(self.msgid)+'>' + +# Pipermail formatter class + +class T: + DIRMODE=0755 # Mode to give to created directories + FILEMODE=0644 # Mode to give to created files + INDEX_EXT = ".html" # Extension for indexes + + def __init__(self, basedir=None, reload=1, database=None): + # If basedir isn't provided, assume the current directory + if basedir==None: self.basedir=os.getcwd() + else: + basedir=os.path.expanduser(basedir) + self.basedir=basedir + self.database=database + + # If the directory doesn't exist, create it + try: os.stat(self.basedir) + except os.error, errdata: + errno, errmsg = errdata + if errno!=2: raise os.error, errdata + else: + self.message('Creating archive directory '+self.basedir) + os.mkdir(self.basedir, self.DIRMODE) + + # Try to load previously pickled state + try: + if not reload: raise IOError + f=open(os.path.join(self.basedir, 'pipermail.pck'), 'r') + self.message('Reloading pickled archive state') + d=pickle.load(f) + f.close() + for key, value in d.items(): setattr(self, key, value) + except IOError: + # No pickled version, so initialize various attributes + self.archives=[] # Archives + self._dirty_archives=[] # Archives that will have to be updated + self.sequence=0 # Sequence variable used for numbering articles + self.update_TOC=0 # Does the TOC need updating? + + def close(self): + "Close an archive, saving its state and updating any changed archives." + self.update_dirty_archives()# Update all changed archives + # If required, update the table of contents + if self.update_TOC or 1: + self.update_TOC=0 + self.write_TOC() + # Save the collective state + self.message('Pickling archive state into '+os.path.join(self.basedir, 'pipermail.pck')) + self.database.close() + del self.database + f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w') + pickle.dump(self.__dict__, f) + f.close() + + # + # Private methods + # + # These will be neither overridden nor called by custom archivers. + # -def ReadEnvironment(prefs): - import sys, os - for key in prefs.keys(): - envvar=string.upper('PM_'+key) - if os.environ.has_key(envvar): - if type(prefs[key])==type(''): prefs[key]=os.environ[envvar] - else: prefs[key]=string.atoi(os.environ[envvar]) - -def UpdateMsgHeaders(prefs, filename, L): - """Update the next/previous message information in a message header. -The message is scanned for <!--next--> and <!--endnext--> comments, and -new pointers are written. Otherwise, the text is simply copied without any processing.""" - pass - -def ProcessMsgBody(prefs, msg, filename, articles): - """Transform one mail message from plain text to HTML. -This involves writing an HTML header, scanning through the text looking -for <html></html> directives, e-mail addresses, and URLs, and -finishing off with a footer.""" - import cgi, posixpath - outputname=posixpath.join(prefs['DIR'], filename) - output=open(outputname, 'w') - os.chmod(outputname, prefs['FILEMODE']) - subject, email, poster, date, datestr, parent, id = articles[filename] - # JV - if not email: - email = '' - if not subject: - subject = '<No subject>' - if not poster: - poster = '*Unknown*' - if not datestr: - datestr = '' - output.write('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 3.0//EN">' - "<html><head><title>%s Mailing List: %s</title></head>" - "<body><h1>%s</h1>" - "%s (<i>%s</i>)<br><i>%s</i><p>" % - (prefs['LABEL'], cgi.escape(subject),cgi.escape(subject), - cgi.escape(poster),cgi.escape(email), - cgi.escape(datestr))) - output.write('<ul><li> <b>Messages sorted by:</b>' - '<a target="toc" href="date.html#1">[ date ]</a>' - '<a target="toc" href="thread.html#1">[ thread ]</a>' - '<a target="toc" href="subject.html#1">[ subject ]</a>' - '<a target="toc" href="author.html#1">[ author ]</a></ul>\n') - - html_mode=0 - if prefs['SHOWHR']: output.write('<hr>') - output.write('<p>') - if not prefs['SHOWHTML']: output.write('<pre>\n') - msg.rewindbody() # Seek to start of message body - quoted=-1 - while (1): - L=msg.fp.readline() - if L=="": break - if html_mode: - # If in HTML mode, check for ending tag; otherwise, we - # copy the line without any changes. - if nohtmlpat.match(L)==-1: - output.write(L) ; continue - else: - html_mode=0 - if not prefs['SHOWHTML']: output.write('<pre>\n') - continue - # Check for opening <html> tag - elif htmlpat.match(L)!=-1: - html_mode=1 - if not prefs['SHOWHTML']: output.write('</pre>\n') - continue - if prefs['SHOWHTML'] and prefs['IQUOTES']: - # Check for a line of quoted text and italicise it - # (We have to do this before escaping HTML special - # characters because '>' is commonly used.) - quoted=quotedpat.match(L) - if quoted!=-1: - L=cgi.escape(L[:quoted]) + '<i>' + cgi.escape(L[quoted:]) + '</i>' - # If we're flowing the message text together, quoted lines - # need explicit breaks, no matter what mode we're in. - if prefs['SHOWHTML']: L=L+'<br>' - else: L=cgi.escape(L) - else: L=cgi.escape(L) - # Check for an e-mail address - L2="" ; i=emailpat.search(L) - while i!=-1: - length=len(emailpat.group(1)) - mailcmd=prefs['MAILCOMMAND'] % {'TO':L[i:i+length]} - L2=L2+'%s<A HREF="%s">%s</A>' % (L[:i], - mailcmd, L[i:i+length]) - L=L[i+length:] - i=emailpat.search(L) - L=L2+L ; L2=""; i=urlpat.search(L) - while i!=-1: - length=len(urlpat.group(1)) - L2=L2+'%s<A HREF="%s">%s</A>' % (L[:i], - L[i:i+length], L[i:i+length]) - L=L[i+length:] - i=urlpat.search(L) - L=L2+L - if prefs['SHOWHTML']: - while (L!="" and L[-1] in '\015\012'): L=L[:-1] - if prefs['SHOWBR']: - # We don't want to flow quoted passages - if quoted==-1: L=L+'<br>' + # Create a dictionary of various parameters that will be passed + # to the write_index_{header,footer} functions + def __set_parameters(self, archive): + import time + # Determine the earliest and latest date in the archive + firstdate=self.database.firstdate(archive) + lastdate=self.database.lastdate(archive) + + # Get the current time + now=time.asctime(time.localtime(time.time())) + self.firstdate=firstdate ; self.lastdate=lastdate + self.archivedate=now ; self.size=self.database.numArticles(archive) + self.archive=archive ; self.version=__version__ + + # Find the message ID of an article's parent, or return None + # if no parent can be found. + + def __findParent(self, article, children=[]): + parentID=None + if article.in_reply_to!='': parentID=article.in_reply_to + elif article.references!=[]: + # Remove article IDs that aren't in the archive + refs=filter(self.articleIndex.has_key, article.references) + if len(refs): + refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) + maxdate=refs[0] + for i in refs[1:]: + if i.date>maxdate.date: maxdate=i + parentID=maxdate.msgid else: - # If we're not adding <br> to each line, we'll need to - # insert <p> markup on blank lines to separate paragraphs. - if blankpat.match(L)!=-1: L=L+'<p>' - L=L+'\n' - output.write(L) + # Look for the oldest matching subject + try: + key, tempid=self.subjectIndex.set_location(article.subject) + print key, tempid + self.subjectIndex.next() + [subject, date]= string.split(key, '\0') + print article.subject, subject, date + if (subject==article.subject and tempid not in children): + parentID=tempid + except KeyError: pass + return parentID + + # Update the threaded index completely + def updateThreadedIndex(self): + import pickle, sys + # Erase the threaded index + self.database.clearIndex(self.archive, 'thread') - if not prefs['SHOWHTML'] and not html_mode: output.write('</pre>') - if prefs['SHOWHR']: output.write('<hr>') - output.write('<!--next-->\n<!--endnext-->\n</body></html>') - output.close() + # Loop over all the articles + msgid=self.database.first(self.archive, 'date') + while (msgid != None): + article=self.database.getArticle(self.archive, msgid) + if article.parentID==None or not self.database.hasArticle(self.archive, article.parentID): + key=article.date + else: + parent=self.database.getArticle(self.archive, article.parentID) + article.threadKey=parent.threadKey+article.date+'-' + self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, msgid) + msgid=self.database.next(self.archive, 'date') -def WriteHTMLIndex(prefs, fp, L, articles, indexname): - """Process a list L into an HTML index, written to fp. -L is processed from left to right, and contains a list of 2-tuples; -an integer of 1 or more giving the depth of indentation, and -a list of strings which are used to reference the 'articles' -dictionary. Most of the time the lists contain only 1 element.""" - fp.write('<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 3.0//EN">\n' - "<html><head><base target=display>" - "<title>%s Mailing List Archive by %s</title></head><body>\n" - % (prefs['LABEL'], indexname)) - fp.write('<H1><A name="start">%s Mailing List Archive by %s</A></H1>' - '<ul><li> <b><a target="toc" href="#end">Most recent messages</a></b>' - '<li> <b>Messages sorted by:</b>' - % (prefs['LABEL'], indexname)) - if indexname!='Date': - fp.write('<a target="toc" href="date.html#start">[ date ]</a>') - if indexname!='Subject': - fp.write('<a target="toc" href="subject.html#start">[ subject ]</a>') - if indexname!='Author': - fp.write('<a target="toc" href="author.html#start">[ author ]</a>') - if indexname!='Thread': - fp.write('<a target="toc" href="thread.html#start">[ thread ]</a>') - if prefs['ARCHIVES']!='NONE': - fp.write('<li> <b><a href="%s">Other mail archives</a></b>' % - (prefs['ARCHIVES'],)) -# This doesn't look professional. -- JV -# mailcmd=prefs['MAILCOMMAND'] % {'TO':'amk@magnet.com'} -# fp.write('</ul><p>Please inform <A HREF="%s">amk@magnet.com</a> if any of the messages are formatted incorrectly.' % (mailcmd,) ) +## L1=[] ; L2=[] +## while (1): +## article=self.database.getArticle(self.archive, msgid) +## L1.append('') ; L2.append(msgid) +## L1=map(lambda x, d=article.date: d+'-'+x, L1) +## parentID=self.__findParent(article, L2) +## if parentID==None or not self.database.hasArticle(parentID): +## break +## else: msgid=parentID +## for i in range(0, len(L1)): +## self.database.setThreadKey(self.archive, L1[i], '\000'+L2[i]) +## self.database.setThreadKey(self.archive, '\000'+L2[i], L1[i]) + + # + # Public methods: + # + # These are part of the public interface of the T class, but will + # never be overridden (unless you're trying to do something very new). - fp.write("<p><b>Starting:</b> <i>%s</i><br>" - "<b>Ending:</b> <i>%s</i><br><b>Messages:</b> %i<p>" - % (prefs['firstDate'], prefs['endDate'], len(L)) ) + # Update a single archive's indices, whether the archive's been + # dirtied or not. + def update_archive(self, archive): + self.archive=archive + self.message("Updating index files for archive ["+archive+']') + arcdir=os.path.join(self.basedir, archive) + parameters=self.__set_parameters(archive) + # Handle the 3 simple indices first + for i in ['Date', 'Subject', 'Author']: + self.message(" "+i) + self.type=i + # Get the right index + i=string.lower(i) - # Write the index - level=1 - fp.write('<ul>\n') - for indent, keys in L: - if indent>level and indent<=prefs['THRDLEVELS']: - fp.write((indent-level)*'<ul>'+'\n') - if indent<level: fp.write((level-indent)*'</ul>'+'\n') - level=indent - for j in keys: - subj, email, poster, date, datestr, parent, id=articles[j] - # XXX Should we put a mailto URL in here? - fp.write('<li> <A HREF="%s"><b>%s</b></a> <i>%s</i>\n' % - (j, subj, poster) ) - for i in range(0, indent): fp.write('</ul>') - fp.write('<p>') + # Redirect sys.stdout + import sys + f=open(os.path.join(arcdir, i+self.INDEX_EXT), 'w') + os.chmod(f.name, self.FILEMODE) + temp_stdout, sys.stdout=sys.stdout, f + self.write_index_header() + count=0 + # Loop over the index entries + finished=0 + msgid=self.database.first(archive, i) + while (msgid != None): + article=self.database.getArticle(self.archive, msgid) + count=count+1 + self.write_index_entry(article) + msgid = self.database.next(archive, i ) + # Finish up this index + self.write_index_footer() + sys.stdout=temp_stdout + f.close() - # Write the footer - import time - now=time.asctime(time.localtime(time.time())) - -# JV -- Fixed a bug here. - if prefs['ARCHIVES'] <> 'NONE': - otherstr=('<li> <b><a href="%s">Other mail archives</a></b>' % - (prefs['ARCHIVES'],) ) - else: otherstr="" - fp.write('<a name="end"><b>Last message date:</b></a> <i>%s</i><br>' - '<b>Archived on:</b> <i>%s</i><p><ul>' - '<li> <b>Messages sorted by:</b>' - '<a target="toc" href="date.html#start">[ date ]</a>' - '<a target="toc" href="subject.html#start">[ subject ]</a>' - '<a target="toc" href="author.html#start">[ author ]</a>' - '<a target="toc" href="thread.html#start">[ thread ]</a>' - '%s</ul><p>' % (prefs['endDate'], now, otherstr)) + # Print the threaded index + self.message(" Thread") + temp_stdout, sys.stdout=sys.stdout, open(os.path.join(arcdir, 'thread' + self.INDEX_EXT), 'w') + os.chmod(os.path.join(arcdir, 'thread' + self.INDEX_EXT), self.FILEMODE) + self.type='Thread' + self.write_index_header() + + # To handle the prev./next in thread pointers, we need to + # track articles 5 at a time. + + # Get the first 5 articles + L=[ None ]*5 ; i=2 ; finished=0 + msgid=self.database.first(self.archive, 'thread') + while msgid!=None and i<5: + L[i]=self.database.getArticle(self.archive, msgid) ; i=i+1 + msgid = self.database.next(self.archive, 'thread') + + while L[2]!=None: + article=L[2] ; artkey=None + if article!=None: artkey=article.threadKey + if artkey!=None: + import sys + self.write_threadindex_entry(article, string.count(artkey, '-')-1) + if self.database.changed.has_key( (archive,article.msgid) ): + a1=L[1] ; a3=L[3] + self.update_article(arcdir, article, a1, a3) + if a3!=None: self.database.changed[ (archive,a3.msgid) ]=None + if a1!=None: + if not self.database.changed.has_key( (archive,a1.msgid) ): + self.update_article(arcdir, a1, L[0], L[2]) + else: del self.database.changed[ (archive,a1.msgid) ] + L=L[1:] # Rotate the list + if msgid==None: L.append(msgid) + else: L.append( self.database.getArticle(self.archive, msgid) ) + msgid = self.database.next(self.archive, 'thread') - fp.write('<p><hr><i>This archive was generated by ' -# JV Updated the URL. - '<A HREF="http://www.magnet.com/~amk/python/pipermail.html">' - 'Pipermail %s</A>.</i></body></html>' % (VERSION,)) + self.write_index_footer() + sys.stdout=temp_stdout -# Set the hard-wired preferences first -# JV Changed the SHOWHTML pref default to 0 because 1 looks bad. -prefs={'CONFIGFILE':'~/.pmrc', 'MBOX':'mbox', - 'ARCHIVES': 'NONE', 'ABOUT':'NONE', 'REVERSE':0, - 'SHOWHEADERS':0, 'SHOWHTML':0, 'LABEL':"", - 'DIR':'archive', 'DIRMODE':0755, - 'FILEMODE':0644, 'OVERWRITE':0, 'VERBOSE':0, - 'THRDLEVELS':3, 'SHOWBR':0, 'IQUOTES':1, - 'SHOWHR':1, 'MAILCOMMAND':'mailto:%(TO)s', - 'INDEXFILE':'NONE' -} + # Update only archives that have been marked as "changed". + def update_dirty_archives(self): + for i in self._dirty_archives: self.update_archive(i) + self._dirty_archives=[] -# Read the ~/.pmrc file -ReadCfgFile(prefs) -# Read environment variables -ReadEnvironment(prefs) + # Read a Unix mailbox file from the file object <input>, + # and create a series of Article objects. Each article + # object will then be archived. + + def processUnixMailbox(self, input, articleClass=Article): + import mailbox + mbox=mailbox.UnixMailbox(input) + while (1): + m=mbox.next() + if not m: break # End of file reached + a=articleClass(m, self.sequence) # Create an article object + self.sequence=self.sequence+1 # Increment the archive's sequence number + self.add_article(a) # Add the article -# Parse command-line options -import getopt -options, params=getopt.getopt(sys.argv[1:], 'a:b:c:d:l:m:s:uipvxzh?') -for option, value in options: - if option=='-a': prefs['ARCHIVES']=value - if option=='-b': prefs['ABOUT']=value - if option=='-c': prefs['CONFIGFILE']=value - if option=='-d': prefs['DIR']=value -# if option=='-f': prefs.frames=1 - if option=='-i': prefs['MBOX']='-' - if option=='-l': prefs['LABEL']=value - if option=='-m': prefs['MBOX']=value - if option=='-s': prefs['INDEXFILE']=value - if option=='-p' or option=='-v': prefs['VERBOSE']=1 - if option=='-x': prefs['OVERWRITE']=1 - if option=='-z' or option=='-h' or option=='-?': PrintUsage() + # Archive an Article object. + def add_article(self, article): + # Determine into what archives the article should be placed + archives=self.get_archives(article) + if archives==None: archives=[] # If no value was returned, ignore it + if type(archives)==type(''): archives=[archives] # If a string was returned, convert to a list + if archives==[]: return # Ignore the article -# Set up various variables -articles={} ; sequence=0 -for key in ['INDEXFILE', 'MBOX', 'CONFIGFILE', 'DIR']: - prefs[key]=posixpath.expanduser(prefs[key]) - -if prefs['INDEXFILE']=='NONE': - if prefs['MBOX']!='-': - prefs['INDEXFILE']=prefs['MBOX']+'.pipermail' - else: prefs['INDEXFILE']='mbox.pipermail' + # Add the article to each archive in turn + article.filename=filename=self.get_filename(article) + temp=self.format_article(article) # Reformat the article + self.message("Processing article #"+str(article.sequence)+' into archives '+str(archives)) + for i in archives: + self.archive=i + archivedir=os.path.join(self.basedir, i) + # If it's a new archive, create it + if i not in self.archives: + self.archives.append(i) ; self.update_TOC=1 + self.database.newArchive(i) + # If the archive directory doesn't exist, create it + try: os.stat(archivedir) + except os.error, errdata: + errno, errmsg=errdata + if errno==2: + os.mkdir(archivedir, self.DIRMODE) + else: raise os.error, errdata + self.open_new_archive(i, archivedir) + + # Write the HTML-ized article + f=open(os.path.join(archivedir, filename), 'w') + os.chmod(os.path.join(archivedir, filename), self.FILEMODE) + temp_stdout, sys.stdout = sys.stdout, f + self.write_article_header(temp) + sys.stdout.writelines(temp.body) + self.write_article_footer(temp) + sys.stdout=temp_stdout + f.close() -# Read an index file, if one can be found -if not prefs['OVERWRITE']: - # Look for a file contained pickled state - import pickle - try: - if prefs['VERBOSE']: - print 'Attempting to read index file', prefs['INDEXFILE'] - f=open(prefs['INDEXFILE'], 'r') - articles, sequence =pickle.load(f) - f.close() - except IOError: - if prefs['VERBOSE']: print 'No index file found.' - pass # Ignore errors + authorkey=fixAuthor(article.author)+'\000'+article.date + subjectkey=string.lower(article.subject)+'\000'+article.date -# Open the input file -if prefs['MBOX']=='-': prefs['MBOX']=sys.stdin -else: - if prefs['VERBOSE']: print 'Opening input file', prefs['MBOX'] - prefs['MBOX']=open(prefs['MBOX'], 'r') + # Update parenting info + parentID=None + if article.in_reply_to!='': parentID=article.in_reply_to + elif article.references!=[]: + # Remove article IDs that aren't in the archive + refs=filter(lambda x, self=self: self.database.hasArticle(self.archive, x), + article.references) + if len(refs): + refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) + maxdate=refs[0] + for ref in refs[1:]: + if ref.date>maxdate.date: maxdate=ref + parentID=maxdate.msgid + else: + # Get the oldest article with a matching subject, and assume this is + # a follow-up to that article + parentID=self.database.getOldestArticle(self.archive, article.subject) -# Create the destination directory; if it already exists, we don't care -try: - os.mkdir(prefs['DIR'], prefs['DIRMODE']) - if prefs['VERBOSE']: print 'Directory %s created'%(prefs['DIR'],) -except os.error, (errno, errmsg): pass + if parentID!=None and not self.database.hasArticle(self.archive, parentID): + parentID=None + article.parentID=parentID + if parentID!=None: + parent=self.database.getArticle(self.archive, parentID) + article.threadKey=parent.threadKey+article.date+'-' + else: article.threadKey=article.date+'-' + self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, article.msgid) + self.database.addArticle(i, temp, subjectkey, authorkey) + + if i not in self._dirty_archives: + self._dirty_archives.append(i) + del temp -# Create various data structures: -# msgids maps Message-IDs to filenames. -# roots maps Subject lines to (date, filename) tuples, and is used to -# identify the oldest article with a given subject line for threading. + # Abstract methods: these will need to be overridden by subclasses + # before anything useful can be done. -msgids={} ; roots={} -for i in articles.keys(): - subject, email, poster, date, datestr, parent, id =articles[i] - if id: msgids[id]=i - if not roots.has_key(subject) or roots[subject]<date: - roots[subject]=(date, i) + def get_filename(self, article): + pass + def get_archives(self, article): + """Return a list of indexes where the article should be filed. + A string can be returned if the list only contains one entry, + and the empty list is legal.""" + pass + def format_article(self, article): + pass + def write_index_header(self): + pass + def write_index_footer(self): + pass + def write_index_entry(self, article): + pass + def write_threadindex_entry(self, article, depth): + pass + def write_article_header(self, article): + pass + def write_article_footer(self, article): + pass + def write_article_entry(self, article): + pass + def update_article(self, archivedir, article, prev, next): + pass + def write_TOC(self): + pass + def open_new_archive(self, archive, dir): + pass + def message(self, msg): + pass -# Start processing the index -import mailbox -mbox=mailbox.UnixMailbox(prefs['MBOX']) -while (1): - m=mbox.next() - if not m: break - filename='%04i.html' % (sequence,) - if prefs['VERBOSE']: sys.stdout.write("Processing "+filename+"\n") - # The apparently redundant str() actually catches the case where - # m.getheader() returns None. - subj=str(m.getheader('Subject')) - # Remove any number of 'Re:' prefixes from the subject line - while (1): - i=REpat.match(subj) - if i!=-1: subj=subj[i:] - else: break - # Locate an e-mail address - L=m.getheader('From') - # JV: sometimes there is no From header, so use the one from unixfrom. - if not L: +class BSDDBdatabase(Database): + def __init__(self, basedir): + self.__cachekeys=[] ; self.__cachedict={} + self.__currentOpenArchive=None # The currently open indices + self.basedir=os.path.expanduser(basedir) + self.changed={} # Recently added articles, indexed only by message ID + def firstdate(self, archive): + import time + self.__openIndices(archive) + date='None' try: - L = string.split(m.unixfrom)[1] - except: - L = "***Unknown***" - email=None - i=emailpat.search(L) - if i!=-1: - length=emailpat.match(L[i:]) - email=L[i:i+length] - # Remove e-mail addresses inside angle brackets - poster=str(regsub.gsub('<.*>', '', L)) - # Check if there's a name in parentheses - i=paren_name_pat.match(poster) - if i!=-1: poster=paren_name_pat.group(1)[1:-1] - datestr=m.getheader('Date') - # JV -- Hacks to make the getdate work. - # These hacks might skew the post time a bit. - # Crude, but so far, effective. - words = string.split(datestr) - if ((len(words[-1]) == 4) and (len(words) == 5) - and (words[-1][:-1] == '199')): - try: - date = time.mktime(rfc822.parsedate('%s, %s %s %s %s' % - (words[0], words[2], words[1], - words[4], words[3]))) - except: - date = time.mktime(m.getdate('Date')) # Odd - elif len(words) > 4 and words[4][-1] == ',': + date, msgid = self.dateIndex.first() + date=time.asctime(time.localtime(string.atof(date))) + except KeyError: pass + return date + def lastdate(self, archive): + import time + self.__openIndices(archive) + date='None' try: - date = time.mktime(rfc822.parsedate('%s, %s %s %s %s' % - (words[0], words[1], words[2], - words[3], words[4][:-1]))) - except: - date = time.mktime(m.getdate('Date')) # Hmm - else: - try: - date=time.mktime(m.getdate('Date')) - except: - print 'Error getting date!' - print 'Subject = ', m.getheader('subject') - print 'Date = ', m.getheader('date') - - id=m.getheader('Message-Id') - if id: id=id[1:-1] ; msgids[id]=filename - parent=None - in_reply_to=m.getheader('In-Reply-To') - if in_reply_to: - in_reply_to=in_reply_to[1:-1] - if msgids.has_key(in_reply_to): parent=msgids[in_reply_to] - elif roots.has_key(subj) and roots[subj][0]<date: - parent=roots[subj][1] - else: roots[subj]=(date,filename) - - articles[filename]=(subj, email, poster, date, datestr, parent, id) - ProcessMsgBody(prefs, m, filename, articles) - sequence=sequence+1 -prefs['MBOX'].close() + date, msgid = self.dateIndex.last() + date=time.asctime(time.localtime(string.atof(date))) + except KeyError: pass + return date + def numArticles(self, archive): + self.__openIndices(archive) + return len(self.dateIndex) -if prefs['VERBOSE']: sys.stdout.write("Writing date index\n") -import time -indexname=posixpath.join(prefs['DIR'], 'date.html') -f=open(indexname, 'w') ; os.chmod(indexname, prefs['FILEMODE']) -dates=ListDict() -for i in articles.keys(): - subject, email, poster, date, datestr, parent, id=articles[i] - dates[date]=i -L=dates.keys() ; L.sort() -if prefs['REVERSE']: L.reverse() -prefs['firstDate']=time.asctime(time.localtime(L[0])) -prefs['endDate']=time.asctime(time.localtime(L[-1])) -L=map(lambda key, s=dates: (1,s[key]), L) -WriteHTMLIndex(prefs, f, L, articles, 'Date') -f.close() ; del dates, L + # Add a single article to the internal indexes for an archive. -if prefs['VERBOSE']: sys.stdout.write("Writing thread index\n") -indexname=posixpath.join(prefs['DIR'], 'thread.html') -f=open(indexname, 'w') ; os.chmod(indexname, prefs['FILEMODE']) -def DFS(p, N=None, depth=0, prefs=prefs): - set=filter(lambda x, N=N, p=p: p[x][1]==N, p.keys()) - set=map(lambda x, a=articles: (articles[x][3],x), set) - set.sort() - if prefs['REVERSE']: set.reverse() - set=map(lambda x: x[1], set) - if len(set)==0: return [(depth, [N])] - else: - L=[] - for i in set: - L=L+DFS(p, i, depth+1) - return [(depth,[N])]+L -parents={} -for i in articles.keys(): - subject, email, poster, date, datestr, parent, id=articles[i] - parents[i]=(date, parent) -L=DFS(parents)[1:] -WriteHTMLIndex(prefs, f, L, articles, 'Thread') -f.close() ; del L, parents + def addArticle(self, archive, article, subjectkey, authorkey): + import pickle + self.__openIndices(archive) + + # Add the new article + self.dateIndex[article.date]=article.msgid + self.authorIndex[authorkey]=article.msgid + self.subjectIndex[subjectkey]=article.msgid + # Set the 'body' attribute to empty, to avoid storing the whole message + temp = article.body ; article.body=[] + self.articleIndex[article.msgid]=pickle.dumps(article) + article.body=temp + self.changed[archive,article.msgid]=None + + parentID=article.parentID + if parentID!=None and self.articleIndex.has_key(parentID): + parent=self.getArticle(archive, parentID) + myThreadKey=parent.threadKey+article.date+'-' + else: myThreadKey = article.date+'-' + article.threadKey=myThreadKey + self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) + + # Open the BSDDB files that are being used as indices + # (dateIndex, authorIndex, subjectIndex, articleIndex) + def __openIndices(self, archive): + if self.__currentOpenArchive==archive: return -if prefs['VERBOSE']: sys.stdout.write("Writing subject index\n") -indexname=posixpath.join(prefs['DIR'], 'subject.html') -f=open(indexname, 'w') ; os.chmod(indexname, prefs['FILEMODE']) -subjects=ListDict() -for i in articles.keys(): - subject, email, poster, date, datestr, parent, id=articles[i] - subjects[(subject, date)]=i -L=subjects.keys() ; L.sort() ; L=map(lambda key, s=subjects: (1, s[key]), L) -WriteHTMLIndex(prefs, f, L, articles, 'Subject') -f.close() ; del subjects, L + import bsddb + self.__closeIndices() +# print 'opening indices for [%s]' % (repr(archive),) + arcdir=os.path.join(self.basedir, 'database') + try: os.mkdir(arcdir, 0700) + except os.error: pass + for i in ['date', 'author', 'subject', 'article', 'thread']: + t=bsddb.btopen(os.path.join(arcdir, archive+'-'+i), 'c') + setattr(self, i+'Index', t) + self.__currentOpenArchive=archive -if prefs['VERBOSE']: sys.stdout.write("Writing author index\n") -indexname=posixpath.join(prefs['DIR'], 'author.html') -f=open(indexname, 'w') ; os.chmod(indexname, prefs['FILEMODE']) -authors=ListDict() -for i in articles.keys(): - v=articles[i] - authors[(v[2],v[3])]=i -L=authors.keys() ; L.sort() ; L=map(lambda key, s=authors: (1,s[key]), L) -WriteHTMLIndex(prefs, f, L, articles, 'Author') -f.close() ; del authors, L + # Close the BSDDB files that are being used as indices (if they're + # open--this is safe to call if they're already closed) + def __closeIndices(self): + if self.__currentOpenArchive!=None: + pass +# print 'closing indices for [%s]' % (repr(self.__currentOpenArchive),) + for i in ['date', 'author', 'subject', 'thread', 'article']: + attr=i+'Index' + if hasattr(self, attr): + index=getattr(self, attr) + if i=='article': + if not hasattr(self, 'archive_length'): self.archive_length={} + self.archive_length[self.__currentOpenArchive]=len(index) + index.close() + delattr(self,attr) + self.__currentOpenArchive=None + def close(self): + self.__closeIndices() + def hasArticle(self, archive, msgid): + self.__openIndices(archive) + return self.articleIndex.has_key(msgid) + def setThreadKey(self, archive, key, msgid): + self.__openIndices(archive) + self.threadIndex[key]=msgid + def getArticle(self, archive, msgid): + self.__openIndices(archive) + if self.__cachedict.has_key(msgid): + self.__cachekeys.remove(msgid) + self.__cachekeys.append(msgid) + return self.__cachedict[msgid] + if len(self.__cachekeys)==CACHESIZE: + delkey, self.__cachekeys = self.__cachekeys[0], self.__cachekeys[1:] + del self.__cachedict[delkey] + s=self.articleIndex[msgid] + article=pickle.loads(s) + self.__cachekeys.append(msgid) ; self.__cachedict[msgid]=article + return article -if prefs['VERBOSE']: sys.stdout.write("Writing framed index\n") -f=open(posixpath.join(prefs['DIR'], 'blank.html'), 'w') -f.write("<html></html>") ; f.close() -# JV changed... -f=open(posixpath.join(prefs['DIR'], mm_cfg.HOME_PAGE), 'w') -f.write("""<html><head><title>%s Pipermail Archive</title> -<frameset cols="*, 60%%"> -<FRAME SRC="thread.html" NAME=toc> -<FRAME SRC="blank.html" NAME=display> -</frameset></head> -<body><noframes> -To access the %s Pipermail Archive, choose one of the following links: -<p> -Messages sorted by <a target="toc" href="date.html#start">[ date ] </a> -<a target="toc" href="subject.html#start">[ subject ]</a> -<a target="toc" href="author.html#start">[ author ]</a> -<a target="toc" href="thread.html#start">[ thread ]</a> -</ol> -</noframes> -</body></html> -""" % (prefs['LABEL'],prefs['LABEL']) ) + def first(self, archive, index): + self.__openIndices(archive) + index=getattr(self, index+'Index') + try: + key, msgid = index.first() + return msgid + except KeyError: return None + def next(self, archive, index): + self.__openIndices(archive) + index=getattr(self, index+'Index') + try: + key, msgid = index.next() + return msgid + except KeyError: return None + + def getOldestArticle(self, archive, subject): + self.__openIndices(archive) + subject=string.lower(subject) + try: + key, tempid=self.subjectIndex.set_location(subject) + self.subjectIndex.next() + [subject2, date]= string.split(key, '\0') + if subject!=subject2: return None + return tempid + except KeyError: + return None + + def newArchive(self, archive): pass + def clearIndex(self, archive, index): + self.__openIndices(archive) + index=getattr(self, index+'Index') + finished=0 + try: + key, msgid=self.threadIndex.first() + except KeyError: finished=1 + while not finished: + del self.threadIndex[key] + try: + key, msgid=self.threadIndex.next() + except KeyError: finished=1 -import pickle -if prefs['VERBOSE']: print 'Writing index file', prefs['INDEXFILE'] -f=open(prefs['INDEXFILE'], 'w') -pickle.dump( (articles, sequence), f ) -f.close() |
