diff options
| author | cotton | 1998-10-22 21:14:44 +0000 |
|---|---|---|
| committer | cotton | 1998-10-22 21:14:44 +0000 |
| commit | 73b134e83997212d049c58946d9e2d2e2b4b070c (patch) | |
| tree | 9e3bd65b1ac74228498231863970337cc5098097 /Mailman/pipermail.py | |
| parent | 0eb0572a6f7f521c23cd88d13b06fd8c48d15511 (diff) | |
| download | mailman-73b134e83997212d049c58946d9e2d2e2b4b070c.tar.gz mailman-73b134e83997212d049c58946d9e2d2e2b4b070c.tar.zst mailman-73b134e83997212d049c58946d9e2d2e2b4b070c.zip | |
code cleanup: put all the web archive related stuff into it's own sub
package.
Details:
changed Makefile to add the sub package as a directory to make
recursively in
changed configure to make the replacements to
Mailman/Archiver/Makefile.in
changed Mailman/Makefile.in to add Archiver as a sub package
moved Mailman/Archiver.py to Mailman/Archiver/Archiver.py and
Mailman/Hyper* to Mailman/Archiver
Mailman/pipermail.py to Mailman/Archiver/
created Mailman/Archiver/__init__.py to do a "from Archiver import *"
in order to make it's interface identical to previously.
import change: changed imports to import Mailman.<module> from import
<module> in order to accomodate the package import semantics, also
localized <module> by calling <module> = Mailman.<module>. see diffs
for details if this sounds confusing, it's not. this change was
applied to all of the moved files except pipermail, which didn't need
it.
did a basic new installation test to make sure all the
Makefile/configure changes took place atleast basically correctly.
scott
Diffstat (limited to 'Mailman/pipermail.py')
| -rw-r--r-- | Mailman/pipermail.py | 626 |
1 files changed, 0 insertions, 626 deletions
diff --git a/Mailman/pipermail.py b/Mailman/pipermail.py deleted file mode 100644 index c4bdec1cb..000000000 --- a/Mailman/pipermail.py +++ /dev/null @@ -1,626 +0,0 @@ -#!/usr/local/bin/python - -import os, sys, pickle, string, re - -__version__='0.05' -VERSION=__version__ -CACHESIZE=100 # Number of slots in the cache - -msgid_pat=re.compile(r'(<.*>)') -def strip_separators(s): - "Remove quotes or parenthesization from a Message-ID string" - if s==None or s=="": return "" - if s[0] in '"<([' and s[-1] in '">)]': s=s[1:-1] - return s - -smallNameParts = ['van', 'von', 'der', 'de'] - -def fixAuthor(author): - "Canonicalize a name into Last, First format" - # If there's a comma, guess that it's already in "Last, First" format - if ',' in author: return author - L=string.split(author) - i=len(L)-1 - if i==0: return author # The string's one word--forget it - if string.upper(author)==author or string.lower(author)==author: - # Damn, the name is all upper- or lower-case. - while i>0 and string.lower(L[i-1]) in smallNameParts: i=i-1 - else: - # Mixed case; assume that small parts of the last name will be - # in lowercase, and check them against the list. - while i>0 and (L[i-1][0] in string.lowercase or - string.lower(L[i-1]) in smallNameParts): - i=i-1 - author=string.join(L[-1:]+L[i:-1], ' ')+', '+string.join(L[:i], ' ') - return author - -# Abstract class for databases - -class Database: - def __init__(self): pass - def close(self): pass - def getArticle(self, archive, msgid): pass - def hasArticle(self, archive, msgid): pass - def addArticle(self, archive, article, subjectkey, authorkey): pass - def firstdate(self, archive): pass - def lastdate(self, archive): pass - def first(self, archive, index): pass - def next(self, archive, index): pass - def numArticles(self, archive): pass - def newArchive(self, archive): pass - def setThreadKey(self, archive, key, msgid): pass - def getOldestArticle(self, subject): pass - -# The Article class encapsulates a single posting. The attributes -# are: -# -# sequence : Sequence number, unique for each article in a set of archives -# subject : Subject -# datestr : The posting date, in human-readable format -# date : The posting date, in purely numeric format -# headers : Any other headers of interest -# author : The author's name (and possibly organization) -# email : The author's e-mail address -# msgid : A unique message ID -# in_reply_to : If !="", this is the msgid of the article being replied to -# references: A (possibly empty) list of msgid's of earlier articles in the thread -# body : A list of strings making up the message body - -class Article: - import time - __last_article_time=time.time() - def __init__(self, message=None, sequence=0, keepHeaders=[]): - import time - if message==None: return - self.sequence=sequence - - self.parentID = None ; self.threadKey = None - # otherwise the current sequence number is used. - id=strip_separators(message.getheader('Message-Id')) - if id=="": self.msgid=str(self.sequence) - else: self.msgid=id - - if message.has_key('Subject'): self.subject=str(message['Subject']) - else: self.subject='No subject' - if self.subject=="": self.subject='No subject' - - if message.has_key('Date'): - self.datestr=str(message['Date']) - date=message.getdate_tz('Date') - else: - self.datestr='None' - date=None - if date!=None: - date, tzoffset=date[:9], date[-1] - date=time.mktime(date)-tzoffset - else: - date=self.__last_article_time+1 ; print 'Article without date:', self.msgid - - self.__last_article_time=date - self.date='%011i' % (date,) - - # Figure out the e-mail address and poster's name - self.author, self.email=message.getaddr('From') - e=message.getheader('Reply-To') - if e!=None: self.email=e - self.email=strip_separators(self.email) - self.author=strip_separators(self.author) - - if self.author=="": self.author=self.email - - # Save the 'In-Reply-To:' and 'References:' lines - i_r_t=message.getheader('In-Reply-To') - if i_r_t==None: self.in_reply_to='' - else: - match=msgid_pat.search(i_r_t) - if match==None: self.in_reply_to='' - else: self.in_reply_to=strip_separators(match.group(1)) - - references=message.getheader('References') - if references==None: self.references=[] - else: self.references=map(strip_separators, string.split(references)) - - # Save any other interesting headers - self.headers={} - for i in keepHeaders: - if message.has_key(i): self.headers[i]=message[i] - - # Read the message body - self.body=[] - message.rewindbody() - while (1): - line=message.fp.readline() - if line=="": break - self.body.append(line) - def __repr__(self): - return '<Article ID='+repr(self.msgid)+'>' - -# Pipermail formatter class - -class T: - DIRMODE=0755 # Mode to give to created directories - FILEMODE=0644 # Mode to give to created files - INDEX_EXT = ".html" # Extension for indexes - - def __init__(self, basedir=None, reload=1, database=None): - # If basedir isn't provided, assume the current directory - if basedir==None: self.basedir=os.getcwd() - else: - basedir=os.path.expanduser(basedir) - self.basedir=basedir - self.database=database - - # If the directory doesn't exist, create it - try: os.stat(self.basedir) - except os.error, errdata: - errno, errmsg = errdata - if errno!=2: raise os.error, errdata - else: - self.message('Creating archive directory '+self.basedir) - os.mkdir(self.basedir, self.DIRMODE) - - # Try to load previously pickled state - try: - if not reload: raise IOError - f=open(os.path.join(self.basedir, 'pipermail.pck'), 'r') - self.message('Reloading pickled archive state') - d=pickle.load(f) - f.close() - for key, value in d.items(): setattr(self, key, value) - except IOError: - # No pickled version, so initialize various attributes - self.archives=[] # Archives - self._dirty_archives=[] # Archives that will have to be updated - self.sequence=0 # Sequence variable used for numbering articles - self.update_TOC=0 # Does the TOC need updating? - - def close(self): - "Close an archive, saving its state and updating any changed archives." - self.update_dirty_archives()# Update all changed archives - # If required, update the table of contents - if self.update_TOC or 1: - self.update_TOC=0 - self.write_TOC() - # Save the collective state - self.message('Pickling archive state into '+os.path.join(self.basedir, 'pipermail.pck')) - self.database.close() - del self.database - f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w') - pickle.dump(self.__dict__, f) - f.close() - - # - # Private methods - # - # These will be neither overridden nor called by custom archivers. - # - - - # Create a dictionary of various parameters that will be passed - # to the write_index_{header,footer} functions - def __set_parameters(self, archive): - import time - # Determine the earliest and latest date in the archive - firstdate=self.database.firstdate(archive) - lastdate=self.database.lastdate(archive) - - # Get the current time - now=time.asctime(time.localtime(time.time())) - self.firstdate=firstdate ; self.lastdate=lastdate - self.archivedate=now ; self.size=self.database.numArticles(archive) - self.archive=archive ; self.version=__version__ - - # Find the message ID of an article's parent, or return None - # if no parent can be found. - - def __findParent(self, article, children=[]): - parentID=None - if article.in_reply_to!='': parentID=article.in_reply_to - elif article.references!=[]: - # Remove article IDs that aren't in the archive - refs=filter(self.articleIndex.has_key, article.references) - if len(refs): - refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) - maxdate=refs[0] - for i in refs[1:]: - if i.date>maxdate.date: maxdate=i - parentID=maxdate.msgid - else: - # Look for the oldest matching subject - try: - key, tempid=self.subjectIndex.set_location(article.subject) - print key, tempid - self.subjectIndex.next() - [subject, date]= string.split(key, '\0') - print article.subject, subject, date - if (subject==article.subject and tempid not in children): - parentID=tempid - except KeyError: pass - return parentID - - # Update the threaded index completely - def updateThreadedIndex(self): - import pickle, sys - # Erase the threaded index - self.database.clearIndex(self.archive, 'thread') - - # Loop over all the articles - msgid=self.database.first(self.archive, 'date') - while (msgid != None): - article=self.database.getArticle(self.archive, msgid) - if article.parentID==None or not self.database.hasArticle(self.archive, article.parentID): - key=article.date - else: - parent=self.database.getArticle(self.archive, article.parentID) - article.threadKey=parent.threadKey+article.date+'-' - self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, msgid) - msgid=self.database.next(self.archive, 'date') - -## L1=[] ; L2=[] -## while (1): -## article=self.database.getArticle(self.archive, msgid) -## L1.append('') ; L2.append(msgid) -## L1=map(lambda x, d=article.date: d+'-'+x, L1) -## parentID=self.__findParent(article, L2) -## if parentID==None or not self.database.hasArticle(parentID): -## break -## else: msgid=parentID -## for i in range(0, len(L1)): -## self.database.setThreadKey(self.archive, L1[i], '\000'+L2[i]) -## self.database.setThreadKey(self.archive, '\000'+L2[i], L1[i]) - - # - # Public methods: - # - # These are part of the public interface of the T class, but will - # never be overridden (unless you're trying to do something very new). - - # Update a single archive's indices, whether the archive's been - # dirtied or not. - def update_archive(self, archive): - self.archive=archive - self.message("Updating index files for archive ["+archive+']') - arcdir=os.path.join(self.basedir, archive) - parameters=self.__set_parameters(archive) - # Handle the 3 simple indices first - for i in ['Date', 'Subject', 'Author']: - self.message(" "+i) - self.type=i - # Get the right index - i=string.lower(i) - - # Redirect sys.stdout - import sys - f=open(os.path.join(arcdir, i+self.INDEX_EXT), 'w') - os.chmod(f.name, self.FILEMODE) - temp_stdout, sys.stdout=sys.stdout, f - self.write_index_header() - count=0 - # Loop over the index entries - finished=0 - msgid=self.database.first(archive, i) - while (msgid != None): - article=self.database.getArticle(self.archive, msgid) - count=count+1 - self.write_index_entry(article) - msgid = self.database.next(archive, i ) - # Finish up this index - self.write_index_footer() - sys.stdout=temp_stdout - f.close() - - # Print the threaded index - self.message(" Thread") - temp_stdout, sys.stdout=sys.stdout, open(os.path.join(arcdir, 'thread' + self.INDEX_EXT), 'w') - os.chmod(os.path.join(arcdir, 'thread' + self.INDEX_EXT), self.FILEMODE) - self.type='Thread' - self.write_index_header() - - # To handle the prev./next in thread pointers, we need to - # track articles 5 at a time. - - # Get the first 5 articles - L=[ None ]*5 ; i=2 ; finished=0 - msgid=self.database.first(self.archive, 'thread') - while msgid!=None and i<5: - L[i]=self.database.getArticle(self.archive, msgid) ; i=i+1 - msgid = self.database.next(self.archive, 'thread') - - while L[2]!=None: - article=L[2] ; artkey=None - if article!=None: artkey=article.threadKey - if artkey!=None: - import sys - self.write_threadindex_entry(article, string.count(artkey, '-')-1) - if self.database.changed.has_key( (archive,article.msgid) ): - a1=L[1] ; a3=L[3] - self.update_article(arcdir, article, a1, a3) - if a3!=None: self.database.changed[ (archive,a3.msgid) ]=None - if a1!=None: - if not self.database.changed.has_key( (archive,a1.msgid) ): - self.update_article(arcdir, a1, L[0], L[2]) - else: del self.database.changed[ (archive,a1.msgid) ] - L=L[1:] # Rotate the list - if msgid==None: L.append(msgid) - else: L.append( self.database.getArticle(self.archive, msgid) ) - msgid = self.database.next(self.archive, 'thread') - - self.write_index_footer() - sys.stdout=temp_stdout - - # Update only archives that have been marked as "changed". - def update_dirty_archives(self): - for i in self._dirty_archives: self.update_archive(i) - self._dirty_archives=[] - - # Read a Unix mailbox file from the file object <input>, - # and create a series of Article objects. Each article - # object will then be archived. - - def processUnixMailbox(self, input, articleClass=Article): - import mailbox - mbox=mailbox.UnixMailbox(input) - while (1): - m=mbox.next() - if not m: break # End of file reached - a=articleClass(m, self.sequence) # Create an article object - self.sequence=self.sequence+1 # Increment the archive's sequence number - self.add_article(a) # Add the article - - # Archive an Article object. - def add_article(self, article): - # Determine into what archives the article should be placed - archives=self.get_archives(article) - if archives==None: archives=[] # If no value was returned, ignore it - if type(archives)==type(''): archives=[archives] # If a string was returned, convert to a list - if archives==[]: return # Ignore the article - - # Add the article to each archive in turn - article.filename=filename=self.get_filename(article) - temp=self.format_article(article) # Reformat the article - self.message("Processing article #"+str(article.sequence)+' into archives '+str(archives)) - for i in archives: - self.archive=i - archivedir=os.path.join(self.basedir, i) - # If it's a new archive, create it - if i not in self.archives: - self.archives.append(i) ; self.update_TOC=1 - self.database.newArchive(i) - # If the archive directory doesn't exist, create it - try: os.stat(archivedir) - except os.error, errdata: - errno, errmsg=errdata - if errno==2: - os.mkdir(archivedir, self.DIRMODE) - else: raise os.error, errdata - self.open_new_archive(i, archivedir) - - # Write the HTML-ized article - f=open(os.path.join(archivedir, filename), 'w') - os.chmod(os.path.join(archivedir, filename), self.FILEMODE) - temp_stdout, sys.stdout = sys.stdout, f - self.write_article_header(temp) - sys.stdout.writelines(temp.body) - self.write_article_footer(temp) - sys.stdout=temp_stdout - f.close() - - authorkey=fixAuthor(article.author)+'\000'+article.date - subjectkey=string.lower(article.subject)+'\000'+article.date - - # Update parenting info - parentID=None - if article.in_reply_to!='': parentID=article.in_reply_to - elif article.references!=[]: - # Remove article IDs that aren't in the archive - refs=filter(lambda x, self=self: self.database.hasArticle(self.archive, x), - article.references) - if len(refs): - refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) - maxdate=refs[0] - for ref in refs[1:]: - if ref.date>maxdate.date: maxdate=ref - parentID=maxdate.msgid - else: - # Get the oldest article with a matching subject, and assume this is - # a follow-up to that article - parentID=self.database.getOldestArticle(self.archive, article.subject) - - if parentID!=None and not self.database.hasArticle(self.archive, parentID): - parentID=None - article.parentID=parentID - if parentID!=None: - parent=self.database.getArticle(self.archive, parentID) - article.threadKey=parent.threadKey+article.date+'-' - else: article.threadKey=article.date+'-' - self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, article.msgid) - self.database.addArticle(i, temp, subjectkey, authorkey) - - if i not in self._dirty_archives: - self._dirty_archives.append(i) - del temp - - # Abstract methods: these will need to be overridden by subclasses - # before anything useful can be done. - - def get_filename(self, article): - pass - def get_archives(self, article): - """Return a list of indexes where the article should be filed. - A string can be returned if the list only contains one entry, - and the empty list is legal.""" - pass - def format_article(self, article): - pass - def write_index_header(self): - pass - def write_index_footer(self): - pass - def write_index_entry(self, article): - pass - def write_threadindex_entry(self, article, depth): - pass - def write_article_header(self, article): - pass - def write_article_footer(self, article): - pass - def write_article_entry(self, article): - pass - def update_article(self, archivedir, article, prev, next): - pass - def write_TOC(self): - pass - def open_new_archive(self, archive, dir): - pass - def message(self, msg): - pass - - -class BSDDBdatabase(Database): - def __init__(self, basedir): - self.__cachekeys=[] ; self.__cachedict={} - self.__currentOpenArchive=None # The currently open indices - self.basedir=os.path.expanduser(basedir) - self.changed={} # Recently added articles, indexed only by message ID - def firstdate(self, archive): - import time - self.__openIndices(archive) - date='None' - try: - date, msgid = self.dateIndex.first() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass - return date - def lastdate(self, archive): - import time - self.__openIndices(archive) - date='None' - try: - date, msgid = self.dateIndex.last() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass - return date - def numArticles(self, archive): - self.__openIndices(archive) - return len(self.dateIndex) - - # Add a single article to the internal indexes for an archive. - - def addArticle(self, archive, article, subjectkey, authorkey): - import pickle - self.__openIndices(archive) - - # Add the new article - self.dateIndex[article.date]=article.msgid - self.authorIndex[authorkey]=article.msgid - self.subjectIndex[subjectkey]=article.msgid - # Set the 'body' attribute to empty, to avoid storing the whole message - temp = article.body ; article.body=[] - self.articleIndex[article.msgid]=pickle.dumps(article) - article.body=temp - self.changed[archive,article.msgid]=None - - parentID=article.parentID - if parentID!=None and self.articleIndex.has_key(parentID): - parent=self.getArticle(archive, parentID) - myThreadKey=parent.threadKey+article.date+'-' - else: myThreadKey = article.date+'-' - article.threadKey=myThreadKey - self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) - - # Open the BSDDB files that are being used as indices - # (dateIndex, authorIndex, subjectIndex, articleIndex) - def __openIndices(self, archive): - if self.__currentOpenArchive==archive: return - - import bsddb - self.__closeIndices() -# print 'opening indices for [%s]' % (repr(archive),) - arcdir=os.path.join(self.basedir, 'database') - try: os.mkdir(arcdir, 0700) - except os.error: pass - for i in ['date', 'author', 'subject', 'article', 'thread']: - t=bsddb.btopen(os.path.join(arcdir, archive+'-'+i), 'c') - setattr(self, i+'Index', t) - self.__currentOpenArchive=archive - - # Close the BSDDB files that are being used as indices (if they're - # open--this is safe to call if they're already closed) - def __closeIndices(self): - if self.__currentOpenArchive!=None: - pass -# print 'closing indices for [%s]' % (repr(self.__currentOpenArchive),) - for i in ['date', 'author', 'subject', 'thread', 'article']: - attr=i+'Index' - if hasattr(self, attr): - index=getattr(self, attr) - if i=='article': - if not hasattr(self, 'archive_length'): self.archive_length={} - self.archive_length[self.__currentOpenArchive]=len(index) - index.close() - delattr(self,attr) - self.__currentOpenArchive=None - def close(self): - self.__closeIndices() - def hasArticle(self, archive, msgid): - self.__openIndices(archive) - return self.articleIndex.has_key(msgid) - def setThreadKey(self, archive, key, msgid): - self.__openIndices(archive) - self.threadIndex[key]=msgid - def getArticle(self, archive, msgid): - self.__openIndices(archive) - if self.__cachedict.has_key(msgid): - self.__cachekeys.remove(msgid) - self.__cachekeys.append(msgid) - return self.__cachedict[msgid] - if len(self.__cachekeys)==CACHESIZE: - delkey, self.__cachekeys = self.__cachekeys[0], self.__cachekeys[1:] - del self.__cachedict[delkey] - s=self.articleIndex[msgid] - article=pickle.loads(s) - self.__cachekeys.append(msgid) ; self.__cachedict[msgid]=article - return article - - def first(self, archive, index): - self.__openIndices(archive) - index=getattr(self, index+'Index') - try: - key, msgid = index.first() - return msgid - except KeyError: return None - def next(self, archive, index): - self.__openIndices(archive) - index=getattr(self, index+'Index') - try: - key, msgid = index.next() - return msgid - except KeyError: return None - - def getOldestArticle(self, archive, subject): - self.__openIndices(archive) - subject=string.lower(subject) - try: - key, tempid=self.subjectIndex.set_location(subject) - self.subjectIndex.next() - [subject2, date]= string.split(key, '\0') - if subject!=subject2: return None - return tempid - except KeyError: - return None - - def newArchive(self, archive): pass - def clearIndex(self, archive, index): - self.__openIndices(archive) - index=getattr(self, index+'Index') - finished=0 - try: - key, msgid=self.threadIndex.first() - except KeyError: finished=1 - while not finished: - del self.threadIndex[key] - try: - key, msgid=self.threadIndex.next() - except KeyError: finished=1 - - |
