path: root/Mailman/Archiver/pipermail.py
author    cotton 1998-10-22 21:14:44 +0000
committer cotton 1998-10-22 21:14:44 +0000
commit    73b134e83997212d049c58946d9e2d2e2b4b070c (patch)
tree      9e3bd65b1ac74228498231863970337cc5098097 /Mailman/Archiver/pipermail.py
parent    0eb0572a6f7f521c23cd88d13b06fd8c48d15511 (diff)
Diffstat (limited to 'Mailman/Archiver/pipermail.py')
-rw-r--r--  Mailman/Archiver/pipermail.py  626
1 file changed, 626 insertions(+), 0 deletions(-)
diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py
new file mode 100644
index 000000000..c4bdec1cb
--- /dev/null
+++ b/Mailman/Archiver/pipermail.py
@@ -0,0 +1,626 @@
+#!/usr/local/bin/python
+
+import os, sys, pickle, string, re
+
+__version__='0.05'
+VERSION=__version__
+CACHESIZE=100 # Number of slots in the cache
+
+msgid_pat=re.compile(r'(<.*>)')
+def strip_separators(s):
+    "Remove quotes or parenthesization from a Message-ID string"
+    if s==None or s=="": return ""
+    if s[0] in '"<([' and s[-1] in '">)]': s=s[1:-1]
+    return s
+
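+# Hand-traced examples of strip_separators (illustrative only):
+#   strip_separators('<12345@example.com>') => '12345@example.com'
+#   strip_separators('"J. Random Hacker"')  => 'J. Random Hacker'
+#   strip_separators(None)                  => ''
+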
+smallNameParts = ['van', 'von', 'der', 'de']
+
+def fixAuthor(author):
+    "Canonicalize a name into Last, First format"
+    # If there's a comma, guess that it's already in "Last, First" format
+    if ',' in author: return author
+    L=string.split(author)
+    i=len(L)-1
+    if i==0: return author # The string's one word--forget it
+    if string.upper(author)==author or string.lower(author)==author:
+        # Damn, the name is all upper- or lower-case.
+        while i>0 and string.lower(L[i-1]) in smallNameParts:
+            i=i-1
+    else:
+        # Mixed case; assume that small parts of the last name will be
+        # in lowercase, and check them against the list.
+        while i>0 and (L[i-1][0] in string.lowercase or
+                       string.lower(L[i-1]) in smallNameParts):
+            i=i-1
+    author=string.join(L[-1:]+L[i:-1], ' ')+', '+string.join(L[:i], ' ')
+    return author
+
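+# Hand-traced examples of fixAuthor (illustrative only):
+#   fixAuthor('John Smith')       => 'Smith, John'
+#   fixAuthor('Smith, John')      => 'Smith, John'   (comma: left alone)
+#   fixAuthor('Guido van Rossum') => 'Rossum van, Guido'
+# The lowercase particle lands after the surname, so the sorted author
+# index files the last example under 'R' rather than 'v'.
+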
+# Abstract base class for archive databases; BSDDBdatabase (below) is the
+# concrete implementation used by pipermail.
+
+class Database:
+    def __init__(self): pass
+    def close(self): pass
+    def getArticle(self, archive, msgid): pass
+    def hasArticle(self, archive, msgid): pass
+    def addArticle(self, archive, article, subjectkey, authorkey): pass
+    def firstdate(self, archive): pass
+    def lastdate(self, archive): pass
+    def first(self, archive, index): pass
+    def next(self, archive, index): pass
+    def numArticles(self, archive): pass
+    def newArchive(self, archive): pass
+    def setThreadKey(self, archive, key, msgid): pass
+    def getOldestArticle(self, archive, subject): pass
+
+# The Article class encapsulates a single posting. The attributes
+# are:
+#
+# sequence : Sequence number, unique for each article in a set of archives
+# subject : Subject
+# datestr : The posting date, in human-readable format
+# date : The posting date, in purely numeric format
+# headers : Any other headers of interest
+# author : The author's name (and possibly organization)
+# email : The author's e-mail address
+# msgid : A unique message ID
+# in_reply_to : If !="", this is the msgid of the article being replied to
+# references: A (possibly empty) list of msgid's of earlier articles in the thread
+# body : A list of strings making up the message body
+
+class Article:
+    import time
+    __last_article_time=time.time()
+
+    def __init__(self, message=None, sequence=0, keepHeaders=[]):
+        import time
+        if message==None: return
+        self.sequence=sequence
+
+        self.parentID = None ; self.threadKey = None
+        # Get the message ID; if the message doesn't have one,
+        # the current sequence number is used instead.
+        id=strip_separators(message.getheader('Message-Id'))
+        if id=="": self.msgid=str(self.sequence)
+        else: self.msgid=id
+
+        if message.has_key('Subject'): self.subject=str(message['Subject'])
+        else: self.subject='No subject'
+        if self.subject=="": self.subject='No subject'
+
+        if message.has_key('Date'):
+            self.datestr=str(message['Date'])
+            date=message.getdate_tz('Date')
+        else:
+            self.datestr='None'
+            date=None
+        if date!=None:
+            date, tzoffset=date[:9], date[-1]
+            # A missing timezone comes back as None; treat it as UTC
+            if tzoffset==None: tzoffset=0
+            date=time.mktime(date)-tzoffset
+        else:
+            date=self.__last_article_time+1
+            print 'Article without date:', self.msgid
+
+        self.__last_article_time=date
+        self.date='%011i' % (date,)
+
+        # Figure out the e-mail address and poster's name
+        self.author, self.email=message.getaddr('From')
+        e=message.getheader('Reply-To')
+        if e!=None: self.email=e
+        self.email=strip_separators(self.email)
+        self.author=strip_separators(self.author)
+
+        if self.author=="": self.author=self.email
+
+        # Save the 'In-Reply-To:' and 'References:' lines
+        i_r_t=message.getheader('In-Reply-To')
+        if i_r_t==None: self.in_reply_to=''
+        else:
+            match=msgid_pat.search(i_r_t)
+            if match==None: self.in_reply_to=''
+            else: self.in_reply_to=strip_separators(match.group(1))
+
+        references=message.getheader('References')
+        if references==None: self.references=[]
+        else: self.references=map(strip_separators, string.split(references))
+
+        # Save any other interesting headers
+        self.headers={}
+        for i in keepHeaders:
+            if message.has_key(i): self.headers[i]=message[i]
+
+        # Read the message body
+        self.body=[]
+        message.rewindbody()
+        while (1):
+            line=message.fp.readline()
+            if line=="": break
+            self.body.append(line)
+
+    def __repr__(self):
+        return '<Article ID='+repr(self.msgid)+'>'
+
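+# A minimal construction sketch (illustrative only; the mbox path is
+# hypothetical):
+#
+#   import rfc822
+#   f=open('/tmp/sample.mbox')
+#   m=rfc822.Message(f)
+#   a=Article(m, sequence=1)
+#   print a.msgid, a.subject, a.date
+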
+# Pipermail formatter class (named simply 'T'); concrete archivers
+# subclass it and supply the abstract methods defined near its end.
+
+class T:
+    DIRMODE=0755        # Mode to give to created directories
+    FILEMODE=0644       # Mode to give to created files
+    INDEX_EXT = ".html" # Extension for indexes
+
+    def __init__(self, basedir=None, reload=1, database=None):
+        # If basedir isn't provided, assume the current directory
+        if basedir==None: self.basedir=os.getcwd()
+        else:
+            basedir=os.path.expanduser(basedir)
+            self.basedir=basedir
+        self.database=database
+
+        # If the directory doesn't exist, create it
+        try: os.stat(self.basedir)
+        except os.error, errdata:
+            errno, errmsg = errdata
+            if errno!=2: raise os.error, errdata
+            else:
+                self.message('Creating archive directory '+self.basedir)
+                os.mkdir(self.basedir, self.DIRMODE)
+
+        # Try to load previously pickled state
+        try:
+            if not reload: raise IOError
+            f=open(os.path.join(self.basedir, 'pipermail.pck'), 'r')
+            self.message('Reloading pickled archive state')
+            d=pickle.load(f)
+            f.close()
+            for key, value in d.items(): setattr(self, key, value)
+        except IOError:
+            # No pickled version, so initialize various attributes
+            self.archives=[]         # Archives
+            self._dirty_archives=[]  # Archives that will have to be updated
+            self.sequence=0          # Sequence number for articles
+            self.update_TOC=0        # Does the TOC need updating?
+
+    def close(self):
+        "Close an archive, saving its state and updating any changed archives."
+        self.update_dirty_archives()    # Update all changed archives
+        # Update the table of contents (the 'or 1' currently forces a
+        # TOC rewrite on every close)
+        if self.update_TOC or 1:
+            self.update_TOC=0
+            self.write_TOC()
+        # Save the collective state
+        self.message('Pickling archive state into '
+                     +os.path.join(self.basedir, 'pipermail.pck'))
+        self.database.close()
+        del self.database
+        f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
+        pickle.dump(self.__dict__, f)
+        f.close()
+
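+    # What gets pickled is just self.__dict__ (minus the database), e.g.
+    # (abridged, with illustrative values):
+    #   {'basedir': '/var/mailman/archives', 'archives': ['all'],
+    #    'sequence': 42, '_dirty_archives': [], 'update_TOC': 0, ...}
+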
+    #
+    # Private methods
+    #
+    # These will be neither overridden nor called by custom archivers.
+    #
+
+    # Set various parameters (as instance attributes) that will be
+    # used by the write_index_{header,footer} functions
+    def __set_parameters(self, archive):
+        import time
+        # Determine the earliest and latest date in the archive
+        firstdate=self.database.firstdate(archive)
+        lastdate=self.database.lastdate(archive)
+
+        # Get the current time
+        now=time.asctime(time.localtime(time.time()))
+        self.firstdate=firstdate ; self.lastdate=lastdate
+        self.archivedate=now ; self.size=self.database.numArticles(archive)
+        self.archive=archive ; self.version=__version__
+
+    # Find the message ID of an article's parent, or return None
+    # if no parent can be found.
+    def __findParent(self, article, children=[]):
+        parentID=None
+        if article.in_reply_to!='': parentID=article.in_reply_to
+        elif article.references!=[]:
+            # Remove article IDs that aren't in the archive
+            refs=filter(self.articleIndex.has_key, article.references)
+            if len(refs):
+                refs=map(lambda x, s=self: s.database.getArticle(s.archive, x),
+                         refs)
+                maxdate=refs[0]
+                for i in refs[1:]:
+                    if i.date>maxdate.date: maxdate=i
+                parentID=maxdate.msgid
+        else:
+            # Look for the oldest matching subject
+            try:
+                key, tempid=self.subjectIndex.set_location(article.subject)
+                print key, tempid
+                self.subjectIndex.next()
+                [subject, date]= string.split(key, '\0')
+                print article.subject, subject, date
+                if (subject==article.subject and tempid not in children):
+                    parentID=tempid
+            except KeyError: pass
+        return parentID
+
+    # Update the threaded index completely
+    def updateThreadedIndex(self):
+        import pickle, sys
+        # Erase the threaded index
+        self.database.clearIndex(self.archive, 'thread')
+
+        # Loop over all the articles
+        msgid=self.database.first(self.archive, 'date')
+        while (msgid != None):
+            article=self.database.getArticle(self.archive, msgid)
+            if (article.parentID==None or
+                not self.database.hasArticle(self.archive, article.parentID)):
+                key=article.date
+            else:
+                parent=self.database.getArticle(self.archive, article.parentID)
+                article.threadKey=parent.threadKey+article.date+'-'
+            self.database.setThreadKey(self.archive,
+                                       article.threadKey+'\000'+article.msgid,
+                                       msgid)
+            msgid=self.database.next(self.archive, 'date')
+
+## L1=[] ; L2=[]
+## while (1):
+## article=self.database.getArticle(self.archive, msgid)
+## L1.append('') ; L2.append(msgid)
+## L1=map(lambda x, d=article.date: d+'-'+x, L1)
+## parentID=self.__findParent(article, L2)
+## if parentID==None or not self.database.hasArticle(parentID):
+## break
+## else: msgid=parentID
+## for i in range(0, len(L1)):
+## self.database.setThreadKey(self.archive, L1[i], '\000'+L2[i])
+## self.database.setThreadKey(self.archive, '\000'+L2[i], L1[i])
+
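+# How thread keys sort (illustrative): a parent's key is a prefix of each
+# child's key, so walking the thread btree in key order yields a depth-first
+# traversal of every thread.  With B and C both replies to A:
+#   A (date 00000000001): threadKey '00000000001-'
+#   B (date 00000000005): threadKey '00000000001-00000000005-'
+#   C (date 00000000009): threadKey '00000000001-00000000009-'
+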
+ #
+ # Public methods:
+ #
+ # These are part of the public interface of the T class, but will
+ # never be overridden (unless you're trying to do something very new).
+
+    # Update a single archive's indices, whether the archive's been
+    # dirtied or not.
+    def update_archive(self, archive):
+        self.archive=archive
+        self.message("Updating index files for archive ["+archive+']')
+        arcdir=os.path.join(self.basedir, archive)
+        self.__set_parameters(archive)
+        # Handle the 3 simple indices first
+        for i in ['Date', 'Subject', 'Author']:
+            self.message("  "+i)
+            self.type=i
+            # Get the right index
+            i=string.lower(i)
+
+            # Redirect sys.stdout, so the write_index_* hooks can
+            # simply print their output
+            f=open(os.path.join(arcdir, i+self.INDEX_EXT), 'w')
+            os.chmod(f.name, self.FILEMODE)
+            temp_stdout, sys.stdout=sys.stdout, f
+            self.write_index_header()
+            count=0
+            # Loop over the index entries
+            msgid=self.database.first(archive, i)
+            while (msgid != None):
+                article=self.database.getArticle(self.archive, msgid)
+                count=count+1
+                self.write_index_entry(article)
+                msgid = self.database.next(archive, i)
+            # Finish up this index
+            self.write_index_footer()
+            sys.stdout=temp_stdout
+            f.close()
+
+        # Print the threaded index
+        self.message("  Thread")
+        temp_stdout, sys.stdout = sys.stdout, open(
+            os.path.join(arcdir, 'thread'+self.INDEX_EXT), 'w')
+        os.chmod(os.path.join(arcdir, 'thread'+self.INDEX_EXT), self.FILEMODE)
+        self.type='Thread'
+        self.write_index_header()
+
+        # To handle the prev./next in thread pointers, we need to
+        # track articles 5 at a time.
+
+        # Get the first 5 articles
+        L=[ None ]*5 ; i=2
+        msgid=self.database.first(self.archive, 'thread')
+        while msgid!=None and i<5:
+            L[i]=self.database.getArticle(self.archive, msgid) ; i=i+1
+            msgid = self.database.next(self.archive, 'thread')
+
+        while L[2]!=None:
+            article=L[2] ; artkey=None
+            if article!=None: artkey=article.threadKey
+            if artkey!=None:
+                self.write_threadindex_entry(article,
+                                             string.count(artkey, '-')-1)
+                if self.database.changed.has_key( (archive,article.msgid) ):
+                    a1=L[1] ; a3=L[3]
+                    self.update_article(arcdir, article, a1, a3)
+                    if a3!=None: self.database.changed[ (archive,a3.msgid) ]=None
+                    if a1!=None:
+                        if not self.database.changed.has_key( (archive,a1.msgid) ):
+                            self.update_article(arcdir, a1, L[0], L[2])
+                        else: del self.database.changed[ (archive,a1.msgid) ]
+            L=L[1:] # Rotate the list
+            if msgid==None: L.append(msgid)
+            else: L.append( self.database.getArticle(self.archive, msgid) )
+            msgid = self.database.next(self.archive, 'thread')
+
+        self.write_index_footer()
+        sys.stdout=temp_stdout
+
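+    # The sliding window, sketched: L holds five consecutive articles in
+    # thread order, with the current article at L[2]:
+    #     L[0]  L[1]  L[2]  L[3]  L[4]
+    #           prev   cur  next
+    # so update_article() always has the current article's neighbours in
+    # the thread at hand when rewriting its prev/next links.
+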
+    # Update only archives that have been marked as "changed".
+    def update_dirty_archives(self):
+        for i in self._dirty_archives: self.update_archive(i)
+        self._dirty_archives=[]
+
+    # Read a Unix mailbox file from the file object <input>,
+    # and create a series of Article objects. Each article
+    # object will then be archived.
+    def processUnixMailbox(self, input, articleClass=Article):
+        import mailbox
+        mbox=mailbox.UnixMailbox(input)
+        while (1):
+            m=mbox.next()
+            if not m: break                  # End of file reached
+            a=articleClass(m, self.sequence) # Create an article object
+            self.sequence=self.sequence+1    # Increment the sequence number
+            self.add_article(a)              # Add the article
+
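+    # Typical driver call, sketched (assumes a concrete subclass instance;
+    # see the fuller sketch after this class):
+    #   archiver.processUnixMailbox(open('input.mbox'))
+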
+    # Archive an Article object.
+    def add_article(self, article):
+        # Determine into what archives the article should be placed
+        archives=self.get_archives(article)
+        if archives==None: archives=[]  # If no value was returned, ignore it
+        if type(archives)==type(''):
+            archives=[archives]         # A string becomes a one-element list
+        if archives==[]: return         # Ignore the article
+
+        # Add the article to each archive in turn
+        article.filename=filename=self.get_filename(article)
+        temp=self.format_article(article) # Reformat the article
+        self.message("Processing article #"+str(article.sequence)
+                     +' into archives '+str(archives))
+        for i in archives:
+            self.archive=i
+            archivedir=os.path.join(self.basedir, i)
+            # If it's a new archive, create it
+            if i not in self.archives:
+                self.archives.append(i) ; self.update_TOC=1
+                self.database.newArchive(i)
+                # If the archive directory doesn't exist, create it
+                try: os.stat(archivedir)
+                except os.error, errdata:
+                    errno, errmsg=errdata
+                    if errno==2:
+                        os.mkdir(archivedir, self.DIRMODE)
+                    else: raise os.error, errdata
+                self.open_new_archive(i, archivedir)
+
+            # Write the HTML-ized article
+            f=open(os.path.join(archivedir, filename), 'w')
+            os.chmod(os.path.join(archivedir, filename), self.FILEMODE)
+            temp_stdout, sys.stdout = sys.stdout, f
+            self.write_article_header(temp)
+            sys.stdout.writelines(temp.body)
+            self.write_article_footer(temp)
+            sys.stdout=temp_stdout
+            f.close()
+
+            authorkey=fixAuthor(article.author)+'\000'+article.date
+            subjectkey=string.lower(article.subject)+'\000'+article.date
+
+            # Update parenting info
+            parentID=None
+            if article.in_reply_to!='': parentID=article.in_reply_to
+            elif article.references!=[]:
+                # Remove article IDs that aren't in the archive
+                refs=filter(lambda x, self=self:
+                            self.database.hasArticle(self.archive, x),
+                            article.references)
+                if len(refs):
+                    refs=map(lambda x, s=self:
+                             s.database.getArticle(s.archive, x), refs)
+                    maxdate=refs[0]
+                    for ref in refs[1:]:
+                        if ref.date>maxdate.date: maxdate=ref
+                    parentID=maxdate.msgid
+            else:
+                # Get the oldest article with a matching subject, and
+                # assume this is a follow-up to that article
+                parentID=self.database.getOldestArticle(self.archive,
+                                                        article.subject)
+
+            if parentID!=None and not self.database.hasArticle(self.archive,
+                                                               parentID):
+                parentID=None
+            article.parentID=parentID
+            if parentID!=None:
+                parent=self.database.getArticle(self.archive, parentID)
+                article.threadKey=parent.threadKey+article.date+'-'
+            else: article.threadKey=article.date+'-'
+            self.database.setThreadKey(self.archive,
+                                       article.threadKey+'\000'+article.msgid,
+                                       article.msgid)
+            self.database.addArticle(i, temp, subjectkey, authorkey)
+
+            if i not in self._dirty_archives:
+                self._dirty_archives.append(i)
+        del temp
+
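+    # Parent resolution order, traced on a hypothetical message
+    # (illustrative):
+    #   In-Reply-To: <a@x>
+    #   References: <b@x> <c@x>
+    # => parent is <a@x>.  Without In-Reply-To, the parent is whichever of
+    # <b@x>/<c@x> is archived with the latest date; with neither header,
+    # the oldest archived article with the same subject is assumed.
+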
+    # Abstract methods: these will need to be overridden by subclasses
+    # before anything useful can be done.
+
+    def get_filename(self, article):
+        pass
+    def get_archives(self, article):
+        """Return a list of indexes where the article should be filed.
+        A string can be returned if the list only contains one entry,
+        and the empty list is legal."""
+        pass
+    def format_article(self, article):
+        pass
+    def write_index_header(self):
+        pass
+    def write_index_footer(self):
+        pass
+    def write_index_entry(self, article):
+        pass
+    def write_threadindex_entry(self, article, depth):
+        pass
+    def write_article_header(self, article):
+        pass
+    def write_article_footer(self, article):
+        pass
+    def write_article_entry(self, article):
+        pass
+    def update_article(self, archivedir, article, prev, next):
+        pass
+    def write_TOC(self):
+        pass
+    def open_new_archive(self, archive, dir):
+        pass
+    def message(self, msg):
+        pass
+
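+# A minimal concrete archiver, sketched for illustration only (the class
+# name, archive name, and paths here are hypothetical, not part of
+# pipermail):
+#
+#   class SimpleArchiver(T):
+#       def get_archives(self, article):
+#           return 'all'                    # One archive for everything
+#       def get_filename(self, article):
+#           return str(article.sequence)+'.html'
+#       def format_article(self, article):
+#           return article                  # No reformatting
+#       def message(self, msg):
+#           sys.stderr.write(msg+'\n')
+#
+#   db=BSDDBdatabase('~/archive')
+#   archiver=SimpleArchiver(basedir='~/archive', database=db)
+#   archiver.processUnixMailbox(open('input.mbox'))
+#   archiver.close()
+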
+
+class BSDDBdatabase(Database):
+    def __init__(self, basedir):
+        self.__cachekeys=[] ; self.__cachedict={}
+        self.__currentOpenArchive=None  # The currently open indices
+        self.basedir=os.path.expanduser(basedir)
+        self.changed={} # Recently added articles, keyed by (archive, msgid)
+
+    def firstdate(self, archive):
+        import time
+        self.__openIndices(archive)
+        date='None'
+        try:
+            date, msgid = self.dateIndex.first()
+            date=time.asctime(time.localtime(string.atof(date)))
+        except KeyError: pass
+        return date
+
+    def lastdate(self, archive):
+        import time
+        self.__openIndices(archive)
+        date='None'
+        try:
+            date, msgid = self.dateIndex.last()
+            date=time.asctime(time.localtime(string.atof(date)))
+        except KeyError: pass
+        return date
+
+    def numArticles(self, archive):
+        self.__openIndices(archive)
+        return len(self.dateIndex)
+
+    # Add a single article to the internal indexes for an archive.
+    def addArticle(self, archive, article, subjectkey, authorkey):
+        import pickle
+        self.__openIndices(archive)
+
+        # Add the new article
+        self.dateIndex[article.date]=article.msgid
+        self.authorIndex[authorkey]=article.msgid
+        self.subjectIndex[subjectkey]=article.msgid
+        # Set the 'body' attribute to empty, to avoid storing the
+        # whole message in the article index
+        temp = article.body ; article.body=[]
+        self.articleIndex[article.msgid]=pickle.dumps(article)
+        article.body=temp
+        self.changed[archive,article.msgid]=None
+
+        parentID=article.parentID
+        if parentID!=None and self.articleIndex.has_key(parentID):
+            parent=self.getArticle(archive, parentID)
+            myThreadKey=parent.threadKey+article.date+'-'
+        else: myThreadKey = article.date+'-'
+        article.threadKey=myThreadKey
+        self.setThreadKey(archive, myThreadKey+'\000'+article.msgid,
+                          article.msgid)
+
+    # Open the BSDDB files that are being used as indices (dateIndex,
+    # authorIndex, subjectIndex, articleIndex, threadIndex)
+    def __openIndices(self, archive):
+        if self.__currentOpenArchive==archive: return
+
+        import bsddb
+        self.__closeIndices()
+        arcdir=os.path.join(self.basedir, 'database')
+        try: os.mkdir(arcdir, 0700)
+        except os.error: pass
+        for i in ['date', 'author', 'subject', 'article', 'thread']:
+            t=bsddb.btopen(os.path.join(arcdir, archive+'-'+i), 'c')
+            setattr(self, i+'Index', t)
+        self.__currentOpenArchive=archive
+
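+    # The indices are btrees, so first()/next() walk keys in sorted order.
+    # This is why Article.date is zero-padded to a fixed width -- string
+    # order then agrees with numeric order (illustrative):
+    #   '00000876543' < '00012345678'   (padded: correct)
+    #   '876543'      > '12345678'      (unpadded: lexicographically wrong)
+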
+    # Close the BSDDB files that are being used as indices (if they're
+    # open--this is safe to call if they're already closed)
+    def __closeIndices(self):
+        for i in ['date', 'author', 'subject', 'thread', 'article']:
+            attr=i+'Index'
+            if hasattr(self, attr):
+                index=getattr(self, attr)
+                if i=='article':
+                    # Remember how many articles this archive holds
+                    if not hasattr(self, 'archive_length'):
+                        self.archive_length={}
+                    self.archive_length[self.__currentOpenArchive]=len(index)
+                index.close()
+                delattr(self, attr)
+        self.__currentOpenArchive=None
+
+    def close(self):
+        self.__closeIndices()
+    def hasArticle(self, archive, msgid):
+        self.__openIndices(archive)
+        return self.articleIndex.has_key(msgid)
+
+    def setThreadKey(self, archive, key, msgid):
+        self.__openIndices(archive)
+        self.threadIndex[key]=msgid
+
+    def getArticle(self, archive, msgid):
+        self.__openIndices(archive)
+        if self.__cachedict.has_key(msgid):
+            # Cache hit: move this key to the most-recently-used end
+            self.__cachekeys.remove(msgid)
+            self.__cachekeys.append(msgid)
+            return self.__cachedict[msgid]
+        # Cache miss: evict the least-recently-used article if the
+        # cache is full, then unpickle the requested one
+        if len(self.__cachekeys)==CACHESIZE:
+            delkey, self.__cachekeys = self.__cachekeys[0], self.__cachekeys[1:]
+            del self.__cachedict[delkey]
+        s=self.articleIndex[msgid]
+        article=pickle.loads(s)
+        self.__cachekeys.append(msgid) ; self.__cachedict[msgid]=article
+        return article
+
+    def first(self, archive, index):
+        self.__openIndices(archive)
+        index=getattr(self, index+'Index')
+        try:
+            key, msgid = index.first()
+            return msgid
+        except KeyError: return None
+
+    def next(self, archive, index):
+        self.__openIndices(archive)
+        index=getattr(self, index+'Index')
+        try:
+            key, msgid = index.next()
+            return msgid
+        except KeyError: return None
+
+    def getOldestArticle(self, archive, subject):
+        self.__openIndices(archive)
+        subject=string.lower(subject)
+        try:
+            key, tempid=self.subjectIndex.set_location(subject)
+            self.subjectIndex.next()
+            [subject2, date]= string.split(key, '\0')
+            if subject!=subject2: return None
+            return tempid
+        except KeyError:
+            return None
+
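+    # subjectIndex keys have the form subject+'\000'+date, so
+    # set_location(subject) seeks to the first key >= the bare subject,
+    # which is the oldest posting with that subject (illustrative):
+    #   'some subject'+'\000'+'00000000001'  -> returned
+    #   'some subject'+'\000'+'00000000009'  -> skipped (newer)
+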
+    def newArchive(self, archive): pass
+
+    # Erase the thread index.  (The 'index' argument is looked up, but the
+    # loop below always operates on the thread index.)
+    def clearIndex(self, archive, index):
+        self.__openIndices(archive)
+        index=getattr(self, index+'Index')
+        finished=0
+        try:
+            key, msgid=self.threadIndex.first()
+        except KeyError: finished=1
+        while not finished:
+            del self.threadIndex[key]
+            try:
+                key, msgid=self.threadIndex.next()
+            except KeyError: finished=1
+
+