diff options
| -rw-r--r-- | Mailman/Archiver/HyperArch.py | 34 | ||||
| -rw-r--r-- | Mailman/Archiver/pipermail.py | 702 |
2 files changed, 401 insertions, 335 deletions
diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py index 7a5c47cd7..8429b8356 100644 --- a/Mailman/Archiver/HyperArch.py +++ b/Mailman/Archiver/HyperArch.py @@ -420,13 +420,14 @@ arch_listing_end = '''\ class HyperArchive(pipermail.T): + __super_init = pipermail.T.__init__ # some defaults DIRMODE=02775 FILEMODE=0660 - VERBOSE=0 + VERBOSE=1 DEFAULTINDEX='thread' ARCHIVE_PERIOD='month' @@ -540,23 +541,19 @@ class HyperArchive(pipermail.T): d["archive_listing"] = listing return self.html_TOC_tmpl % d - def __init__(self, maillist,unlock=1): - self.maillist=maillist - self._unlocklist=unlock - self._lock_file=None + def __init__(self, maillist, unlock=1): + self.maillist = maillist + self._unlocklist = unlock + self._lock_file = None - - # # can't init the database while other # processes are writing to it! # XXX TODO- implement native locking # with mailman's LockFile module for HyperDatabase.HyperDatabase # - pipermail.T.__init__( - self, - maillist.archive_dir(), - reload=1, - database=HyperDatabase.HyperDatabase(maillist.archive_dir())) + dir = maillist.archive_dir() + db = HyperDatabase.HyperDatabase(dir) + self.__super_init(dir, reload=1, database=db) if hasattr(self.maillist,'archive_volume_frequency'): if self.maillist.archive_volume_frequency == 0: @@ -738,8 +735,10 @@ class HyperArchive(pipermail.T): def open_new_archive(self, archive, archivedir): index_html=os.path.join(archivedir, 'index.html') - try: os.unlink(index_html) - except: pass + try: + os.unlink(index_html) + except: + pass os.symlink(self.DEFAULTINDEX+'.html',index_html) @@ -930,10 +929,10 @@ class HyperArchive(pipermail.T): self.database.close() del self.database f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w') - pickle.dump(self.__getstate__(), f) + pickle.dump(self.getstate(), f) f.close() - def __getstate__(self): + def getstate(self): d={} for each in self.__dict__.keys(): if not (each in ['maillist','_lock_file','_unlocklist'] @@ -941,9 +940,6 @@ class HyperArchive(pipermail.T): d[each] = self.__dict__[each] return d - - - # Add <A HREF="..."> tags around URLs and e-mail addresses. def __processbody_URLquote(self, source, dest): diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py index ccdbb7c9f..c21cad9c7 100644 --- a/Mailman/Archiver/pipermail.py +++ b/Mailman/Archiver/pipermail.py @@ -1,6 +1,10 @@ #! /usr/bin/env python -import os, sys, string, re +import os +import re +import sys +import string +import time try: import cPickle @@ -12,17 +16,19 @@ from Mailman.Utils import mkdir, open_ex # TBD: ugly, ugly, ugly -baw open = open_ex -__version__='0.05 (Mailman edition)' -VERSION=__version__ -CACHESIZE=100 # Number of slots in the cache +__version__ = '0.05 (Mailman edition)' +VERSION = __version__ +CACHESIZE = 100 # Number of slots in the cache -msgid_pat=re.compile(r'(<.*>)') +msgid_pat = re.compile(r'(<.*>)') def strip_separators(s): "Remove quotes or parenthesization from a Message-ID string" - if s==None or s=="": return "" - if s[0] in '"<([' and s[-1] in '">)]': s=s[1:-1] + if not s: + return "" + if s[0] in '"<([' and s[-1] in '">)]': + s = s[1:-1] return s smallNameParts = ['van', 'von', 'der', 'de'] @@ -30,20 +36,24 @@ smallNameParts = ['van', 'von', 'der', 'de'] def fixAuthor(author): "Canonicalize a name into Last, First format" # If there's a comma, guess that it's already in "Last, First" format - if ',' in author: return author - L=string.split(author) - i=len(L)-1 - if i==0: return author # The string's one word--forget it - if string.upper(author)==author or string.lower(author)==author: + if ',' in author: + return author + L = string.split(author) + i = len(L) - 1 + if i == 0: + return author # The string's one word--forget it + if string.upper(author) == author or string.lower(author) == author: # Damn, the name is all upper- or lower-case. - while i>0 and string.lower(L[i-1]) in smallNameParts: i=i-1 + while i > 0 and string.lower(L[i-1]) in smallNameParts: + i = i - 1 else: # Mixed case; assume that small parts of the last name will be # in lowercase, and check them against the list. while i>0 and (L[i-1][0] in string.lowercase or string.lower(L[i-1]) in smallNameParts): - i=i-1 - author=string.join(L[-1:]+L[i:-1], ' ')+', '+string.join(L[:i], ' ') + i = i - 1 + author = string.join(L[-1:] + L[i:-1], ' ') \ + + ', ' + string.join(L[:i], ' ') return author # Abstract class for databases @@ -66,125 +76,144 @@ class Database: # The Article class encapsulates a single posting. The attributes # are: # -# sequence : Sequence number, unique for each article in a set of archives -# subject : Subject -# datestr : The posting date, in human-readable format -# date : The posting date, in purely numeric format -# headers : Any other headers of interest -# author : The author's name (and possibly organization) -# email : The author's e-mail address -# msgid : A unique message ID -# in_reply_to : If !="", this is the msgid of the article being replied to -# references: A (possibly empty) list of msgid's of earlier articles in the thread -# body : A list of strings making up the message body +# sequence : Sequence number, unique for each article in a set of archives +# subject : Subject +# datestr : The posting date, in human-readable format +# date : The posting date, in purely numeric format +# headers : Any other headers of interest +# author : The author's name (and possibly organization) +# email : The author's e-mail address +# msgid : A unique message ID +# in_reply_to: If != "", this is the msgid of the article being replied to +# references : A (possibly empty) list of msgid's of earlier articles +# in the thread +# body : A list of strings making up the message body class Article: - import time - __last_article_time=time.time() - def __init__(self, message=None, sequence=0, keepHeaders=[]): - import time - if message==None: return - self.sequence=sequence + __last_article_time = time.time() + + def __init__(self, message = None, sequence = 0, keepHeaders = []): + if message is None: + return + self.sequence = sequence - self.parentID = None ; self.threadKey = None + self.parentID = None + self.threadKey = None # otherwise the current sequence number is used. - id=strip_separators(message.getheader('Message-Id')) - if id=="": self.msgid=str(self.sequence) - else: self.msgid=id + id = strip_separators(message.getheader('Message-Id')) + if id == "": + self.msgid = str(self.sequence) + else: self.msgid = id - if message.has_key('Subject'): self.subject=str(message['Subject']) - else: self.subject='No subject' - if self.subject=="": self.subject='No subject' + if message.has_key('Subject'): + self.subject = str(message['Subject']) + else: + self.subject = 'No subject' + if self.subject == "": self.subject = 'No subject' if message.has_key('Date'): - self.datestr=str(message['Date']) - date=message.getdate_tz('Date') + self.datestr = str(message['Date']) + date = message.getdate_tz('Date') else: - self.datestr='None' - date=None - if date!=None: - date, tzoffset=date[:9], date[-1] - date=time.mktime(date)-tzoffset + self.datestr = 'None' + date = None + if date is not None: + date, tzoffset = date[:9], date[-1] + date = time.mktime(date)-tzoffset else: - date=self.__last_article_time+1 ; print 'Article without date:', self.msgid + date = self.__last_article_time+1 + print 'Article without date:', self.msgid - self.__last_article_time=date - self.date='%011i' % (date,) + self.__last_article_time = date + self.date = '%011i' % (date,) # Figure out the e-mail address and poster's name - self.author, self.email=message.getaddr('From') - e=message.getheader('Reply-To') - if e!=None: self.email=e - self.email=strip_separators(self.email) - self.author=strip_separators(self.author) + self.author, self.email = message.getaddr('From') + e = message.getheader('Reply-To') + if e is not None: + self.email = e + self.email = strip_separators(self.email) + self.author = strip_separators(self.author) - if self.author=="": self.author=self.email + if self.author == "": self.author = self.email # Save the 'In-Reply-To:' and 'References:' lines - i_r_t=message.getheader('In-Reply-To') - if i_r_t==None: self.in_reply_to='' + i_r_t = message.getheader('In-Reply-To') + if i_r_t is None: + self.in_reply_to = '' else: - match=msgid_pat.search(i_r_t) - if match==None: self.in_reply_to='' - else: self.in_reply_to=strip_separators(match.group(1)) + match = msgid_pat.search(i_r_t) + if match is None: self.in_reply_to = '' + else: self.in_reply_to = strip_separators(match.group(1)) - references=message.getheader('References') - if references==None: self.references=[] - else: self.references=map(strip_separators, string.split(references)) + references = message.getheader('References') + if references is None: + self.references = [] + else: + self.references = map(strip_separators, + string.split(references)) # Save any other interesting headers - self.headers={} + self.headers = {} for i in keepHeaders: - if message.has_key(i): self.headers[i]=message[i] + if message.has_key(i): + self.headers[i] = message[i] # Read the message body - self.body=[] + self.body = [] message.rewindbody() - while (1): - line=message.fp.readline() - if line=="": break + while 1: + line = message.fp.readline() + if line == "": + break self.body.append(line) def __repr__(self): - return '<Article ID='+repr(self.msgid)+'>' + return '<Article ID = '+repr(self.msgid)+'>' # Pipermail formatter class class T: - DIRMODE=0755 # Mode to give to created directories - FILEMODE=0644 # Mode to give to created files + DIRMODE = 0755 # Mode to give to created directories + FILEMODE = 0644 # Mode to give to created files INDEX_EXT = ".html" # Extension for indexes - def __init__(self, basedir=None, reload=1, database=None): + def __init__(self, basedir = None, reload = 1, database = None): # If basedir isn't provided, assume the current directory - if basedir==None: self.basedir=os.getcwd() + if basedir is None: + self.basedir = os.getcwd() else: - basedir=os.path.expanduser(basedir) - self.basedir=basedir - self.database=database + basedir = os.path.expanduser(basedir) + self.basedir = basedir + self.database = database # If the directory doesn't exist, create it - try: os.stat(self.basedir) + try: + os.stat(self.basedir) except os.error, errdata: errno, errmsg = errdata - if errno!=2: raise os.error, errdata + if errno != 2: + raise os.error, errdata else: - self.message('Creating archive directory '+self.basedir) + self.message('Creating archive directory ' + self.basedir) mkdir(self.basedir, self.DIRMODE) # Try to load previously pickled state try: - if not reload: raise IOError - f=open(os.path.join(self.basedir, 'pipermail.pck'), 'r') + if not reload: + raise IOError + f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r') self.message('Reloading pickled archive state') - d=pickle.load(f) + d = pickle.load(f) f.close() - for key, value in d.items(): setattr(self, key, value) + for key, value in d.items(): + setattr(self, key, value) except IOError: # No pickled version, so initialize various attributes - self.archives=[] # Archives - self._dirty_archives=[] # Archives that will have to be updated - self.sequence=0 # Sequence variable used for numbering articles - self.update_TOC=0 # Does the TOC need updating? + self.archives = [] # Archives + self._dirty_archives = [] # Archives that will have to be updated + self.sequence = 0 # Sequence variable used for + # numbering articles + self.update_TOC = 0 # Does the TOC need updating? # # make the basedir variable work when passed in as an __init__ arg # and different from the one in the pickle. Let the one passed in @@ -195,17 +224,17 @@ class T: self.basedir = basedir def close(self): - "Close an archive, saving its state and updating any changed archives." - self.update_dirty_archives()# Update all changed archives - # If required, update the table of contents - if self.update_TOC or 1: - self.update_TOC=0 + "Close an archive, save its state, and update any changed archives." + self.update_dirty_archives() + if self.update_TOC: + self.update_TOC = 0 self.write_TOC() # Save the collective state - self.message('Pickling archive state into '+os.path.join(self.basedir, 'pipermail.pck')) + self.message('Pickling archive state into ' \ + + os.path.join(self.basedir, 'pipermail.pck')) self.database.close() del self.database - f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w') + f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w') pickle.dump(self.__dict__, f) f.close() @@ -219,43 +248,51 @@ class T: # Create a dictionary of various parameters that will be passed # to the write_index_{header,footer} functions def __set_parameters(self, archive): - import time # Determine the earliest and latest date in the archive - firstdate=self.database.firstdate(archive) - lastdate=self.database.lastdate(archive) + firstdate = self.database.firstdate(archive) + lastdate = self.database.lastdate(archive) # Get the current time - now=time.asctime(time.localtime(time.time())) - self.firstdate=firstdate ; self.lastdate=lastdate - self.archivedate=now ; self.size=self.database.numArticles(archive) - self.archive=archive ; self.version=__version__ + now = time.asctime(time.localtime(time.time())) + self.firstdate = firstdate + self.lastdate = lastdate + self.archivedate = now + self.size = self.database.numArticles(archive) + self.archive = archive + self.version = __version__ # Find the message ID of an article's parent, or return None # if no parent can be found. - def __findParent(self, article, children=[]): - parentID=None - if article.in_reply_to!='': parentID=article.in_reply_to - elif article.references!=[]: + def __findParent(self, article, children = []): + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: # Remove article IDs that aren't in the archive - refs=filter(self.articleIndex.has_key, article.references) - if len(refs): - refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) - maxdate=refs[0] - for i in refs[1:]: - if i.date>maxdate.date: maxdate=i - parentID=maxdate.msgid + refs = filter(self.articleIndex.has_key, article.references) + if not refs: + return None + maxdate = self.database.getArticle(self.archive, + refs[0]) + for ref in refs[1:]: + a = self.database.getArticle(self.archive, ref) + if a.date > maxdate.data: + maxdate = a + parentID = maxdate.msgid else: # Look for the oldest matching subject try: - key, tempid=self.subjectIndex.set_location(article.subject) + key, tempid = \ + self.subjectIndex.set_location(article.subject) print key, tempid self.subjectIndex.next() - [subject, date]= string.split(key, '\0') + [subject, date] = string.split(key, '\0') print article.subject, subject, date - if (subject==article.subject and tempid not in children): - parentID=tempid - except KeyError: pass + if subject == article.subject and tempid not in children: + parentID = tempid + except KeyError: + pass return parentID # Update the threaded index completely @@ -264,40 +301,26 @@ class T: self.database.clearIndex(self.archive, 'thread') # Loop over all the articles - msgid=self.database.first(self.archive, 'date') - while (msgid != None): + msgid = self.database.first(self.archive, 'date') + while msgid is not None: try: - article=self.database.getArticle(self.archive, msgid) + article = self.database.getArticle(self.archive, msgid) except KeyError: pass else: - if article.parentID==None or \ + if article.parentID is None or \ not self.database.hasArticle(self.archive, article.parentID): # then - key=article.date + key = article.date else: - parent=self.database.getArticle(self.archive, + parent = self.database.getArticle(self.archive, article.parentID) - article.threadKey=parent.threadKey+article.date+'-' - self.database.setThreadKey( - self.archive, - article.threadKey+'\000'+ article.msgid, + article.threadKey = parent.threadKey+article.date+'-' + self.database.setThreadKey(self.archive, + article.threadKey + '\000' + article.msgid, msgid) - msgid=self.database.next(self.archive, 'date') - -## L1=[] ; L2=[] -## while (1): -## article=self.database.getArticle(self.archive, msgid) -## L1.append('') ; L2.append(msgid) -## L1=map(lambda x, d=article.date: d+'-'+x, L1) -## parentID=self.__findParent(article, L2) -## if parentID==None or not self.database.hasArticle(parentID): -## break -## else: msgid=parentID -## for i in range(0, len(L1)): -## self.database.setThreadKey(self.archive, L1[i], '\000'+L2[i]) -## self.database.setThreadKey(self.archive, '\000'+L2[i], L1[i]) + msgid = self.database.next(self.archive, 'date') # # Public methods: @@ -308,171 +331,201 @@ class T: # Update a single archive's indices, whether the archive's been # dirtied or not. def update_archive(self, archive): - self.archive=archive - self.message("Updating index files for archive ["+archive+']') - arcdir=os.path.join(self.basedir, archive) - parameters=self.__set_parameters(archive) - # Handle the 3 simple indices first - for i in ['Date', 'Subject', 'Author']: - self.message(" "+i) - self.type=i - # Get the right index - i=string.lower(i) + self.archive = archive + self.message("Updating index files for archive [%s]" % archive) + arcdir = os.path.join(self.basedir, archive) + self.__set_parameters(archive) - # Redirect sys.stdout - import sys - f=open(os.path.join(arcdir, i+self.INDEX_EXT), 'w') -## os.chmod(f.name, self.FILEMODE) - temp_stdout, sys.stdout=sys.stdout, f - self.write_index_header() - count=0 - # Loop over the index entries - finished=0 - msgid=self.database.first(archive, i) - while (msgid != None): - try: - article=self.database.getArticle(self.archive, msgid) - except KeyError: - pass - else: - count=count+1 - self.write_index_entry(article) - msgid = self.database.next(archive, i) - # Finish up this index - self.write_index_footer() - sys.stdout=temp_stdout - f.close() + for hdr in ('Date', 'Subject', 'Author'): + self._update_simple_index(hdr, archive, arcdir) + + self._update_thread_index(archive, arcdir) + + def _update_simple_index(self, hdr, archive, arcdir): + self.message(" " + hdr) + self.type = hdr + hdr = string.lower(hdr) + + self._open_index_file_as_stdout(arcdir, hdr) + self.write_index_header() + count = 0 + # Loop over the index entries + finished = 0 + msgid = self.database.first(archive, hdr) + while msgid is not None: + try: + article = self.database.getArticle(self.archive, msgid) + except KeyError: + pass + else: + count = count + 1 + self.write_index_entry(article) + msgid = self.database.next(archive, hdr) + # Finish up this index + self.write_index_footer() + self._restore_stdout() - # Print the threaded index + def _update_thread_index(self, archive, arcdir): self.message(" Thread") - temp_stdout, sys.stdout=sys.stdout, open(os.path.join(arcdir, 'thread' + self.INDEX_EXT), 'w') -## os.chmod(os.path.join(arcdir, 'thread' + self.INDEX_EXT), self.FILEMODE) - self.type='Thread' + self._open_index_file_as_stdout(arcdir, "thread") + self.type = 'Thread' self.write_index_header() # To handle the prev./next in thread pointers, we need to # track articles 5 at a time. # Get the first 5 articles - L=[ None ]*5 ; i=2 ; finished=0 - msgid=self.database.first(self.archive, 'thread') - while msgid!=None and i<5: - L[i]=self.database.getArticle(self.archive, msgid) ; i=i+1 + L = [None] * 5 + i = 2 + finished = 0 + msgid = self.database.first(self.archive, 'thread') + + while msgid is not None and i < 5: + L[i] = self.database.getArticle(self.archive, msgid) + i = i + 1 msgid = self.database.next(self.archive, 'thread') - while L[2]!=None: - article=L[2] ; artkey=None - if article!=None: artkey=article.threadKey - if artkey!=None: - import sys - self.write_threadindex_entry(article, string.count(artkey, '-')-1) - if self.database.changed.has_key( (archive,article.msgid) ): - a1=L[1] ; a3=L[3] + while L[2] is not None: + article = L[2] + artkey = None + if article is not None: + artkey = article.threadKey + if artkey is not None: + self.write_threadindex_entry(article, + string.count(artkey, '-') - 1) + if self.database.changed.has_key((archive,article.msgid)): + a1 = L[1] + a3 = L[3] self.update_article(arcdir, article, a1, a3) - if a3!=None: self.database.changed[ (archive,a3.msgid) ]=None - if a1!=None: - if not self.database.changed.has_key( (archive,a1.msgid) ): + if a3 is not None: + self.database.changed[(archive, a3.msgid)] = None + if a1 is not None: + key = archive, a1.msgid + if not self.database.changed.has_key(key): self.update_article(arcdir, a1, L[0], L[2]) - else: del self.database.changed[ (archive,a1.msgid) ] - L=L[1:] # Rotate the list - if msgid==None: L.append(msgid) - else: L.append( self.database.getArticle(self.archive, msgid) ) + else: + del self.database.changed[key] + L = L[1:] # Rotate the list + if msgid is None: + L.append(msgid) + else: + L.append(self.database.getArticle(self.archive, msgid)) msgid = self.database.next(self.archive, 'thread') self.write_index_footer() - sys.stdout=temp_stdout + self._restore_stdout() + + def _open_index_file_as_stdout(self, arcdir, index_name): + path = os.path.join(arcdir, index_name + self.INDEX_EXT) + self.__f = open(path, "w") + self.__stdout = sys.stdout + sys.stdout = self.__f + + def _restore_stdout(self): + sys.stdout = self.__stdout + self.__f.close() + del self.__f + del self.__stdout # Update only archives that have been marked as "changed". def update_dirty_archives(self): - for i in self._dirty_archives: self.update_archive(i) - self._dirty_archives=[] + for i in self._dirty_archives: + self.update_archive(i) + self._dirty_archives = [] # Read a Unix mailbox file from the file object <input>, # and create a series of Article objects. Each article # object will then be archived. - def processUnixMailbox(self, input, articleClass=Article): + def processUnixMailbox(self, input, articleClass = Article): import mailbox - mbox=mailbox.UnixMailbox(input) + mbox = mailbox.UnixMailbox(input) while (1): - m=mbox.next() - if not m: break # End of file reached - a=articleClass(m, self.sequence) # Create an article object - self.sequence=self.sequence+1 # Increment the archive's sequence number - self.add_article(a) # Add the article + m = mbox.next() + if not m: + break + a = articleClass(m, self.sequence) + self.sequence = self.sequence + 1 + self.add_article(a) # Archive an Article object. def add_article(self, article): # Determine into what archives the article should be placed - archives=self.get_archives(article) - if archives==None: archives=[] # If no value was returned, ignore it - if type(archives)==type(''): archives=[archives] # If a string was returned, convert to a list - if archives==[]: return # Ignore the article + archives = self.get_archives(article) + if not archives: + return + if type(archives) == type(''): + archives = [archives] # Add the article to each archive in turn - article.filename=filename=self.get_filename(article) - temp=self.format_article(article) # Reformat the article - self.message("Processing article #"+str(article.sequence)+' into archives '+str(archives)) + article.filename = filename = self.get_filename(article) + temp = self.format_article(article) # Reformat the article + self.message("Processing article #" + str(article.sequence)+ \ + "into archives " + str(archives)) for i in archives: - self.archive=i - archivedir=os.path.join(self.basedir, i) + self.archive = i + archivedir = os.path.join(self.basedir, i) # If it's a new archive, create it if i not in self.archives: - self.archives.append(i) ; self.update_TOC=1 + self.archives.append(i) + self.update_TOC = 1 self.database.newArchive(i) # If the archive directory doesn't exist, create it try: os.stat(archivedir) except os.error, errdata: - errno, errmsg=errdata - if errno==2: + errno, errmsg = errdata + if errno == 2: mkdir(archivedir, self.DIRMODE) else: raise os.error, errdata self.open_new_archive(i, archivedir) # Write the HTML-ized article - f=open(os.path.join(archivedir, filename), 'w') -## os.chmod(os.path.join(archivedir, filename), self.FILEMODE) + f = open(os.path.join(archivedir, filename), 'w') temp_stdout, sys.stdout = sys.stdout, f self.write_article_header(temp) sys.stdout.writelines(temp.body) self.write_article_footer(temp) - sys.stdout=temp_stdout + sys.stdout = temp_stdout f.close() - authorkey=fixAuthor(article.author)+'\000'+article.date - subjectkey=string.lower(article.subject)+'\000'+article.date + authorkey = fixAuthor(article.author)+'\000'+article.date + subjectkey = string.lower(article.subject)+'\000'+article.date # Update parenting info - parentID=None - if article.in_reply_to!='': parentID=article.in_reply_to - elif article.references!=[]: - # Remove article IDs that aren't in the archive - refs=filter(lambda x, self=self: self.database.hasArticle(self.archive, x), - article.references) - if len(refs): - refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs) - maxdate=refs[0] - for ref in refs[1:]: - if ref.date>maxdate.date: maxdate=ref - parentID=maxdate.msgid + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: + refs = self._remove_external_references(article.references) + if refs: + maxdata = max(map(lambda ref:ref.data, refs)) + parentID = maxdate.msgid else: - # Get the oldest article with a matching subject, and assume this is - # a follow-up to that article - parentID=self.database.getOldestArticle(self.archive, article.subject) + # Get the oldest article with a matching subject, and + # assume this is a follow-up to that article + parentID = self.database.getOldestArticle(self.archive, + article.subject) - if parentID!=None and not self.database.hasArticle(self.archive, parentID): - parentID=None - article.parentID=parentID - if parentID!=None: - parent=self.database.getArticle(self.archive, parentID) - article.threadKey=parent.threadKey+article.date+'-' - else: article.threadKey=article.date+'-' - self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, article.msgid) + if parentID is not None \ + and not self.database.hasArticle(self.archive, parentID): + parentID = None + article.parentID = parentID + if parentID is not None: + parent = self.database.getArticle(self.archive, parentID) + article.threadKey = parent.threadKey + article.date + '-' + else: + article.threadKey = article.date + '-' + key = article.threadKey + '\000' + article.msgid + self.database.setThreadKey(self.archive, key, article.msgid) self.database.addArticle(i, temp, subjectkey, authorkey) - if i not in self._dirty_archives: self._dirty_archives.append(i) - del temp + + def _remove_external_references(self, refs): + keep = [] + for ref in refs: + if self.database.hasArticle(self.archive, ref): + kepp.append(ref) # Abstract methods: these will need to be overridden by subclasses # before anything useful can be done. @@ -512,27 +565,29 @@ class T: class BSDDBdatabase(Database): def __init__(self, basedir): - self.__cachekeys=[] ; self.__cachedict={} - self.__currentOpenArchive=None # The currently open indices - self.basedir=os.path.expanduser(basedir) - self.changed={} # Recently added articles, indexed only by message ID + self.__cachekeys = [] + self.__cachedict = {} + self.__currentOpenArchive = None # The currently open indices + self.basedir = os.path.expanduser(basedir) + self.changed = {} # Recently added articles, indexed only by + # message ID def firstdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: date, msgid = self.dateIndex.first() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + date = time.asctime(time.localtime(string.atof(date))) + except KeyError: + pass return date def lastdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: date, msgid = self.dateIndex.last() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + date = time.asctime(time.localtime(string.atof(date))) + except KeyError: + pass return date def numArticles(self, archive): self.__openIndices(archive) @@ -544,55 +599,59 @@ class BSDDBdatabase(Database): self.__openIndices(archive) # Add the new article - self.dateIndex[article.date]=article.msgid - self.authorIndex[authorkey]=article.msgid - self.subjectIndex[subjectkey]=article.msgid - # Set the 'body' attribute to empty, to avoid storing the whole message - temp = article.body ; article.body=[] - self.articleIndex[article.msgid]=pickle.dumps(article) - article.body=temp - self.changed[archive,article.msgid]=None + self.dateIndex[article.date] = article.msgid + self.authorIndex[authorkey] = article.msgid + self.subjectIndex[subjectkey] = article.msgid + # Set the 'body' attribute to empty, to avoid storing the + # whole message + temp = article.body + article.body = [] + self.articleIndex[article.msgid] = pickle.dumps(article) + article.body = temp + self.changed[archive,article.msgid] = None - parentID=article.parentID - if parentID!=None and self.articleIndex.has_key(parentID): - parent=self.getArticle(archive, parentID) - myThreadKey=parent.threadKey+article.date+'-' - else: myThreadKey = article.date+'-' - article.threadKey=myThreadKey - self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) + parentID = article.parentID + if parentID is not None and self.articleIndex.has_key(parentID): + parent = self.getArticle(archive, parentID) + myThreadKey = parent.threadKey+article.date + '-' + else: + myThreadKey = article.date + '-' + article.threadKey = myThreadKey + key = myThreadKey + '\000' + article.msgid + self.setThreadKey(archive, key, article.msgid) # Open the BSDDB files that are being used as indices # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive==archive: return + if self.__currentOpenArchive == archive: return import bsddb self.__closeIndices() -# print 'opening indices for [%s]' % (repr(archive),) - arcdir=os.path.join(self.basedir, 'database') + arcdir = os.path.join(self.basedir, 'database') try: mkdir(arcdir) except os.error: pass - for i in ['date', 'author', 'subject', 'article', 'thread']: - t=bsddb.btopen(os.path.join(arcdir, archive+'-'+i), 'c') - setattr(self, i+'Index', t) - self.__currentOpenArchive=archive + for hdr in ('date', 'author', 'subject', 'article', 'thread'): + path = os.path.join(arcdir, archive + '-' + hdr) + t = bsddb.btopen(path, 'c') + setattr(self, hdr + 'Index', t) + self.__currentOpenArchive = archive # Close the BSDDB files that are being used as indices (if they're # open--this is safe to call if they're already closed) def __closeIndices(self): - if self.__currentOpenArchive!=None: + if self.__currentOpenArchive is not None: pass -# print 'closing indices for [%s]' % (repr(self.__currentOpenArchive),) - for i in ['date', 'author', 'subject', 'thread', 'article']: - attr=i+'Index' + for hdr in ('date', 'author', 'subject', 'thread', 'article'): + attr = hdr + 'Index' if hasattr(self, attr): - index=getattr(self, attr) - if i=='article': - if not hasattr(self, 'archive_length'): self.archive_length={} - self.archive_length[self.__currentOpenArchive]=len(index) + index = getattr(self, attr) + if hdr == 'article': + if not hasattr(self, 'archive_length'): + self.archive_length = {} + self.archive_length[self.__currentOpenArchive] = len(index) index.close() delattr(self,attr) - self.__currentOpenArchive=None + self.__currentOpenArchive = None def close(self): self.__closeIndices() def hasArticle(self, archive, msgid): @@ -600,60 +659,71 @@ class BSDDBdatabase(Database): return self.articleIndex.has_key(msgid) def setThreadKey(self, archive, key, msgid): self.__openIndices(archive) - self.threadIndex[key]=msgid + self.threadIndex[key] = msgid def getArticle(self, archive, msgid): self.__openIndices(archive) if self.__cachedict.has_key(msgid): self.__cachekeys.remove(msgid) self.__cachekeys.append(msgid) return self.__cachedict[msgid] - if len(self.__cachekeys)==CACHESIZE: - delkey, self.__cachekeys = self.__cachekeys[0], self.__cachekeys[1:] + if len(self.__cachekeys) == CACHESIZE: + delkey, self.__cachekeys = (self.__cachekeys[0], + self.__cachekeys[1:]) del self.__cachedict[delkey] - s=self.articleIndex[msgid] - article=pickle.loads(s) - self.__cachekeys.append(msgid) ; self.__cachedict[msgid]=article + s = self.articleIndex[msgid] + article = pickle.loads(s) + self.__cachekeys.append(msgid) + self.__cachedict[msgid] = article return article def first(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index+'Index') try: key, msgid = index.first() return msgid - except KeyError: return None + except KeyError: + return None def next(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index+'Index') try: key, msgid = index.next() - return msgid - except KeyError: return None + except KeyError: + return None + else: + return msgid + def getOldestArticle(self, archive, subject): self.__openIndices(archive) - subject=string.lower(subject) + subject = string.lower(subject) try: - key, tempid=self.subjectIndex.set_location(subject) + key, tempid = self.subjectIndex.set_location(subject) self.subjectIndex.next() - [subject2, date]= string.split(key, '\0') - if subject!=subject2: return None + [subject2, date] = string.split(key, '\0') + if subject != subject2: + return None return tempid - except KeyError: + except KeyError: # XXX what line raises the KeyError? return None - def newArchive(self, archive): pass + def newArchive(self, archive): + pass + def clearIndex(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') - finished=0 + index = getattr(self, index+'Index') + finished = 0 try: - key, msgid=self.threadIndex.first() - except KeyError: finished=1 + key, msgid = self.threadIndex.first() + except KeyError: + finished = 1 while not finished: del self.threadIndex[key] try: - key, msgid=self.threadIndex.next() - except KeyError: finished=1 + key, msgid = self.threadIndex.next() + except KeyError: + finished = 1 |
