diff options
| author | jhylton | 2000-09-22 18:23:37 +0000 |
|---|---|---|
| committer | jhylton | 2000-09-22 18:23:37 +0000 |
| commit | ad4ba52f694b2f026be863ab869b8d7ca83ab03c (patch) | |
| tree | 81c03da4aaa50d7c20b3f84609889dbc5dec66ed | |
| parent | bf6d28c3c757820f428772472379cc8848948a85 (diff) | |
| download | mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.gz mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.zst mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.zip | |
Fix index generation bug that occasionally prevented messages from
appearing in index. pipermail generated several indexes by assuming
that date was unique. If two messages arrived with, e.g., the same
author and date, then the author index treated them as identical.
As a result, both messages were archived, but only the last one was
included in the index. Solution is to always include the msgid, which
is unique, in the index key.
Change database keys to combine elements using tuples instead of
string concatenation with \000 as separator.
Fix was accomplished by refactoring on pipermail.Database and its
subclasses. Push index-key generation into common concrete base class
Database; rename abstract base class to DatabaseInterface. Break up
addArticle method into several pieces.
TBD There is still more refactoring to do on Database class.
Because date key has changed, HyperDatabase method to return first and
last date changed to reflect format of date key.
Refactor pipermail.T.add_article into several pieces.
| -rw-r--r-- | Mailman/Archiver/HyperDatabase.py | 134 | ||||
| -rw-r--r-- | Mailman/Archiver/pipermail.py | 228 |
2 files changed, 202 insertions, 160 deletions
diff --git a/Mailman/Archiver/HyperDatabase.py b/Mailman/Archiver/HyperDatabase.py index a5779a7c7..c49b487be 100644 --- a/Mailman/Archiver/HyperDatabase.py +++ b/Mailman/Archiver/HyperDatabase.py @@ -20,6 +20,8 @@ import os import marshal import string +import sys +import time import errno # @@ -39,7 +41,6 @@ try: except ImportError: import pickle - # # we're using a python dict in place of # of bsddb.btree database. only defining @@ -47,12 +48,17 @@ except ImportError: # only one thing can access this at a time. # class DumbBTree: - # XXX This dictionary-like object stores pickles of all the - # Article objects. The object itself is stored using marshal. It - # would be much simpler, and probably faster, to store the actual - # objects in the DumbBTree and pickle it. - # XXX Also needs a more sensible name, like IteratableDictionary - # or SortedDictionary. + """Stores pickles of Article objects + + This dictionary-like object stores pickles of all the Article + objects. The object itself is stored using marshal. It would be + much simpler, and probably faster, to store the actual objects in + the DumbBTree and pickle it. + + TBD: Also needs a more sensible name, like IteratableDictionary or + SortedDictionary. + """ + def __init__(self, path): self.current_index = 0 self.path = path @@ -75,6 +81,9 @@ class DumbBTree: else: self.__sort(dirty=1) + def __repr__(self): + return "DumbBTree(%s)" % self.path + def __sort(self, dirty=None): if self.__dirty == 1 or dirty: self.sorted = self.dict.keys() @@ -123,9 +132,8 @@ class DumbBTree: raise KeyError else: key = self.sorted[0] - res = key, self.dict[key] self.current_index = 1 - return res + return key, self.dict[key] def last(self): if not self.sorted: @@ -178,7 +186,6 @@ class DumbBTree: fp.close() self.unlock() - # this is lifted straight out of pipermail with # the bsddb.btree replaced with above class. 
@@ -186,6 +193,8 @@ class DumbBTree: # __internal stuff that needs to be here -scott # class HyperDatabase(pipermail.Database): + __super_addArticle = pipermail.Database.addArticle + def __init__(self, basedir): self.__cache = {} self.__currentOpenArchive = None # The currently open indices @@ -194,89 +203,73 @@ class HyperDatabase(pipermail.Database): self.changed={} def firstdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.first() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.first() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def lastdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.last() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.last() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. 
- - def addArticle(self, archive, article, subjectkey, authorkey): + def addArticle(self, archive, article, subject=None, author=None, + date=None): self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) - # Add the new article - self.dateIndex[article.date]=article.msgid - self.authorIndex[authorkey]=article.msgid - self.subjectIndex[subjectkey]=article.msgid - # Set the 'body' attribute to empty, to avoid storing the whole message - temp = article.body ; article.body=[] - self.articleIndex[article.msgid]=pickle.dumps(article) - article.body=temp - self.changed[archive,article.msgid]=None - - parentID=article.parentID - if parentID!=None and self.articleIndex.has_key(parentID): - parent=self.getArticle(archive, parentID) - myThreadKey=parent.threadKey+article.date+'-' - else: myThreadKey = article.date+'-' - article.threadKey=myThreadKey - self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) - - # Open the BSDDB files that are being used as indices - # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive==archive: return + if self.__currentOpenArchive == archive: + return self.__closeIndices() - arcdir=os.path.join(self.basedir, 'database') - try: mkdir(arcdir, mode=02770) - except os.error: pass - for i in ['date', 'author', 'subject', 'article', 'thread']: - t=DumbBTree(os.path.join(arcdir, archive+'-'+i)) - setattr(self, i+'Index', t) - self.__currentOpenArchive=archive + arcdir = os.path.join(self.basedir, 'database') + try: + mkdir(arcdir, mode=02770) + except os.error: + pass + for i in ('date', 'author', 'subject', 'article', 'thread'): + t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) + setattr(self, i + 'Index', t) + self.__currentOpenArchive = archive - # Close the BSDDB files that are being used as indices (if they're - # open--this is safe to call if they're already closed) def __closeIndices(self): - if 
self.__currentOpenArchive!=None: - pass - for i in ['date', 'author', 'subject', 'thread', 'article']: - attr=i+'Index' + for i in ('date', 'author', 'subject', 'thread', 'article'): + attr = i + 'Index' if hasattr(self, attr): - index=getattr(self, attr) - if i=='article': + index = getattr(self, attr) + if i == 'article': if not hasattr(self, 'archive_length'): - self.archive_length={} - self.archive_length[self.__currentOpenArchive]=len(index) + self.archive_length = {} + l = len(index) + self.archive_length[self.__currentOpenArchive] = l index.close() - delattr(self,attr) - self.__currentOpenArchive=None + delattr(self, attr) + self.__currentOpenArchive = None + def close(self): self.__closeIndices() + def hasArticle(self, archive, msgid): self.__openIndices(archive) return self.articleIndex.has_key(msgid) + def setThreadKey(self, archive, key, msgid): self.__openIndices(archive) self.threadIndex[key]=msgid + def getArticle(self, archive, msgid): self.__openIndices(archive) if not self.__cache.has_key(msgid): @@ -289,18 +282,21 @@ class HyperDatabase(pipermail.Database): def first(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.first() return msgid - except KeyError: return None + except KeyError: + return None + def next(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.next() return msgid - except KeyError: return None + except KeyError: + return None def getOldestArticle(self, archive, subject): self.__openIndices(archive) @@ -314,7 +310,9 @@ class HyperDatabase(pipermail.Database): except KeyError: return None - def newArchive(self, archive): pass + def newArchive(self, archive): + pass + def clearIndex(self, archive, index): self.__openIndices(archive) ## index=getattr(self, index+'Index') diff --git a/Mailman/Archiver/pipermail.py 
b/Mailman/Archiver/pipermail.py index 71fb323d2..0cfe93ddd 100644 --- a/Mailman/Archiver/pipermail.py +++ b/Mailman/Archiver/pipermail.py @@ -58,12 +58,13 @@ def fixAuthor(author): # Abstract class for databases -class Database: +class DatabaseInterface: def __init__(self): pass def close(self): pass def getArticle(self, archive, msgid): pass def hasArticle(self, archive, msgid): pass - def addArticle(self, archive, article, subjectkey, authorkey): pass + def addArticle(self, archive, article, subject=None, author=None, + date=None): pass def firstdate(self, archive): pass def lastdate(self, archive): pass def first(self, archive, index): pass @@ -73,6 +74,60 @@ class Database: def setThreadKey(self, archive, key, msgid): pass def getOldestArticle(self, subject): pass +class Database(DatabaseInterface): + """Define the basic sorting logic for a database + + Assumes that the database internally uses dateIndex, authorIndex, + etc. + """ + + # TBD Factor out more of the logic shared between BSDDBDatabase + # and HyperDatabase and place it in this class. + + def __init__(self): + # This method need not be called by subclasses that do their + # own initialization. 
+ self.dateIndex = {} + self.authorIndex = {} + self.subjectIndex = {} + self.articleIndex = {} + self.changed = {} + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + # create the keys; always end w/ msgid which will be unique + authorkey = (author or article.author, article.date, + article.msgid) + subjectkey = (subject or article.subject, article.date, + article.msgid) + datekey = date or article.date, article.msgid + + # Add the new article + self.dateIndex[datekey] = article.msgid + self.authorIndex[authorkey] = article.msgid + self.subjectIndex[subjectkey] = article.msgid + + self.store_article(article) + self.changed[archive, article.msgid] = None + + parentID = article.parentID + if parentID is not None and self.articleIndex.has_key(parentID): + parent = self.getArticle(archive, parentID) + myThreadKey = parent.threadKey + article.date + '-' + else: + myThreadKey = article.date + '-' + article.threadKey = myThreadKey + key = myThreadKey, article.msgid + self.setThreadKey(archive, key, article.msgid) + + def store_article(self, article): + """Store article without message body to save space""" + # TBD this is not thread safe! + temp = article.body + article.body = [] + self.articleIndex[article.msgid] = pickle.dumps(article) + article.body = temp + # The Article class encapsulates a single posting. The attributes # are: # @@ -330,7 +385,7 @@ class T: article.parentID) article.threadKey = parent.threadKey+article.date+'-' self.database.setThreadKey(self.archive, - article.threadKey + '\000' + article.msgid, + (article.threadKey, article.msgid), msgid) msgid = self.database.next(self.archive, 'date') @@ -460,80 +515,84 @@ class T: self.sequence = self.sequence + 1 self.add_article(a) - # Archive an Article object. 
+ def new_archive(self, archive, archivedir): + self.archives.append(archive) + self.update_TOC = 1 + self.database.newArchive(archive) + # If the archive directory doesn't exist, create it + try: + os.stat(archivedir) + except os.error, errdata: + errno, errmsg = errdata + if errno == 2: + mkdir(archivedir, self.DIRMODE) + else: + raise os.error, errdata + self.open_new_archive(archive, archivedir) + def add_article(self, article): - # Determine into what archives the article should be placed archives = self.get_archives(article) if not archives: return if type(archives) == type(''): archives = [archives] - # Add the article to each archive in turn article.filename = filename = self.get_filename(article) - temp = self.format_article(article) # Reformat the article - fmt = "Processing article #%s into archives %s" - self.message(fmt % (article.sequence, archives)) - for i in archives: - self.archive = i - archivedir = os.path.join(self.basedir, i) - # If it's a new archive, create it - if i not in self.archives: - self.archives.append(i) - self.update_TOC = 1 - self.database.newArchive(i) - # If the archive directory doesn't exist, create it - try: - os.stat(archivedir) - except os.error, errdata: - errno, errmsg = errdata - if errno == 2: - mkdir(archivedir, self.DIRMODE) - else: - raise os.error, errdata - self.open_new_archive(i, archivedir) + temp = self.format_article(article) + fmt = "Processing article #%s into archives %s: %s" + self.message(fmt % (article.sequence, archives, article.subject)) + for arch in archives: + self.archive = arch # why do this??? 
+ archivedir = os.path.join(self.basedir, arch) + if arch not in self.archives: + self.new_archive(arch, archivedir) # Write the HTML-ized article - self.write_article(i, temp, os.path.join(archivedir, - filename)) + self.write_article(arch, temp, os.path.join(archivedir, + filename)) - authorkey = fixAuthor(article.author) + '\000' + article.date - subjectkey = string.lower(article.subject ) +'\000' + article.date - - # Update parenting info - parentID = None - if article.in_reply_to: - parentID = article.in_reply_to - elif article.references: - refs = self._remove_external_references(article.references) - if refs: - maxdate = self.database.getArticle(self.archive, - refs[0]) - for ref in refs[1:]: - a = self.database.getArticle(self.archive, ref) - if a.date > maxdate.date: - maxdate = a - parentID = maxdate.msgid - else: - # Get the oldest article with a matching subject, and - # assume this is a follow-up to that article - parentID = self.database.getOldestArticle(self.archive, - article.subject) + author = fixAuthor(article.author) + subject = string.lower(article.subject) - if parentID is not None \ - and not self.database.hasArticle(self.archive, parentID): - parentID = None - article.parentID = parentID - if parentID is not None: - parent = self.database.getArticle(self.archive, parentID) - article.threadKey = parent.threadKey + article.date + '-' - else: + article.parentID = parentID = self.get_parent_info(arch, article) + if parentID: + parent = self.database.getArticle(arch, parentID) + article.threadKey = parent.threadKey + article.date + '-' + else: article.threadKey = article.date + '-' - key = article.threadKey + '\000' + article.msgid - self.database.setThreadKey(self.archive, key, article.msgid) - self.database.addArticle(i, temp, subjectkey, authorkey) - if i not in self._dirty_archives: - self._dirty_archives.append(i) + key = article.threadKey, article.msgid + + self.database.setThreadKey(arch, key, article.msgid) + 
self.database.addArticle(arch, temp, author=author, + subject=subject) + + if arch not in self._dirty_archives: + self._dirty_archives.append(arch) + + def get_parent_info(self, archive, article): + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: + refs = self._remove_external_references(article.references) + if refs: + maxdate = self.database.getArticle(archive, refs[0]) + for ref in refs[1:]: + a = self.database.getArticle(archive, ref) + if a.date > maxdate.date: + maxdate = a + parentID = maxdate.msgid + else: + # Get the oldest article with a matching subject, and + # assume this is a follow-up to that article + parentID = self.database.getOldestArticle(archive, + article.subject) + + if parentID and not self.database.hasArticle(archive, parentID): + parentID = None + return parentID + + def write_article(self, index, article, path): f = open(path, 'w') @@ -588,13 +647,16 @@ class T: class BSDDBdatabase(Database): + __super_addArticle = Database.addArticle + def __init__(self, basedir): self.__cachekeys = [] self.__cachedict = {} self.__currentOpenArchive = None # The currently open indices self.basedir = os.path.expanduser(basedir) self.changed = {} # Recently added articles, indexed only by - # message ID + # message ID + def firstdate(self, archive): self.__openIndices(archive) date = 'None' @@ -604,6 +666,7 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def lastdate(self, archive): self.__openIndices(archive) date = 'None' @@ -613,41 +676,21 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. 
- - def addArticle(self, archive, article, subjectkey, authorkey): - self.__openIndices(archive) - - # Add the new article - self.dateIndex[article.date] = article.msgid - self.authorIndex[authorkey] = article.msgid - self.subjectIndex[subjectkey] = article.msgid - # Set the 'body' attribute to empty, to avoid storing the - # whole message - temp = article.body - article.body = [] - self.articleIndex[article.msgid] = pickle.dumps(article) - article.body = temp - self.changed[archive,article.msgid] = None - - parentID = article.parentID - if parentID is not None and self.articleIndex.has_key(parentID): - parent = self.getArticle(archive, parentID) - myThreadKey = parent.threadKey+article.date + '-' - else: - myThreadKey = article.date + '-' - article.threadKey = myThreadKey - key = myThreadKey + '\000' + article.msgid - self.setThreadKey(archive, key, article.msgid) + def addArticle(self, archive, article, subject=None, author=None, + date=None): + self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) # Open the BSDDB files that are being used as indices # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive == archive: return + if self.__currentOpenArchive == archive: + return import bsddb self.__closeIndices() @@ -676,6 +719,7 @@ class BSDDBdatabase(Database): index.close() delattr(self,attr) self.__currentOpenArchive = None + def close(self): self.__closeIndices() def hasArticle(self, archive, msgid): |
