diff options
| author | jhylton | 2000-09-22 18:23:37 +0000 |
|---|---|---|
| committer | jhylton | 2000-09-22 18:23:37 +0000 |
| commit | ad4ba52f694b2f026be863ab869b8d7ca83ab03c (patch) | |
| tree | 81c03da4aaa50d7c20b3f84609889dbc5dec66ed | |
| parent | bf6d28c3c757820f428772472379cc8848948a85 (diff) | |
| download | mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.gz mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.zst mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.zip | |
| -rw-r--r-- | Mailman/Archiver/HyperDatabase.py | 134 | ||||
| -rw-r--r-- | Mailman/Archiver/pipermail.py | 228 |
2 files changed, 202 insertions, 160 deletions
diff --git a/Mailman/Archiver/HyperDatabase.py b/Mailman/Archiver/HyperDatabase.py index a5779a7c7..c49b487be 100644 --- a/Mailman/Archiver/HyperDatabase.py +++ b/Mailman/Archiver/HyperDatabase.py @@ -20,6 +20,8 @@ import os import marshal import string +import sys +import time import errno # @@ -39,7 +41,6 @@ try: except ImportError: import pickle - # # we're using a python dict in place of # of bsddb.btree database. only defining @@ -47,12 +48,17 @@ except ImportError: # only one thing can access this at a time. # class DumbBTree: - # XXX This dictionary-like object stores pickles of all the - # Article objects. The object itself is stored using marshal. It - # would be much simpler, and probably faster, to store the actual - # objects in the DumbBTree and pickle it. - # XXX Also needs a more sensible name, like IteratableDictionary - # or SortedDictionary. + """Stores pickles of Article objects + + This dictionary-like object stores pickles of all the Article + objects. The object itself is stored using marshal. It would be + much simpler, and probably faster, to store the actual objects in + the DumbBTree and pickle it. + + TBD: Also needs a more sensible name, like IteratableDictionary or + SortedDictionary. + """ + def __init__(self, path): self.current_index = 0 self.path = path @@ -75,6 +81,9 @@ class DumbBTree: else: self.__sort(dirty=1) + def __repr__(self): + return "DumbBTree(%s)" % self.path + def __sort(self, dirty=None): if self.__dirty == 1 or dirty: self.sorted = self.dict.keys() @@ -123,9 +132,8 @@ class DumbBTree: raise KeyError else: key = self.sorted[0] - res = key, self.dict[key] self.current_index = 1 - return res + return key, self.dict[key] def last(self): if not self.sorted: @@ -178,7 +186,6 @@ class DumbBTree: fp.close() self.unlock() - # this is lifted straight out of pipermail with # the bsddb.btree replaced with above class. @@ -186,6 +193,8 @@ class DumbBTree: # __internal stuff that needs to be here -scott # class HyperDatabase(pipermail.Database): + __super_addArticle = pipermail.Database.addArticle + def __init__(self, basedir): self.__cache = {} self.__currentOpenArchive = None # The currently open indices @@ -194,89 +203,73 @@ class HyperDatabase(pipermail.Database): self.changed={} def firstdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.first() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.first() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def lastdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.last() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.last() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. - - def addArticle(self, archive, article, subjectkey, authorkey): + def addArticle(self, archive, article, subject=None, author=None, + date=None): self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) - # Add the new article - self.dateIndex[article.date]=article.msgid - self.authorIndex[authorkey]=article.msgid - self.subjectIndex[subjectkey]=article.msgid - # Set the 'body' attribute to empty, to avoid storing the whole message - temp = article.body ; article.body=[] - self.articleIndex[article.msgid]=pickle.dumps(article) - article.body=temp - self.changed[archive,article.msgid]=None - - parentID=article.parentID - if parentID!=None and self.articleIndex.has_key(parentID): - parent=self.getArticle(archive, parentID) - myThreadKey=parent.threadKey+article.date+'-' - else: myThreadKey = article.date+'-' - article.threadKey=myThreadKey - self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) - - # Open the BSDDB files that are being used as indices - # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive==archive: return + if self.__currentOpenArchive == archive: + return self.__closeIndices() - arcdir=os.path.join(self.basedir, 'database') - try: mkdir(arcdir, mode=02770) - except os.error: pass - for i in ['date', 'author', 'subject', 'article', 'thread']: - t=DumbBTree(os.path.join(arcdir, archive+'-'+i)) - setattr(self, i+'Index', t) - self.__currentOpenArchive=archive + arcdir = os.path.join(self.basedir, 'database') + try: + mkdir(arcdir, mode=02770) + except os.error: + pass + for i in ('date', 'author', 'subject', 'article', 'thread'): + t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) + setattr(self, i + 'Index', t) + self.__currentOpenArchive = archive - # Close the BSDDB files that are being used as indices (if they're - # open--this is safe to call if they're already closed) def __closeIndices(self): - if self.__currentOpenArchive!=None: - pass - for i in ['date', 'author', 'subject', 'thread', 'article']: - attr=i+'Index' + for i in ('date', 'author', 'subject', 'thread', 'article'): + attr = i + 'Index' if hasattr(self, attr): - index=getattr(self, attr) - if i=='article': + index = getattr(self, attr) + if i == 'article': if not hasattr(self, 'archive_length'): - self.archive_length={} - self.archive_length[self.__currentOpenArchive]=len(index) + self.archive_length = {} + l = len(index) + self.archive_length[self.__currentOpenArchive] = l index.close() - delattr(self,attr) - self.__currentOpenArchive=None + delattr(self, attr) + self.__currentOpenArchive = None + def close(self): self.__closeIndices() + def hasArticle(self, archive, msgid): self.__openIndices(archive) return self.articleIndex.has_key(msgid) + def setThreadKey(self, archive, key, msgid): self.__openIndices(archive) self.threadIndex[key]=msgid + def getArticle(self, archive, msgid): self.__openIndices(archive) if not self.__cache.has_key(msgid): @@ -289,18 +282,21 @@ class HyperDatabase(pipermail.Database): def first(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.first() return msgid - except KeyError: return None + except KeyError: + return None + def next(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.next() return msgid - except KeyError: return None + except KeyError: + return None def getOldestArticle(self, archive, subject): self.__openIndices(archive) @@ -314,7 +310,9 @@ class HyperDatabase(pipermail.Database): except KeyError: return None - def newArchive(self, archive): pass + def newArchive(self, archive): + pass + def clearIndex(self, archive, index): self.__openIndices(archive) ## index=getattr(self, index+'Index') diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py index 71fb323d2..0cfe93ddd 100644 --- a/Mailman/Archiver/pipermail.py +++ b/Mailman/Archiver/pipermail.py @@ -58,12 +58,13 @@ def fixAuthor(author): # Abstract class for databases -class Database: +class DatabaseInterface: def __init__(self): pass def close(self): pass def getArticle(self, archive, msgid): pass def hasArticle(self, archive, msgid): pass - def addArticle(self, archive, article, subjectkey, authorkey): pass + def addArticle(self, archive, article, subject=None, author=None, + date=None): pass def firstdate(self, archive): pass def lastdate(self, archive): pass def first(self, archive, index): pass @@ -73,6 +74,60 @@ class Database: def setThreadKey(self, archive, key, msgid): pass def getOldestArticle(self, subject): pass +class Database(DatabaseInterface): + """Define the basic sorting logic for a database + + Assumes that the database internally uses dateIndex, authorIndex, + etc. + """ + + # TBD Factor out more of the logic shared between BSDDBDatabase + # and HyperDatabase and place it in this class. + + def __init__(self): + # This method need not be called by subclasses that do their + # own initialization. + self.dateIndex = {} + self.authorIndex = {} + self.subjectIndex = {} + self.articleIndex = {} + self.changed = {} + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + # create the keys; always end w/ msgid which will be unique + authorkey = (author or article.author, article.date, + article.msgid) + subjectkey = (subject or article.subject, article.date, + article.msgid) + datekey = date or article.date, article.msgid + + # Add the new article + self.dateIndex[datekey] = article.msgid + self.authorIndex[authorkey] = article.msgid + self.subjectIndex[subjectkey] = article.msgid + + self.store_article(article) + self.changed[archive, article.msgid] = None + + parentID = article.parentID + if parentID is not None and self.articleIndex.has_key(parentID): + parent = self.getArticle(archive, parentID) + myThreadKey = parent.threadKey + article.date + '-' + else: + myThreadKey = article.date + '-' + article.threadKey = myThreadKey + key = myThreadKey, article.msgid + self.setThreadKey(archive, key, article.msgid) + + def store_article(self, article): + """Store article without message body to save space""" + # TBD this is not thread safe! + temp = article.body + article.body = [] + self.articleIndex[article.msgid] = pickle.dumps(article) + article.body = temp + # The Article class encapsulates a single posting. The attributes # are: # @@ -330,7 +385,7 @@ class T: article.parentID) article.threadKey = parent.threadKey+article.date+'-' self.database.setThreadKey(self.archive, - article.threadKey + '\000' + article.msgid, + (article.threadKey, article.msgid), msgid) msgid = self.database.next(self.archive, 'date') @@ -460,80 +515,84 @@ class T: self.sequence = self.sequence + 1 self.add_article(a) - # Archive an Article object. + def new_archive(self, archive, archivedir): + self.archives.append(archive) + self.update_TOC = 1 + self.database.newArchive(archive) + # If the archive directory doesn't exist, create it + try: + os.stat(archivedir) + except os.error, errdata: + errno, errmsg = errdata + if errno == 2: + mkdir(archivedir, self.DIRMODE) + else: + raise os.error, errdata + self.open_new_archive(archive, archivedir) + def add_article(self, article): - # Determine into what archives the article should be placed archives = self.get_archives(article) if not archives: return if type(archives) == type(''): archives = [archives] - # Add the article to each archive in turn article.filename = filename = self.get_filename(article) - temp = self.format_article(article) # Reformat the article - fmt = "Processing article #%s into archives %s" - self.message(fmt % (article.sequence, archives)) - for i in archives: - self.archive = i - archivedir = os.path.join(self.basedir, i) - # If it's a new archive, create it - if i not in self.archives: - self.archives.append(i) - self.update_TOC = 1 - self.database.newArchive(i) - # If the archive directory doesn't exist, create it - try: - os.stat(archivedir) - except os.error, errdata: - errno, errmsg = errdata - if errno == 2: - mkdir(archivedir, self.DIRMODE) - else: - raise os.error, errdata - self.open_new_archive(i, archivedir) + temp = self.format_article(article) + fmt = "Processing article #%s into archives %s: %s" + self.message(fmt % (article.sequence, archives, article.subject)) + for arch in archives: + self.archive = arch # why do this??? + archivedir = os.path.join(self.basedir, arch) + if arch not in self.archives: + self.new_archive(arch, archivedir) # Write the HTML-ized article - self.write_article(i, temp, os.path.join(archivedir, - filename)) + self.write_article(arch, temp, os.path.join(archivedir, + filename)) - authorkey = fixAuthor(article.author) + '\000' + article.date - subjectkey = string.lower(article.subject ) +'\000' + article.date - - # Update parenting info - parentID = None - if article.in_reply_to: - parentID = article.in_reply_to - elif article.references: - refs = self._remove_external_references(article.references) - if refs: - maxdate = self.database.getArticle(self.archive, - refs[0]) - for ref in refs[1:]: - a = self.database.getArticle(self.archive, ref) - if a.date > maxdate.date: - maxdate = a - parentID = maxdate.msgid - else: - # Get the oldest article with a matching subject, and - # assume this is a follow-up to that article - parentID = self.database.getOldestArticle(self.archive, - article.subject) + author = fixAuthor(article.author) + subject = string.lower(article.subject) - if parentID is not None \ - and not self.database.hasArticle(self.archive, parentID): - parentID = None - article.parentID = parentID - if parentID is not None: - parent = self.database.getArticle(self.archive, parentID) - article.threadKey = parent.threadKey + article.date + '-' - else: + article.parentID = parentID = self.get_parent_info(arch, article) + if parentID: + parent = self.database.getArticle(arch, parentID) + article.threadKey = parent.threadKey + article.date + '-' + else: article.threadKey = article.date + '-' - key = article.threadKey + '\000' + article.msgid - self.database.setThreadKey(self.archive, key, article.msgid) - self.database.addArticle(i, temp, subjectkey, authorkey) - if i not in self._dirty_archives: - self._dirty_archives.append(i) + key = article.threadKey, article.msgid + + self.database.setThreadKey(arch, key, article.msgid) + self.database.addArticle(arch, temp, author=author, + subject=subject) + + if arch not in self._dirty_archives: + self._dirty_archives.append(arch) + + def get_parent_info(self, archive, article): + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: + refs = self._remove_external_references(article.references) + if refs: + maxdate = self.database.getArticle(archive, refs[0]) + for ref in refs[1:]: + a = self.database.getArticle(archive, ref) + if a.date > maxdate.date: + maxdate = a + parentID = maxdate.msgid + else: + # Get the oldest article with a matching subject, and + # assume this is a follow-up to that article + parentID = self.database.getOldestArticle(archive, + article.subject) + + if parentID and not self.database.hasArticle(archive, parentID): + parentID = None + return parentID + + def write_article(self, index, article, path): f = open(path, 'w') @@ -588,13 +647,16 @@ class T: class BSDDBdatabase(Database): + __super_addArticle = Database.addArticle + def __init__(self, basedir): self.__cachekeys = [] self.__cachedict = {} self.__currentOpenArchive = None # The currently open indices self.basedir = os.path.expanduser(basedir) self.changed = {} # Recently added articles, indexed only by - # message ID + # message ID + def firstdate(self, archive): self.__openIndices(archive) date = 'None' @@ -604,6 +666,7 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def lastdate(self, archive): self.__openIndices(archive) date = 'None' @@ -613,41 +676,21 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. - - def addArticle(self, archive, article, subjectkey, authorkey): - self.__openIndices(archive) - - # Add the new article - self.dateIndex[article.date] = article.msgid - self.authorIndex[authorkey] = article.msgid - self.subjectIndex[subjectkey] = article.msgid - # Set the 'body' attribute to empty, to avoid storing the - # whole message - temp = article.body - article.body = [] - self.articleIndex[article.msgid] = pickle.dumps(article) - article.body = temp - self.changed[archive,article.msgid] = None - - parentID = article.parentID - if parentID is not None and self.articleIndex.has_key(parentID): - parent = self.getArticle(archive, parentID) - myThreadKey = parent.threadKey+article.date + '-' - else: - myThreadKey = article.date + '-' - article.threadKey = myThreadKey - key = myThreadKey + '\000' + article.msgid - self.setThreadKey(archive, key, article.msgid) + def addArticle(self, archive, article, subject=None, author=None, + date=None): + self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) # Open the BSDDB files that are being used as indices # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive == archive: return + if self.__currentOpenArchive == archive: + return import bsddb self.__closeIndices() @@ -676,6 +719,7 @@ class BSDDBdatabase(Database): index.close() delattr(self,attr) self.__currentOpenArchive = None + def close(self): self.__closeIndices() def hasArticle(self, archive, msgid): |
