-rw-r--r--  Mailman/Archiver/HyperDatabase.py  134
-rw-r--r--  Mailman/Archiver/pipermail.py      228
2 files changed, 202 insertions(+), 160 deletions(-)
diff --git a/Mailman/Archiver/HyperDatabase.py b/Mailman/Archiver/HyperDatabase.py
index a5779a7c7..c49b487be 100644
--- a/Mailman/Archiver/HyperDatabase.py
+++ b/Mailman/Archiver/HyperDatabase.py
@@ -20,6 +20,8 @@
import os
import marshal
import string
+import sys
+import time
import errno
#
@@ -39,7 +41,6 @@ try:
except ImportError:
import pickle
-
#
# we're using a python dict in place of
# of bsddb.btree database. only defining
@@ -47,12 +48,17 @@ except ImportError:
# only one thing can access this at a time.
#
class DumbBTree:
- # XXX This dictionary-like object stores pickles of all the
- # Article objects. The object itself is stored using marshal. It
- # would be much simpler, and probably faster, to store the actual
- # objects in the DumbBTree and pickle it.
- # XXX Also needs a more sensible name, like IteratableDictionary
- # or SortedDictionary.
+ """Stores pickles of Article objects
+
+ This dictionary-like object stores pickles of all the Article
+ objects. The object itself is stored using marshal. It would be
+ much simpler, and probably faster, to store the actual objects in
+ the DumbBTree and pickle it.
+
+ TBD: Also needs a more sensible name, like IteratableDictionary or
+ SortedDictionary.
+ """
+
def __init__(self, path):
self.current_index = 0
self.path = path
@@ -75,6 +81,9 @@ class DumbBTree:
else:
self.__sort(dirty=1)
+ def __repr__(self):
+ return "DumbBTree(%s)" % self.path
+
def __sort(self, dirty=None):
if self.__dirty == 1 or dirty:
self.sorted = self.dict.keys()
@@ -123,9 +132,8 @@ class DumbBTree:
raise KeyError
else:
key = self.sorted[0]
- res = key, self.dict[key]
self.current_index = 1
- return res
+ return key, self.dict[key]
def last(self):
if not self.sorted:
@@ -178,7 +186,6 @@ class DumbBTree:
fp.close()
self.unlock()
-
# this is lifted straight out of pipermail with
# the bsddb.btree replaced with above class.
@@ -186,6 +193,8 @@ class DumbBTree:
# __internal stuff that needs to be here -scott
#
class HyperDatabase(pipermail.Database):
+ __super_addArticle = pipermail.Database.addArticle
+
def __init__(self, basedir):
self.__cache = {}
self.__currentOpenArchive = None # The currently open indices
@@ -194,89 +203,73 @@ class HyperDatabase(pipermail.Database):
self.changed={}
def firstdate(self, archive):
- import time
self.__openIndices(archive)
- date='None'
+ date = 'None'
try:
- date, msgid = self.dateIndex.first()
- date=time.asctime(time.localtime(string.atof(date)))
- except KeyError: pass
+ datekey, msgid = self.dateIndex.first()
+ date = time.asctime(time.localtime(string.atof(datekey[0])))
+ except KeyError:
+ pass
return date
def lastdate(self, archive):
- import time
self.__openIndices(archive)
- date='None'
+ date = 'None'
try:
- date, msgid = self.dateIndex.last()
- date=time.asctime(time.localtime(string.atof(date)))
- except KeyError: pass
+ datekey, msgid = self.dateIndex.last()
+ date = time.asctime(time.localtime(string.atof(datekey[0])))
+ except KeyError:
+ pass
return date
def numArticles(self, archive):
self.__openIndices(archive)
return len(self.dateIndex)
- # Add a single article to the internal indexes for an archive.
-
- def addArticle(self, archive, article, subjectkey, authorkey):
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None):
self.__openIndices(archive)
+ self.__super_addArticle(archive, article, subject, author, date)
- # Add the new article
- self.dateIndex[article.date]=article.msgid
- self.authorIndex[authorkey]=article.msgid
- self.subjectIndex[subjectkey]=article.msgid
- # Set the 'body' attribute to empty, to avoid storing the whole message
- temp = article.body ; article.body=[]
- self.articleIndex[article.msgid]=pickle.dumps(article)
- article.body=temp
- self.changed[archive,article.msgid]=None
-
- parentID=article.parentID
- if parentID!=None and self.articleIndex.has_key(parentID):
- parent=self.getArticle(archive, parentID)
- myThreadKey=parent.threadKey+article.date+'-'
- else: myThreadKey = article.date+'-'
- article.threadKey=myThreadKey
- self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid)
-
- # Open the BSDDB files that are being used as indices
- # (dateIndex, authorIndex, subjectIndex, articleIndex)
def __openIndices(self, archive):
- if self.__currentOpenArchive==archive: return
+ if self.__currentOpenArchive == archive:
+ return
self.__closeIndices()
- arcdir=os.path.join(self.basedir, 'database')
- try: mkdir(arcdir, mode=02770)
- except os.error: pass
- for i in ['date', 'author', 'subject', 'article', 'thread']:
- t=DumbBTree(os.path.join(arcdir, archive+'-'+i))
- setattr(self, i+'Index', t)
- self.__currentOpenArchive=archive
+ arcdir = os.path.join(self.basedir, 'database')
+ try:
+ mkdir(arcdir, mode=02770)
+ except os.error:
+ pass
+ for i in ('date', 'author', 'subject', 'article', 'thread'):
+ t = DumbBTree(os.path.join(arcdir, archive + '-' + i))
+ setattr(self, i + 'Index', t)
+ self.__currentOpenArchive = archive
- # Close the BSDDB files that are being used as indices (if they're
- # open--this is safe to call if they're already closed)
def __closeIndices(self):
- if self.__currentOpenArchive!=None:
- pass
- for i in ['date', 'author', 'subject', 'thread', 'article']:
- attr=i+'Index'
+ for i in ('date', 'author', 'subject', 'thread', 'article'):
+ attr = i + 'Index'
if hasattr(self, attr):
- index=getattr(self, attr)
- if i=='article':
+ index = getattr(self, attr)
+ if i == 'article':
if not hasattr(self, 'archive_length'):
- self.archive_length={}
- self.archive_length[self.__currentOpenArchive]=len(index)
+ self.archive_length = {}
+ l = len(index)
+ self.archive_length[self.__currentOpenArchive] = l
index.close()
- delattr(self,attr)
- self.__currentOpenArchive=None
+ delattr(self, attr)
+ self.__currentOpenArchive = None
+
def close(self):
self.__closeIndices()
+
def hasArticle(self, archive, msgid):
self.__openIndices(archive)
return self.articleIndex.has_key(msgid)
+
def setThreadKey(self, archive, key, msgid):
self.__openIndices(archive)
self.threadIndex[key]=msgid
+
def getArticle(self, archive, msgid):
self.__openIndices(archive)
if not self.__cache.has_key(msgid):
@@ -289,18 +282,21 @@ class HyperDatabase(pipermail.Database):
def first(self, archive, index):
self.__openIndices(archive)
- index=getattr(self, index+'Index')
+ index = getattr(self, index + 'Index')
try:
key, msgid = index.first()
return msgid
- except KeyError: return None
+ except KeyError:
+ return None
+
def next(self, archive, index):
self.__openIndices(archive)
- index=getattr(self, index+'Index')
+ index = getattr(self, index + 'Index')
try:
key, msgid = index.next()
return msgid
- except KeyError: return None
+ except KeyError:
+ return None
def getOldestArticle(self, archive, subject):
self.__openIndices(archive)
@@ -314,7 +310,9 @@ class HyperDatabase(pipermail.Database):
except KeyError:
return None
- def newArchive(self, archive): pass
+ def newArchive(self, archive):
+ pass
+
def clearIndex(self, archive, index):
self.__openIndices(archive)
## index=getattr(self, index+'Index')
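
The main behavioral change in both databases is the shape of the index keys: the old code joined author/subject/date with NUL bytes into single strings, while the new Database.addArticle builds tuples that always end with the message-id, and thread keys become (threadKey, msgid) tuples instead of threadKey + '\000' + msgid strings. The sketch below is an illustration only, not part of the patch; FakeArticle and make_keys are hypothetical stand-ins for pipermail's Article class and the key construction done inside Database.addArticle. It shows how the tuple keys sort and why the trailing msgid keeps two same-date, same-subject posts from colliding.

    # Illustration only -- not part of the patch.  FakeArticle and
    # make_keys are hypothetical stand-ins for pipermail's Article and
    # the key construction performed inside Database.addArticle.
    class FakeArticle:
        def __init__(self, author, subject, date, msgid):
            self.author = author
            self.subject = subject
            self.date = date        # seconds since the epoch, as a string
            self.msgid = msgid

    def make_keys(article, subject=None, author=None, date=None):
        # Every key ends with the (unique) message-id, so two posts with
        # the same author, subject and date still get distinct index slots.
        authorkey = (author or article.author, article.date, article.msgid)
        subjectkey = (subject or article.subject, article.date, article.msgid)
        datekey = (date or article.date, article.msgid)
        return datekey, authorkey, subjectkey

    a1 = FakeArticle('alice', 're: crash', '949363200.0', '<1@example.com>')
    a2 = FakeArticle('alice', 're: crash', '949363200.0', '<2@example.com>')
    d1 = make_keys(a1)[0]
    d2 = make_keys(a2)[0]
    # The old scheme keyed the date index on article.date alone, so a2
    # would have overwritten a1; tuple keys stay distinct and still sort
    # by date first, then message-id.
    assert d1 != d2
    assert sorted([d2, d1]) == [d1, d2]

This is also why firstdate() and lastdate() in the hunks above now read datekey[0] (the date element of the tuple) before converting with string.atof(), instead of treating the key itself as a date string.
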
diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py
index 71fb323d2..0cfe93ddd 100644
--- a/Mailman/Archiver/pipermail.py
+++ b/Mailman/Archiver/pipermail.py
@@ -58,12 +58,13 @@ def fixAuthor(author):
# Abstract class for databases
-class Database:
+class DatabaseInterface:
def __init__(self): pass
def close(self): pass
def getArticle(self, archive, msgid): pass
def hasArticle(self, archive, msgid): pass
- def addArticle(self, archive, article, subjectkey, authorkey): pass
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None): pass
def firstdate(self, archive): pass
def lastdate(self, archive): pass
def first(self, archive, index): pass
@@ -73,6 +74,60 @@ class Database:
def setThreadKey(self, archive, key, msgid): pass
def getOldestArticle(self, subject): pass
+class Database(DatabaseInterface):
+ """Define the basic sorting logic for a database
+
+ Assumes that the database internally uses dateIndex, authorIndex,
+ etc.
+ """
+
+ # TBD Factor out more of the logic shared between BSDDBDatabase
+ # and HyperDatabase and place it in this class.
+
+ def __init__(self):
+ # This method need not be called by subclasses that do their
+ # own initialization.
+ self.dateIndex = {}
+ self.authorIndex = {}
+ self.subjectIndex = {}
+ self.articleIndex = {}
+ self.changed = {}
+
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None):
+ # create the keys; always end w/ msgid which will be unique
+ authorkey = (author or article.author, article.date,
+ article.msgid)
+ subjectkey = (subject or article.subject, article.date,
+ article.msgid)
+ datekey = date or article.date, article.msgid
+
+ # Add the new article
+ self.dateIndex[datekey] = article.msgid
+ self.authorIndex[authorkey] = article.msgid
+ self.subjectIndex[subjectkey] = article.msgid
+
+ self.store_article(article)
+ self.changed[archive, article.msgid] = None
+
+ parentID = article.parentID
+ if parentID is not None and self.articleIndex.has_key(parentID):
+ parent = self.getArticle(archive, parentID)
+ myThreadKey = parent.threadKey + article.date + '-'
+ else:
+ myThreadKey = article.date + '-'
+ article.threadKey = myThreadKey
+ key = myThreadKey, article.msgid
+ self.setThreadKey(archive, key, article.msgid)
+
+ def store_article(self, article):
+ """Store article without message body to save space"""
+ # TBD this is not thread safe!
+ temp = article.body
+ article.body = []
+ self.articleIndex[article.msgid] = pickle.dumps(article)
+ article.body = temp
+
# The Article class encapsulates a single posting. The attributes
# are:
#
@@ -330,7 +385,7 @@ class T:
article.parentID)
article.threadKey = parent.threadKey+article.date+'-'
self.database.setThreadKey(self.archive,
- article.threadKey + '\000' + article.msgid,
+ (article.threadKey, article.msgid),
msgid)
msgid = self.database.next(self.archive, 'date')
@@ -460,80 +515,84 @@ class T:
self.sequence = self.sequence + 1
self.add_article(a)
- # Archive an Article object.
+ def new_archive(self, archive, archivedir):
+ self.archives.append(archive)
+ self.update_TOC = 1
+ self.database.newArchive(archive)
+ # If the archive directory doesn't exist, create it
+ try:
+ os.stat(archivedir)
+ except os.error, errdata:
+ errno, errmsg = errdata
+ if errno == 2:
+ mkdir(archivedir, self.DIRMODE)
+ else:
+ raise os.error, errdata
+ self.open_new_archive(archive, archivedir)
+
def add_article(self, article):
- # Determine into what archives the article should be placed
archives = self.get_archives(article)
if not archives:
return
if type(archives) == type(''):
archives = [archives]
- # Add the article to each archive in turn
article.filename = filename = self.get_filename(article)
- temp = self.format_article(article) # Reformat the article
- fmt = "Processing article #%s into archives %s"
- self.message(fmt % (article.sequence, archives))
- for i in archives:
- self.archive = i
- archivedir = os.path.join(self.basedir, i)
- # If it's a new archive, create it
- if i not in self.archives:
- self.archives.append(i)
- self.update_TOC = 1
- self.database.newArchive(i)
- # If the archive directory doesn't exist, create it
- try:
- os.stat(archivedir)
- except os.error, errdata:
- errno, errmsg = errdata
- if errno == 2:
- mkdir(archivedir, self.DIRMODE)
- else:
- raise os.error, errdata
- self.open_new_archive(i, archivedir)
+ temp = self.format_article(article)
+ fmt = "Processing article #%s into archives %s: %s"
+ self.message(fmt % (article.sequence, archives, article.subject))
+ for arch in archives:
+ self.archive = arch # why do this???
+ archivedir = os.path.join(self.basedir, arch)
+ if arch not in self.archives:
+ self.new_archive(arch, archivedir)
# Write the HTML-ized article
- self.write_article(i, temp, os.path.join(archivedir,
- filename))
+ self.write_article(arch, temp, os.path.join(archivedir,
+ filename))
- authorkey = fixAuthor(article.author) + '\000' + article.date
- subjectkey = string.lower(article.subject ) +'\000' + article.date
-
- # Update parenting info
- parentID = None
- if article.in_reply_to:
- parentID = article.in_reply_to
- elif article.references:
- refs = self._remove_external_references(article.references)
- if refs:
- maxdate = self.database.getArticle(self.archive,
- refs[0])
- for ref in refs[1:]:
- a = self.database.getArticle(self.archive, ref)
- if a.date > maxdate.date:
- maxdate = a
- parentID = maxdate.msgid
- else:
- # Get the oldest article with a matching subject, and
- # assume this is a follow-up to that article
- parentID = self.database.getOldestArticle(self.archive,
- article.subject)
+ author = fixAuthor(article.author)
+ subject = string.lower(article.subject)
- if parentID is not None \
- and not self.database.hasArticle(self.archive, parentID):
- parentID = None
- article.parentID = parentID
- if parentID is not None:
- parent = self.database.getArticle(self.archive, parentID)
- article.threadKey = parent.threadKey + article.date + '-'
- else:
+ article.parentID = parentID = self.get_parent_info(arch, article)
+ if parentID:
+ parent = self.database.getArticle(arch, parentID)
+ article.threadKey = parent.threadKey + article.date + '-'
+ else:
article.threadKey = article.date + '-'
- key = article.threadKey + '\000' + article.msgid
- self.database.setThreadKey(self.archive, key, article.msgid)
- self.database.addArticle(i, temp, subjectkey, authorkey)
- if i not in self._dirty_archives:
- self._dirty_archives.append(i)
+ key = article.threadKey, article.msgid
+
+ self.database.setThreadKey(arch, key, article.msgid)
+ self.database.addArticle(arch, temp, author=author,
+ subject=subject)
+
+ if arch not in self._dirty_archives:
+ self._dirty_archives.append(arch)
+
+ def get_parent_info(self, archive, article):
+ parentID = None
+ if article.in_reply_to:
+ parentID = article.in_reply_to
+ elif article.references:
+ refs = self._remove_external_references(article.references)
+ if refs:
+ maxdate = self.database.getArticle(archive, refs[0])
+ for ref in refs[1:]:
+ a = self.database.getArticle(archive, ref)
+ if a.date > maxdate.date:
+ maxdate = a
+ parentID = maxdate.msgid
+ else:
+ # Get the oldest article with a matching subject, and
+ # assume this is a follow-up to that article
+ parentID = self.database.getOldestArticle(archive,
+ article.subject)
+
+ if parentID and not self.database.hasArticle(archive, parentID):
+ parentID = None
+ return parentID
+
+
def write_article(self, index, article, path):
f = open(path, 'w')
@@ -588,13 +647,16 @@ class T:
class BSDDBdatabase(Database):
+ __super_addArticle = Database.addArticle
+
def __init__(self, basedir):
self.__cachekeys = []
self.__cachedict = {}
self.__currentOpenArchive = None # The currently open indices
self.basedir = os.path.expanduser(basedir)
self.changed = {} # Recently added articles, indexed only by
- # message ID
+ # message ID
+
def firstdate(self, archive):
self.__openIndices(archive)
date = 'None'
@@ -604,6 +666,7 @@ class BSDDBdatabase(Database):
except KeyError:
pass
return date
+
def lastdate(self, archive):
self.__openIndices(archive)
date = 'None'
@@ -613,41 +676,21 @@ class BSDDBdatabase(Database):
except KeyError:
pass
return date
+
def numArticles(self, archive):
self.__openIndices(archive)
return len(self.dateIndex)
- # Add a single article to the internal indexes for an archive.
-
- def addArticle(self, archive, article, subjectkey, authorkey):
- self.__openIndices(archive)
-
- # Add the new article
- self.dateIndex[article.date] = article.msgid
- self.authorIndex[authorkey] = article.msgid
- self.subjectIndex[subjectkey] = article.msgid
- # Set the 'body' attribute to empty, to avoid storing the
- # whole message
- temp = article.body
- article.body = []
- self.articleIndex[article.msgid] = pickle.dumps(article)
- article.body = temp
- self.changed[archive,article.msgid] = None
-
- parentID = article.parentID
- if parentID is not None and self.articleIndex.has_key(parentID):
- parent = self.getArticle(archive, parentID)
- myThreadKey = parent.threadKey+article.date + '-'
- else:
- myThreadKey = article.date + '-'
- article.threadKey = myThreadKey
- key = myThreadKey + '\000' + article.msgid
- self.setThreadKey(archive, key, article.msgid)
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None):
+ self.__openIndices(archive)
+ self.__super_addArticle(archive, article, subject, author, date)
# Open the BSDDB files that are being used as indices
# (dateIndex, authorIndex, subjectIndex, articleIndex)
def __openIndices(self, archive):
- if self.__currentOpenArchive == archive: return
+ if self.__currentOpenArchive == archive:
+ return
import bsddb
self.__closeIndices()
@@ -676,6 +719,7 @@ class BSDDBdatabase(Database):
index.close()
delattr(self,attr)
self.__currentOpenArchive = None
+
def close(self):
self.__closeIndices()
def hasArticle(self, archive, msgid):