Diffstat (limited to 'Mailman/Archiver/pipermail.py')
 Mailman/Archiver/pipermail.py | 228
 1 file changed, 136 insertions(+), 92 deletions(-)
diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py
index 71fb323d2..0cfe93ddd 100644
--- a/Mailman/Archiver/pipermail.py
+++ b/Mailman/Archiver/pipermail.py
@@ -58,12 +58,13 @@ def fixAuthor(author):
# Abstract class for databases
-class Database:
+class DatabaseInterface:
def __init__(self): pass
def close(self): pass
def getArticle(self, archive, msgid): pass
def hasArticle(self, archive, msgid): pass
- def addArticle(self, archive, article, subjectkey, authorkey): pass
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None): pass
def firstdate(self, archive): pass
def lastdate(self, archive): pass
def first(self, archive, index): pass
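
A note on the interface change above: addArticle now takes the raw subject, author, and date as keyword arguments instead of caller-built sort keys. A minimal sketch of the new calling convention; the db and art names and the '2001-April' archive label are placeholders, not code from this commit:

    # Illustrative only: 'db' is some Database implementation and 'art'
    # an Article instance; neither comes from this commit.
    # Old style -- the caller pre-built the '\000'-joined sort keys:
    #   db.addArticle('2001-April', art, subjectkey, authorkey)
    # New style -- pass the raw values and let Database build the keys:
    db.addArticle('2001-April', art,
                  subject=string.lower(art.subject),
                  author=fixAuthor(art.author))
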
@@ -73,6 +74,60 @@ class Database:
def setThreadKey(self, archive, key, msgid): pass
def getOldestArticle(self, subject): pass
+class Database(DatabaseInterface):
+ """Define the basic sorting logic for a database
+
+ Assumes that the database internally uses dateIndex, authorIndex,
+ etc.
+ """
+
+ # TBD Factor out more of the logic shared between BSDDBDatabase
+ # and HyperDatabase and place it in this class.
+
+ def __init__(self):
+ # This method need not be called by subclasses that do their
+ # own initialization.
+ self.dateIndex = {}
+ self.authorIndex = {}
+ self.subjectIndex = {}
+ self.articleIndex = {}
+ self.changed = {}
+
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None):
+ # create the keys; always end w/ msgid which will be unique
+ authorkey = (author or article.author, article.date,
+ article.msgid)
+ subjectkey = (subject or article.subject, article.date,
+ article.msgid)
+ datekey = date or article.date, article.msgid
+
+ # Add the new article
+ self.dateIndex[datekey] = article.msgid
+ self.authorIndex[authorkey] = article.msgid
+ self.subjectIndex[subjectkey] = article.msgid
+
+ self.store_article(article)
+ self.changed[archive, article.msgid] = None
+
+ parentID = article.parentID
+ if parentID is not None and self.articleIndex.has_key(parentID):
+ parent = self.getArticle(archive, parentID)
+ myThreadKey = parent.threadKey + article.date + '-'
+ else:
+ myThreadKey = article.date + '-'
+ article.threadKey = myThreadKey
+ key = myThreadKey, article.msgid
+ self.setThreadKey(archive, key, article.msgid)
+
+ def store_article(self, article):
+ """Store article without message body to save space"""
+ # TBD this is not thread safe!
+ temp = article.body
+ article.body = []
+ self.articleIndex[article.msgid] = pickle.dumps(article)
+ article.body = temp
+
# The Article class encapsulates a single posting. The attributes
# are:
#
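
For orientation, a minimal sketch of a subclass that relies on the sorting logic just factored into Database; the InMemoryDatabase name and its method bodies are illustrative assumptions, not part of this change:

    # Illustrative only: a toy subclass that inherits addArticle() and
    # store_article() from Database and supplies the lookup side.
    import pickle

    class InMemoryDatabase(Database):
        def __init__(self):
            Database.__init__(self)
            self.threadIndex = {}

        def getArticle(self, archive, msgid):
            # articleIndex holds pickled articles with empty bodies
            return pickle.loads(self.articleIndex[msgid])

        def hasArticle(self, archive, msgid):
            return self.articleIndex.has_key(msgid)

        def setThreadKey(self, archive, key, msgid):
            # key is the (threadKey, msgid) tuple built in addArticle()
            self.threadIndex[key] = msgid
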
@@ -330,7 +385,7 @@ class T:
article.parentID)
article.threadKey = parent.threadKey+article.date+'-'
self.database.setThreadKey(self.archive,
- article.threadKey + '\000' + article.msgid,
+ (article.threadKey, article.msgid),
msgid)
msgid = self.database.next(self.archive, 'date')
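
On the key change in this hunk: a (threadKey, msgid) tuple compares element by element, so it sorts the same way the old '\000'-joined string did while keeping NUL bytes out of the index keys. A small illustrative check with made-up values:

    # Illustrative only: invented thread keys and message ids.
    old_a = '987654321-' + '\000' + '<msg1@example.com>'
    old_b = '987654321-' + '\000' + '<msg2@example.com>'
    new_a = ('987654321-', '<msg1@example.com>')
    new_b = ('987654321-', '<msg2@example.com>')
    # Tuples compare element by element, so the relative order matches.
    assert (old_a < old_b) == (new_a < new_b)
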
@@ -460,80 +515,84 @@ class T:
self.sequence = self.sequence + 1
self.add_article(a)
- # Archive an Article object.
+ def new_archive(self, archive, archivedir):
+ self.archives.append(archive)
+ self.update_TOC = 1
+ self.database.newArchive(archive)
+ # If the archive directory doesn't exist, create it
+ try:
+ os.stat(archivedir)
+ except os.error, errdata:
+ errno, errmsg = errdata
+ if errno == 2:
+ mkdir(archivedir, self.DIRMODE)
+ else:
+ raise os.error, errdata
+ self.open_new_archive(archive, archivedir)
+
def add_article(self, article):
- # Determine into what archives the article should be placed
archives = self.get_archives(article)
if not archives:
return
if type(archives) == type(''):
archives = [archives]
- # Add the article to each archive in turn
article.filename = filename = self.get_filename(article)
- temp = self.format_article(article) # Reformat the article
- fmt = "Processing article #%s into archives %s"
- self.message(fmt % (article.sequence, archives))
- for i in archives:
- self.archive = i
- archivedir = os.path.join(self.basedir, i)
- # If it's a new archive, create it
- if i not in self.archives:
- self.archives.append(i)
- self.update_TOC = 1
- self.database.newArchive(i)
- # If the archive directory doesn't exist, create it
- try:
- os.stat(archivedir)
- except os.error, errdata:
- errno, errmsg = errdata
- if errno == 2:
- mkdir(archivedir, self.DIRMODE)
- else:
- raise os.error, errdata
- self.open_new_archive(i, archivedir)
+ temp = self.format_article(article)
+ fmt = "Processing article #%s into archives %s: %s"
+ self.message(fmt % (article.sequence, archives, article.subject))
+ for arch in archives:
+ self.archive = arch # why do this???
+ archivedir = os.path.join(self.basedir, arch)
+ if arch not in self.archives:
+ self.new_archive(arch, archivedir)
# Write the HTML-ized article
- self.write_article(i, temp, os.path.join(archivedir,
- filename))
+ self.write_article(arch, temp, os.path.join(archivedir,
+ filename))
- authorkey = fixAuthor(article.author) + '\000' + article.date
- subjectkey = string.lower(article.subject ) +'\000' + article.date
-
- # Update parenting info
- parentID = None
- if article.in_reply_to:
- parentID = article.in_reply_to
- elif article.references:
- refs = self._remove_external_references(article.references)
- if refs:
- maxdate = self.database.getArticle(self.archive,
- refs[0])
- for ref in refs[1:]:
- a = self.database.getArticle(self.archive, ref)
- if a.date > maxdate.date:
- maxdate = a
- parentID = maxdate.msgid
- else:
- # Get the oldest article with a matching subject, and
- # assume this is a follow-up to that article
- parentID = self.database.getOldestArticle(self.archive,
- article.subject)
+ author = fixAuthor(article.author)
+ subject = string.lower(article.subject)
- if parentID is not None \
- and not self.database.hasArticle(self.archive, parentID):
- parentID = None
- article.parentID = parentID
- if parentID is not None:
- parent = self.database.getArticle(self.archive, parentID)
- article.threadKey = parent.threadKey + article.date + '-'
- else:
+ article.parentID = parentID = self.get_parent_info(arch, article)
+ if parentID:
+ parent = self.database.getArticle(arch, parentID)
+ article.threadKey = parent.threadKey + article.date + '-'
+ else:
article.threadKey = article.date + '-'
- key = article.threadKey + '\000' + article.msgid
- self.database.setThreadKey(self.archive, key, article.msgid)
- self.database.addArticle(i, temp, subjectkey, authorkey)
- if i not in self._dirty_archives:
- self._dirty_archives.append(i)
+ key = article.threadKey, article.msgid
+
+ self.database.setThreadKey(arch, key, article.msgid)
+ self.database.addArticle(arch, temp, author=author,
+ subject=subject)
+
+ if arch not in self._dirty_archives:
+ self._dirty_archives.append(arch)
+
+ def get_parent_info(self, archive, article):
+ parentID = None
+ if article.in_reply_to:
+ parentID = article.in_reply_to
+ elif article.references:
+ refs = self._remove_external_references(article.references)
+ if refs:
+ maxdate = self.database.getArticle(archive, refs[0])
+ for ref in refs[1:]:
+ a = self.database.getArticle(archive, ref)
+ if a.date > maxdate.date:
+ maxdate = a
+ parentID = maxdate.msgid
+ else:
+ # Get the oldest article with a matching subject, and
+ # assume this is a follow-up to that article
+ parentID = self.database.getOldestArticle(archive,
+ article.subject)
+
+ if parentID and not self.database.hasArticle(archive, parentID):
+ parentID = None
+ return parentID
+
+
def write_article(self, index, article, path):
f = open(path, 'w')
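
A brief usage sketch of the get_parent_info() helper factored out above; the archiver and art names and the '2001-April' archive are placeholders, not values from this commit:

    # Illustrative only: 'archiver' is a T instance and 'art' an Article
    # whose referenced messages are already in the database.
    parentID = archiver.get_parent_info('2001-April', art)
    if parentID:
        parent = archiver.database.getArticle('2001-April', parentID)
        art.threadKey = parent.threadKey + art.date + '-'
    else:
        # no parent found: the article starts a new thread
        art.threadKey = art.date + '-'
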
@@ -588,13 +647,16 @@ class T:
class BSDDBdatabase(Database):
+ __super_addArticle = Database.addArticle
+
def __init__(self, basedir):
self.__cachekeys = []
self.__cachedict = {}
self.__currentOpenArchive = None # The currently open indices
self.basedir = os.path.expanduser(basedir)
self.changed = {} # Recently added articles, indexed only by
- # message ID
+ # message ID
+
def firstdate(self, archive):
self.__openIndices(archive)
date = 'None'
@@ -604,6 +666,7 @@ class BSDDBdatabase(Database):
except KeyError:
pass
return date
+
def lastdate(self, archive):
self.__openIndices(archive)
date = 'None'
@@ -613,41 +676,21 @@ class BSDDBdatabase(Database):
except KeyError:
pass
return date
+
def numArticles(self, archive):
self.__openIndices(archive)
return len(self.dateIndex)
- # Add a single article to the internal indexes for an archive.
-
- def addArticle(self, archive, article, subjectkey, authorkey):
- self.__openIndices(archive)
-
- # Add the new article
- self.dateIndex[article.date] = article.msgid
- self.authorIndex[authorkey] = article.msgid
- self.subjectIndex[subjectkey] = article.msgid
- # Set the 'body' attribute to empty, to avoid storing the
- # whole message
- temp = article.body
- article.body = []
- self.articleIndex[article.msgid] = pickle.dumps(article)
- article.body = temp
- self.changed[archive,article.msgid] = None
-
- parentID = article.parentID
- if parentID is not None and self.articleIndex.has_key(parentID):
- parent = self.getArticle(archive, parentID)
- myThreadKey = parent.threadKey+article.date + '-'
- else:
- myThreadKey = article.date + '-'
- article.threadKey = myThreadKey
- key = myThreadKey + '\000' + article.msgid
- self.setThreadKey(archive, key, article.msgid)
+ def addArticle(self, archive, article, subject=None, author=None,
+ date=None):
+ self.__openIndices(archive)
+ self.__super_addArticle(archive, article, subject, author, date)
# Open the BSDDB files that are being used as indices
# (dateIndex, authorIndex, subjectIndex, articleIndex)
def __openIndices(self, archive):
- if self.__currentOpenArchive == archive: return
+ if self.__currentOpenArchive == archive:
+ return
import bsddb
self.__closeIndices()
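
The __super_addArticle alias above is a small idiom worth noting: the base-class method is bound to a name-mangled class attribute so the override can delegate without repeating the base class name at each call site. A standalone sketch of the same pattern with invented Base/Derived names:

    # Illustrative only: the same delegation idiom in isolation.
    class Base:
        def add(self, item):
            return ['base', item]

    class Derived(Base):
        __super_add = Base.add        # mangled to _Derived__super_add

        def add(self, item):
            # subclass-specific setup would go here, then delegate
            result = self.__super_add(item)
            result.append('derived')
            return result

    assert Derived().add('x') == ['base', 'x', 'derived']
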
@@ -676,6 +719,7 @@ class BSDDBdatabase(Database):
index.close()
delattr(self,attr)
self.__currentOpenArchive = None
+
def close(self):
self.__closeIndices()
def hasArticle(self, archive, msgid):