diff options
| author | jhylton | 2000-09-22 18:23:37 +0000 |
|---|---|---|
| committer | jhylton | 2000-09-22 18:23:37 +0000 |
| commit | ad4ba52f694b2f026be863ab869b8d7ca83ab03c (patch) | |
| tree | 81c03da4aaa50d7c20b3f84609889dbc5dec66ed | |
| parent | bf6d28c3c757820f428772472379cc8848948a85 (diff) | |
| download | mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.gz mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.tar.zst mailman-ad4ba52f694b2f026be863ab869b8d7ca83ab03c.zip | |
Fix index generation bug that occasionally prevented messages from
appearing in index. pipermail generated several indexes by assuming
that date was unique. If two messages arrived with, e.g., the same
author and date, then the author index treated them as identical.
As a result, both messages were archived, but only the last one was
included in the index. Solution is to always include the msgid, which
is unique, in the index key.
Change database keys to combine elements using tuples instead of
string concatenation with \000 as separator.
Fix was accomplished by refactoring on pipermail.Database and its
subclasses. Push index-key generation into common concrete base class
Database; rename abstract base class to DatabaseInterface. Break up
addArticle method into several pieces.
TBD There is still more refactoring to do on Database class.
Because date key has changed, HyperDatabase method to return first and
last date changed to reflect format of date key.
Refactor pipermail.T.add_article into several pieces.
| -rw-r--r-- | Mailman/Archiver/HyperDatabase.py | 134 | ||||
| -rw-r--r-- | Mailman/Archiver/pipermail.py | 228 |
2 files changed, 202 insertions, 160 deletions
diff --git a/Mailman/Archiver/HyperDatabase.py b/Mailman/Archiver/HyperDatabase.py index a5779a7c7..c49b487be 100644 --- a/Mailman/Archiver/HyperDatabase.py +++ b/Mailman/Archiver/HyperDatabase.py @@ -20,6 +20,8 @@ import os import marshal import string +import sys +import time import errno # @@ -39,7 +41,6 @@ try: except ImportError: import pickle - # # we're using a python dict in place of # of bsddb.btree database. only defining @@ -47,12 +48,17 @@ except ImportError: # only one thing can access this at a time. # class DumbBTree: - # XXX This dictionary-like object stores pickles of all the - # Article objects. The object itself is stored using marshal. It - # would be much simpler, and probably faster, to store the actual - # objects in the DumbBTree and pickle it. - # XXX Also needs a more sensible name, like IteratableDictionary - # or SortedDictionary. + """Stores pickles of Article objects + + This dictionary-like object stores pickles of all the Article + objects. The object itself is stored using marshal. It would be + much simpler, and probably faster, to store the actual objects in + the DumbBTree and pickle it. + + TBD: Also needs a more sensible name, like IteratableDictionary or + SortedDictionary. + """ + def __init__(self, path): self.current_index = 0 self.path = path @@ -75,6 +81,9 @@ class DumbBTree: else: self.__sort(dirty=1) + def __repr__(self): + return "DumbBTree(%s)" % self.path + def __sort(self, dirty=None): if self.__dirty == 1 or dirty: self.sorted = self.dict.keys() @@ -123,9 +132,8 @@ class DumbBTree: raise KeyError else: key = self.sorted[0] - res = key, self.dict[key] self.current_index = 1 - return res + return key, self.dict[key] def last(self): if not self.sorted: @@ -178,7 +186,6 @@ class DumbBTree: fp.close() self.unlock() - # this is lifted straight out of pipermail with # the bsddb.btree replaced with above class. 
@@ -186,6 +193,8 @@ class DumbBTree: # __internal stuff that needs to be here -scott # class HyperDatabase(pipermail.Database): + __super_addArticle = pipermail.Database.addArticle + def __init__(self, basedir): self.__cache = {} self.__currentOpenArchive = None # The currently open indices @@ -194,89 +203,73 @@ class HyperDatabase(pipermail.Database): self.changed={} def firstdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.first() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.first() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def lastdate(self, archive): - import time self.__openIndices(archive) - date='None' + date = 'None' try: - date, msgid = self.dateIndex.last() - date=time.asctime(time.localtime(string.atof(date))) - except KeyError: pass + datekey, msgid = self.dateIndex.last() + date = time.asctime(time.localtime(string.atof(datekey[0]))) + except KeyError: + pass return date def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. 
- - def addArticle(self, archive, article, subjectkey, authorkey): + def addArticle(self, archive, article, subject=None, author=None, + date=None): self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) - # Add the new article - self.dateIndex[article.date]=article.msgid - self.authorIndex[authorkey]=article.msgid - self.subjectIndex[subjectkey]=article.msgid - # Set the 'body' attribute to empty, to avoid storing the whole message - temp = article.body ; article.body=[] - self.articleIndex[article.msgid]=pickle.dumps(article) - article.body=temp - self.changed[archive,article.msgid]=None - - parentID=article.parentID - if parentID!=None and self.articleIndex.has_key(parentID): - parent=self.getArticle(archive, parentID) - myThreadKey=parent.threadKey+article.date+'-' - else: myThreadKey = article.date+'-' - article.threadKey=myThreadKey - self.setThreadKey(archive, myThreadKey+'\000'+article.msgid, article.msgid) - - # Open the BSDDB files that are being used as indices - # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive==archive: return + if self.__currentOpenArchive == archive: + return self.__closeIndices() - arcdir=os.path.join(self.basedir, 'database') - try: mkdir(arcdir, mode=02770) - except os.error: pass - for i in ['date', 'author', 'subject', 'article', 'thread']: - t=DumbBTree(os.path.join(arcdir, archive+'-'+i)) - setattr(self, i+'Index', t) - self.__currentOpenArchive=archive + arcdir = os.path.join(self.basedir, 'database') + try: + mkdir(arcdir, mode=02770) + except os.error: + pass + for i in ('date', 'author', 'subject', 'article', 'thread'): + t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) + setattr(self, i + 'Index', t) + self.__currentOpenArchive = archive - # Close the BSDDB files that are being used as indices (if they're - # open--this is safe to call if they're already closed) def __closeIndices(self): - if 
self.__currentOpenArchive!=None: - pass - for i in ['date', 'author', 'subject', 'thread', 'article']: - attr=i+'Index' + for i in ('date', 'author', 'subject', 'thread', 'article'): + attr = i + 'Index' if hasattr(self, attr): - index=getattr(self, attr) - if i=='article': + index = getattr(self, attr) + if i == 'article': if not hasattr(self, 'archive_length'): - self.archive_length={} - self.archive_length[self.__currentOpenArchive]=len(index) + self.archive_length = {} + l = len(index) + self.archive_length[self.__currentOpenArchive] = l index.close() - delattr(self,attr) - self.__currentOpenArchive=None + delattr(self, attr) + self.__currentOpenArchive = None + def close(self): self.__closeIndices() + def hasArticle(self, archive, msgid): self.__openIndices(archive) return self.articleIndex.has_key(msgid) + def setThreadKey(self, archive, key, msgid): self.__openIndices(archive) self.threadIndex[key]=msgid + def getArticle(self, archive, msgid): self.__openIndices(archive) if not self.__cache.has_key(msgid): @@ -289,18 +282,21 @@ class HyperDatabase(pipermail.Database): def first(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.first() return msgid - except KeyError: return None + except KeyError: + return None + def next(self, archive, index): self.__openIndices(archive) - index=getattr(self, index+'Index') + index = getattr(self, index + 'Index') try: key, msgid = index.next() return msgid - except KeyError: return None + except KeyError: + return None def getOldestArticle(self, archive, subject): self.__openIndices(archive) @@ -314,7 +310,9 @@ class HyperDatabase(pipermail.Database): except KeyError: return None - def newArchive(self, archive): pass + def newArchive(self, archive): + pass + def clearIndex(self, archive, index): self.__openIndices(archive) ## index=getattr(self, index+'Index') diff --git a/Mailman/Archiver/pipermail.py 
b/Mailman/Archiver/pipermail.py index 71fb323d2..0cfe93ddd 100644 --- a/Mailman/Archiver/pipermail.py +++ b/Mailman/Archiver/pipermail.py @@ -58,12 +58,13 @@ def fixAuthor(author): # Abstract class for databases -class Database: +class DatabaseInterface: def __init__(self): pass def close(self): pass def getArticle(self, archive, msgid): pass def hasArticle(self, archive, msgid): pass - def addArticle(self, archive, article, subjectkey, authorkey): pass + def addArticle(self, archive, article, subject=None, author=None, + date=None): pass def firstdate(self, archive): pass def lastdate(self, archive): pass def first(self, archive, index): pass @@ -73,6 +74,60 @@ class Database: def setThreadKey(self, archive, key, msgid): pass def getOldestArticle(self, subject): pass +class Database(DatabaseInterface): + """Define the basic sorting logic for a database + + Assumes that the database internally uses dateIndex, authorIndex, + etc. + """ + + # TBD Factor out more of the logic shared between BSDDBDatabase + # and HyperDatabase and place it in this class. + + def __init__(self): + # This method need not be called by subclasses that do their + # own initialization. 
+ self.dateIndex = {} + self.authorIndex = {} + self.subjectIndex = {} + self.articleIndex = {} + self.changed = {} + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + # create the keys; always end w/ msgid which will be unique + authorkey = (author or article.author, article.date, + article.msgid) + subjectkey = (subject or article.subject, article.date, + article.msgid) + datekey = date or article.date, article.msgid + + # Add the new article + self.dateIndex[datekey] = article.msgid + self.authorIndex[authorkey] = article.msgid + self.subjectIndex[subjectkey] = article.msgid + + self.store_article(article) + self.changed[archive, article.msgid] = None + + parentID = article.parentID + if parentID is not None and self.articleIndex.has_key(parentID): + parent = self.getArticle(archive, parentID) + myThreadKey = parent.threadKey + article.date + '-' + else: + myThreadKey = article.date + '-' + article.threadKey = myThreadKey + key = myThreadKey, article.msgid + self.setThreadKey(archive, key, article.msgid) + + def store_article(self, article): + """Store article without message body to save space""" + # TBD this is not thread safe! + temp = article.body + article.body = [] + self.articleIndex[article.msgid] = pickle.dumps(article) + article.body = temp + # The Article class encapsulates a single posting. The attributes # are: # @@ -330,7 +385,7 @@ class T: article.parentID) article.threadKey = parent.threadKey+article.date+'-' self.database.setThreadKey(self.archive, - article.threadKey + '\000' + article.msgid, + (article.threadKey, article.msgid), msgid) msgid = self.database.next(self.archive, 'date') @@ -460,80 +515,84 @@ class T: self.sequence = self.sequence + 1 self.add_article(a) - # Archive an Article object. 
+ def new_archive(self, archive, archivedir): + self.archives.append(archive) + self.update_TOC = 1 + self.database.newArchive(archive) + # If the archive directory doesn't exist, create it + try: + os.stat(archivedir) + except os.error, errdata: + errno, errmsg = errdata + if errno == 2: + mkdir(archivedir, self.DIRMODE) + else: + raise os.error, errdata + self.open_new_archive(archive, archivedir) + def add_article(self, article): - # Determine into what archives the article should be placed archives = self.get_archives(article) if not archives: return if type(archives) == type(''): archives = [archives] - # Add the article to each archive in turn article.filename = filename = self.get_filename(article) - temp = self.format_article(article) # Reformat the article - fmt = "Processing article #%s into archives %s" - self.message(fmt % (article.sequence, archives)) - for i in archives: - self.archive = i - archivedir = os.path.join(self.basedir, i) - # If it's a new archive, create it - if i not in self.archives: - self.archives.append(i) - self.update_TOC = 1 - self.database.newArchive(i) - # If the archive directory doesn't exist, create it - try: - os.stat(archivedir) - except os.error, errdata: - errno, errmsg = errdata - if errno == 2: - mkdir(archivedir, self.DIRMODE) - else: - raise os.error, errdata - self.open_new_archive(i, archivedir) + temp = self.format_article(article) + fmt = "Processing article #%s into archives %s: %s" + self.message(fmt % (article.sequence, archives, article.subject)) + for arch in archives: + self.archive = arch # why do this??? 
+ archivedir = os.path.join(self.basedir, arch) + if arch not in self.archives: + self.new_archive(arch, archivedir) # Write the HTML-ized article - self.write_article(i, temp, os.path.join(archivedir, - filename)) + self.write_article(arch, temp, os.path.join(archivedir, + filename)) - authorkey = fixAuthor(article.author) + '\000' + article.date - subjectkey = string.lower(article.subject ) +'\000' + article.date - - # Update parenting info - parentID = None - if article.in_reply_to: - parentID = article.in_reply_to - elif article.references: - refs = self._remove_external_references(article.references) - if refs: - maxdate = self.database.getArticle(self.archive, - refs[0]) - for ref in refs[1:]: - a = self.database.getArticle(self.archive, ref) - if a.date > maxdate.date: - maxdate = a - parentID = maxdate.msgid - else: - # Get the oldest article with a matching subject, and - # assume this is a follow-up to that article - parentID = self.database.getOldestArticle(self.archive, - article.subject) + author = fixAuthor(article.author) + subject = string.lower(article.subject) - if parentID is not None \ - and not self.database.hasArticle(self.archive, parentID): - parentID = None - article.parentID = parentID - if parentID is not None: - parent = self.database.getArticle(self.archive, parentID) - article.threadKey = parent.threadKey + article.date + '-' - else: + article.parentID = parentID = self.get_parent_info(arch, article) + if parentID: + parent = self.database.getArticle(arch, parentID) + article.threadKey = parent.threadKey + article.date + '-' + else: article.threadKey = article.date + '-' - key = article.threadKey + '\000' + article.msgid - self.database.setThreadKey(self.archive, key, article.msgid) - self.database.addArticle(i, temp, subjectkey, authorkey) - if i not in self._dirty_archives: - self._dirty_archives.append(i) + key = article.threadKey, article.msgid + + self.database.setThreadKey(arch, key, article.msgid) + 
self.database.addArticle(arch, temp, author=author, + subject=subject) + + if arch not in self._dirty_archives: + self._dirty_archives.append(arch) + + def get_parent_info(self, archive, article): + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: + refs = self._remove_external_references(article.references) + if refs: + maxdate = self.database.getArticle(archive, refs[0]) + for ref in refs[1:]: + a = self.database.getArticle(archive, ref) + if a.date > maxdate.date: + maxdate = a + parentID = maxdate.msgid + else: + # Get the oldest article with a matching subject, and + # assume this is a follow-up to that article + parentID = self.database.getOldestArticle(archive, + article.subject) + + if parentID and not self.database.hasArticle(archive, parentID): + parentID = None + return parentID + + def write_article(self, index, article, path): f = open(path, 'w') @@ -588,13 +647,16 @@ class T: class BSDDBdatabase(Database): + __super_addArticle = Database.addArticle + def __init__(self, basedir): self.__cachekeys = [] self.__cachedict = {} self.__currentOpenArchive = None # The currently open indices self.basedir = os.path.expanduser(basedir) self.changed = {} # Recently added articles, indexed only by - # message ID + # message ID + def firstdate(self, archive): self.__openIndices(archive) date = 'None' @@ -604,6 +666,7 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def lastdate(self, archive): self.__openIndices(archive) date = 'None' @@ -613,41 +676,21 @@ class BSDDBdatabase(Database): except KeyError: pass return date + def numArticles(self, archive): self.__openIndices(archive) return len(self.dateIndex) - # Add a single article to the internal indexes for an archive. 
- - def addArticle(self, archive, article, subjectkey, authorkey): - self.__openIndices(archive) - - # Add the new article - self.dateIndex[article.date] = article.msgid - self.authorIndex[authorkey] = article.msgid - self.subjectIndex[subjectkey] = article.msgid - # Set the 'body' attribute to empty, to avoid storing the - # whole message - temp = article.body - article.body = [] - self.articleIndex[article.msgid] = pickle.dumps(article) - article.body = temp - self.changed[archive,article.msgid] = None - - parentID = article.parentID - if parentID is not None and self.articleIndex.has_key(parentID): - parent = self.getArticle(archive, parentID) - myThreadKey = parent.threadKey+article.date + '-' - else: - myThreadKey = article.date + '-' - article.threadKey = myThreadKey - key = myThreadKey + '\000' + article.msgid - self.setThreadKey(archive, key, article.msgid) + def addArticle(self, archive, article, subject=None, author=None, + date=None): + self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) # Open the BSDDB files that are being used as indices # (dateIndex, authorIndex, subjectIndex, articleIndex) def __openIndices(self, archive): - if self.__currentOpenArchive == archive: return + if self.__currentOpenArchive == archive: + return import bsddb self.__closeIndices() @@ -676,6 +719,7 @@ class BSDDBdatabase(Database): index.close() delattr(self,attr) self.__currentOpenArchive = None + def close(self): self.__closeIndices() def hasArticle(self, archive, msgid): |
