2 files changed, 45 insertions, 15 deletions
diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py
index 10307f416..a2345082f 100644
--- a/Mailman/Archiver/HyperArch.py
+++ b/Mailman/Archiver/HyperArch.py
@@ -194,7 +194,7 @@ class Article(pipermail.Article):
     text_tmpl = article_text_template
 
     # for compatibility with old archives loaded via pickle
-    charset = None
+    charset = mm_cfg.DEFAULT_CHARSET
     cenc = None
     decoded = {}
 
@@ -226,9 +226,11 @@ class Article(pipermail.Article):
             self.check_header_charsets(string.lower(mo.group(1)))
         else:
             self.check_header_charsets()
-        if self.charset:
-            assert self.charset == string.lower(self.charset), \
-                   self.charset
+        if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING:
+            self.quote = lambda x:x
+
+    def quote(self, buf):
+        return html_quote(buf)
 
     def check_header_charsets(self, msg_charset=None):
         """Check From and Subject for encoded-words
@@ -276,12 +278,12 @@ class Article(pipermail.Article):
         d["prev"], d["prev_wsubj"] = self._get_prev()
         d["next"], d["next_wsubj"] = self._get_next()
 	
-	d["email_html"] = html_quote(self.email)
-	d["title"] = html_quote(self.subject)
-	d["subject_html"] = html_quote(self.subject)
-	d["author_html"] = html_quote(self.author)
+	d["email_html"] = self.quote(self.email)
+	d["title"] = self.quote(self.subject)
+	d["subject_html"] = self.quote(self.subject)
+	d["author_html"] = self.quote(self.author)
 	d["email_url"] = url_quote(self.email)
-	d["datestr_html"] = html_quote(self.datestr)
+	d["datestr_html"] = self.quote(self.datestr)
         d["body"] = self._get_body()
 
         if self.charset is not None:
@@ -302,7 +304,7 @@ class Article(pipermail.Article):
 	    prev_wsubj = ('<LI> Previous message:'
                           ' <A HREF="%s">%s\n</A></li>'
                           % (url_quote(self.prev.filename),
-                             html_quote(subject)))
+                             self.quote(subject)))
         else:
             prev = prev_wsubj = ""
         return prev, prev_wsubj
@@ -328,7 +330,7 @@ class Article(pipermail.Article):
 	    next_wsubj = ('<LI> Next message:'
                           ' <A HREF="%s">%s\n</A></li>'
                           % (url_quote(self.next.filename),
-                             html_quote(subject)))
+                             self.quote(subject)))
         else:
             next = next_wsubj = ""
         return next, next_wsubj
@@ -338,13 +340,17 @@ class Article(pipermail.Article):
 
     def _get_body(self):
         """Return the message body ready for HTML, decoded if necessary"""
+        try:
+            body = self.html_body
+        except AttributeError:
+            body = self.body
         if self.charset is None or self.cenc != "quoted-printable":
-            return null_to_space(string.join(self.body, ""))
+            return null_to_space(string.join(body, ""))
         # the charset is specified and the body is quoted-printable
         # first get rid of soft line breaks, then decode literals
         lines = []
         rx = self._rx_softline
-        for line in self.body:
+        for line in body:
             mo = rx.search(line)
             if mo:
                 i = string.rfind(line, "=")
@@ -401,6 +407,8 @@ class Article(pipermail.Article):
     def __getstate__(self):
         d={}
         for each in self.__dict__.keys():
+            if each == "quote":
+                continue
             if each in ['maillist','prev','next','body']:
                 d[each] = None
             else:
@@ -888,6 +896,7 @@ class HyperArchive(pipermail.T):
         toc.close()
 
     def write_article(self, index, article, path):
+        # called by add_article
         f = open_ex(path, 'w')
         f.write(article.as_html())
         f.close()
@@ -1049,9 +1058,14 @@ class HyperArchive(pipermail.T):
                 i = i + 1
 	    
     def format_article(self, article):
+        # called from add_article
+        # TBD: Why do the HTML formatting here and keep it in the
+        # pipermail database?  It makes more sense to do the html
+        # formatting as the article is being written as html and toss
+        # the data after it has been written to the archive file.
 	lines = filter(None, article.body)
 	# Handle <HTML> </HTML> directives
-	if self.ALLOWHTML: 
+	if self.ALLOWHTML:
 	    self.__processbody_HTML(lines)
 	self.__processbody_URLquote(lines)
 	if not self.SHOWHTML and lines: 
@@ -1066,7 +1080,7 @@ class HyperArchive(pipermail.T):
                     s = lines[i]
 		    if s[0:1] in ' \t\n':
                         lines[i] = '<P>' + s
-        article.body = lines
+        article.html_body = lines
 	return article
 
     def update_article(self, arcdir, article, prev, next):
diff --git a/Mailman/Defaults.py.in b/Mailman/Defaults.py.in
index b70bbe81e..036da76f3 100644
--- a/Mailman/Defaults.py.in
+++ b/Mailman/Defaults.py.in
@@ -117,6 +117,22 @@ GZIP_ARCHIVE_TXT_FILES = 0
 # in the archives too.
 ARCHIVER_OBSCURES_EMAILADDRS = 0
 
+# Pipermail assumes that messages bodies contain US-ASCII text.
+# Change this option to define a different character set to be used as
+# the default character set for the archive.  The term "character set"
+# is used in MIME to refer to a method of converting a sequence of
+# octets into a sequence of characters.  If you change the default
+# charset, you might need to add it to VERBATIM_ENCODING below.
+DEFAULT_CHARSET = None
+
+# Most character set encodings require special HTML entity characters
+# to be quoted, otherwise they won't look right in the Pipermail
+# archives.  However some character sets must not quote these
+# characters so that they can be rendered properly in the browsers.
+# The primary issue is multi-byte encodings where the octet 0x26 does
+# not always represent the & character.  This variable contains a list
+# of such characters sets which are not HTML-quoted in the archives.
+VERBATIM_ENCODING = ['iso-2022-jp']
 
 
 #####