summary | refs | log | tree | commit | diff
path: root/Mailman/Archiver/HyperArch.py
diff options
context:
space:
mode:
Diffstat (limited to 'Mailman/Archiver/HyperArch.py')
-rw-r--r-- Mailman/Archiver/HyperArch.py 156
1 files changed, 48 insertions, 108 deletions
diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py
index 17b9e1a2a..affe66633 100644
--- a/Mailman/Archiver/HyperArch.py
+++ b/Mailman/Archiver/HyperArch.py
@@ -37,6 +37,7 @@ import types
import HyperDatabase
import pipermail
import weakref
+import binascii
from email.Header import decode_header, make_header
@@ -183,7 +184,7 @@ def quick_maketext(templatefile, dict=None, lang=None, mlist=None):
except UnicodeError:
# Try again after coercing the template to unicode
utemplate = unicode(template,
- charset or Utils.GetCharSet(lang),
+ Utils.GetCharSet(lang),
'replace')
text = sdict.interpolate(utemplate)
except (TypeError, ValueError):
@@ -219,9 +220,6 @@ class Article(pipermail.Article):
_last_article_time = time.time()
- # Default
- charset = Utils.GetCharSet(mm_cfg.DEFAULT_SERVER_LANGUAGE)
-
def __init__(self, message=None, sequence=0, keepHeaders=[],
lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None):
self.__super_init(message, sequence, keepHeaders)
@@ -266,13 +264,20 @@ class Article(pipermail.Article):
charset = charset[1:-1]
if charset[0]=="'" and charset[-1]=="'":
charset = charset[1:-1]
- # check_header_charsets() sets self.charset
- self.check_header_charsets(charset)
- if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING:
- self.quote = Utils.uquote
- # Only one 'uquote' is left here. I wonder if this is of
- # any use because 'quoting' conflicts 'verbatim'. There should
- # not be any verbatim charset IMHO. (TK)
+ try:
+ body = message.get_payload(decode=1)
+ except binascii.Error:
+ body = None
+ if body and charset != Utils.GetCharSet(self._lang):
+ # decode body
+ try:
+ body = unicode(body, charset)
+ except (UnicodeError, LookupError):
+ body = None
+ if body:
+ self.body = [l + "\n" for l in body.splitlines()]
+
+ self.decode_headers()
# Mapping of listnames to MailList instances as a weak value dictionary.
# This code is copied from Runner.py but there's one important operational
@@ -332,8 +337,6 @@ class Article(pipermail.Article):
self._lang = self._mlist.preferred_language
else:
self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
- if not d.has_key('charset'):
- self.charset = Utils.GetCharSet(self._lang)
if not d.has_key('cenc'):
self.cenc = None
if not d.has_key('decoded'):
@@ -346,32 +349,22 @@ class Article(pipermail.Article):
def quote(self, buf):
return html_quote(buf, self._lang)
- def check_header_charsets(self, msg_charset=None):
- """Check From and Subject for encoded-words
+ def decode_headers(self):
+ """MIME-decode headers.
If the email, subject, or author attributes contain non-ASCII
- characters using the encoded-word syntax of RFC 2047, decoded
- versions of those attributes are placed in the self.decoded (a
- dictionary).
+ characters using the encoded-word syntax of RFC 2047, decoded versions
+ of those attributes are placed in the self.decoded (a dictionary).
- If the charsets used by these headers differ from each other
- or from the charset specified by the message's Content-Type
- header, then an arbitrary charset is chosen. If the decoded
- fields match that charset, they are preserved
- literally. Otherwise, an attempt is made to decode them as
- Unicode. If that fails, they are left undecoded.
+ If the list's charset differs from the header charset, an attempt is
+ made to decode the headers as Unicode. If that fails, they are left
+ undecoded.
"""
-
- self.charset = msg_charset
- author, a_charset = self.decode_charset(self.author)
- if self.charset is None and a_charset:
- self.charset = a_charset
- subject, s_charset = self.decode_charset(self.subject)
- if self.charset is None and s_charset:
- self.charset = s_charset
+ author = self.decode_charset(self.author)
+ subject = self.decode_charset(self.subject)
if author:
self.decoded['author'] = author
- email, e_charset = self.decode_charset(self.email)
+ email = self.decode_charset(self.email)
if email:
self.decoded['email'] = email
if subject:
@@ -379,28 +372,19 @@ class Article(pipermail.Article):
def decode_charset(self, field):
if field.find("=?") == -1:
- return None, None
+ return None
# Get the decoded header as a list of (s, charset) tuples
pairs = decode_header(field)
- mustunicode = 0
- for s, c in pairs:
- # If the charset of all the header parts match the article's
- # charset, leave it as encoded, otherwise try converting to
- # Unicode.
- if c and c <> 'us-ascii' and c <> self.charset:
- mustunicode = 1
- break
- if mustunicode:
- # Use __unicode__() until we can guarantee Python 2.2
- try:
- # Use a large number for maxlinelen so it won't get wrapped
- h = make_header(pairs, 99999)
- return h.__unicode__(), None
- except (UnicodeError, LookupError):
- # Unknown encoding
- return None, None
+ # Use __unicode__() until we can guarantee Python 2.2
+ try:
+ # Use a large number for maxlinelen so it won't get wrapped
+ h = make_header(pairs, 99999)
+ return h.__unicode__()
+ except (UnicodeError, LookupError):
+ # Unknown encoding
+ return None
# The last value for c will have the proper charset in it
- return EMPTYSTRING.join([s for s, c in pairs]), c
+ return EMPTYSTRING.join([s for s, c in pairs])
def as_html(self):
d = self.__dict__.copy()
@@ -461,17 +445,7 @@ class Article(pipermail.Article):
If the charset of the current message and art match and the
article's subject is encoded, decode it.
"""
- subj = art.decoded.get('subject')
- if not subj:
- return art.subject
- if isinstance(subj, types.UnicodeType):
- return subj
- if self.charset and self.charset == art.charset:
- return subj
- try:
- return unicode(subj, art.charset)
- except (UnicodeError, LookupError):
- return art.subject
+ return art.decoded.get('subject', art.subject)
def _get_next(self):
"""Return the href and subject for the previous message"""
@@ -496,33 +470,7 @@ class Article(pipermail.Article):
body = self.html_body
except AttributeError:
body = self.body
- if self.charset is None or self.cenc != "quoted-printable":
- return null_to_space(EMPTYSTRING.join(body))
- # the charset is specified and the body is quoted-printable
- # first get rid of soft line breaks, then decode literals
- lines = []
- rx = self._rx_softline
- for line in body:
- mo = rx.search(line)
- if mo:
- i = line.rfind("=")
- line = line[:i]
- lines.append(line)
- buf = EMPTYSTRING.join(lines)
-
- chunks = []
- offset = 0
- rx = self._rx_quote
- while 1:
- mo = rx.search(buf, offset)
- if not mo:
- chunks.append(buf[offset:])
- break
- i = mo.start()
- chunks.append(buf[offset:i])
- offset = i + 3
- chunks.append(chr(int(mo.group(1), 16)))
- return null_to_space(EMPTYSTRING.join(chunks))
+ return null_to_space(EMPTYSTRING.join(body))
def _add_decoded(self, d):
"""Add encoded-word keys to HTML output"""
@@ -554,9 +502,10 @@ class Article(pipermail.Article):
headers.append('References: %(_references)s')
if d['_message_id']:
headers.append('Message-ID: %(_message_id)s')
- return NL.join(headers) % d + \
- '\n\n' + \
- EMPTYSTRING.join(self.body)
+ body = EMPTYSTRING.join(self.body)
+ if isinstance(body, types.UnicodeType):
+ body = body.encode(Utils.GetCharSet(self._lang), 'replace')
+ return NL.join(headers) % d + '\n\n' + body
def _set_date(self, message):
self.__super_set_date(message)
@@ -1030,16 +979,8 @@ class HyperArchive(pipermail.T):
result = article.decoded.get(field)
if result is None:
return getattr(article, field)
- # if the encodings match, use the result
- if self.charset == article.charset:
- return result
- # otherwise, try to return a Unicode result
- if isinstance(result, types.UnicodeType):
- return result
- try:
- return unicode(result, article.charset)
- except (UnicodeError, LookupError):
- return getattr(article, field)
+ # otherwise, the decoded one will be Unicode
+ return result
def write_threadindex_entry(self, article, depth):
if depth < 0:
@@ -1193,17 +1134,16 @@ class HyperArchive(pipermail.T):
#self.message("URL: %s %s %s \n"
# % (CGIescape(L[:pos]), URL, CGIescape(text)))
L2 += '%s<A HREF="%s">%s</A>' % (
- CGIescape(L[:pos], self.lang),
- URL, CGIescape(text, self.lang))
+ CGIescape(L[:pos], self.lang),
+ html_quote(URL), CGIescape(text, self.lang))
L = L[pos+length:]
jr = emailpat.search(L)
kr = urlpat.search(L)
if jr is None and kr is None:
L = CGIescape(L, self.lang)
L = prefix + L2 + L + suffix
- if L != Lorig:
- source[i] = None
- dest[i] = L
+ source[i] = None
+ dest[i] = L
# Perform Hypermail-style processing of <HTML></HTML> directives
# in message bodies. Lines between <HTML> and </HTML> will be written