diff options
Diffstat (limited to 'Mailman/Archiver/HyperArch.py')
| -rw-r--r-- | Mailman/Archiver/HyperArch.py | 156 |
1 files changed, 48 insertions, 108 deletions
diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py index 17b9e1a2a..affe66633 100644 --- a/Mailman/Archiver/HyperArch.py +++ b/Mailman/Archiver/HyperArch.py @@ -37,6 +37,7 @@ import types import HyperDatabase import pipermail import weakref +import binascii from email.Header import decode_header, make_header @@ -183,7 +184,7 @@ def quick_maketext(templatefile, dict=None, lang=None, mlist=None): except UnicodeError: # Try again after coercing the template to unicode utemplate = unicode(template, - charset or Utils.GetCharSet(lang), + Utils.GetCharSet(lang), 'replace') text = sdict.interpolate(utemplate) except (TypeError, ValueError): @@ -219,9 +220,6 @@ class Article(pipermail.Article): _last_article_time = time.time() - # Default - charset = Utils.GetCharSet(mm_cfg.DEFAULT_SERVER_LANGUAGE) - def __init__(self, message=None, sequence=0, keepHeaders=[], lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None): self.__super_init(message, sequence, keepHeaders) @@ -266,13 +264,20 @@ class Article(pipermail.Article): charset = charset[1:-1] if charset[0]=="'" and charset[-1]=="'": charset = charset[1:-1] - # check_header_charsets() sets self.charset - self.check_header_charsets(charset) - if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING: - self.quote = Utils.uquote - # Only one 'uquote' is left here. I wonder if this is of - # any use because 'quoting' conflicts 'verbatim'. There should - # not be any verbatim charset IMHO. (TK) + try: + body = message.get_payload(decode=1) + except binascii.Error: + body = None + if body and charset != Utils.GetCharSet(self._lang): + # decode body + try: + body = unicode(body, charset) + except (UnicodeError, LookupError): + body = None + if body: + self.body = [l + "\n" for l in body.splitlines()] + + self.decode_headers() # Mapping of listnames to MailList instances as a weak value dictionary. # This code is copied from Runner.py but there's one important operational @@ -332,8 +337,6 @@ class Article(pipermail.Article): self._lang = self._mlist.preferred_language else: self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE - if not d.has_key('charset'): - self.charset = Utils.GetCharSet(self._lang) if not d.has_key('cenc'): self.cenc = None if not d.has_key('decoded'): @@ -346,32 +349,22 @@ class Article(pipermail.Article): def quote(self, buf): return html_quote(buf, self._lang) - def check_header_charsets(self, msg_charset=None): - """Check From and Subject for encoded-words + def decode_headers(self): + """MIME-decode headers. If the email, subject, or author attributes contain non-ASCII - characters using the encoded-word syntax of RFC 2047, decoded - versions of those attributes are placed in the self.decoded (a - dictionary). + characters using the encoded-word syntax of RFC 2047, decoded versions + of those attributes are placed in the self.decoded (a dictionary). - If the charsets used by these headers differ from each other - or from the charset specified by the message's Content-Type - header, then an arbitrary charset is chosen. If the decoded - fields match that charset, they are preserved - literally. Otherwise, an attempt is made to decode them as - Unicode. If that fails, they are left undecoded. + If the list's charset differs from the header charset, an attempt is + made to decode the headers as Unicode. If that fails, they are left + undecoded. """ - - self.charset = msg_charset - author, a_charset = self.decode_charset(self.author) - if self.charset is None and a_charset: - self.charset = a_charset - subject, s_charset = self.decode_charset(self.subject) - if self.charset is None and s_charset: - self.charset = s_charset + author = self.decode_charset(self.author) + subject = self.decode_charset(self.subject) if author: self.decoded['author'] = author - email, e_charset = self.decode_charset(self.email) + email = self.decode_charset(self.email) if email: self.decoded['email'] = email if subject: @@ -379,28 +372,19 @@ class Article(pipermail.Article): def decode_charset(self, field): if field.find("=?") == -1: - return None, None + return None # Get the decoded header as a list of (s, charset) tuples pairs = decode_header(field) - mustunicode = 0 - for s, c in pairs: - # If the charset of all the header parts match the article's - # charset, leave it as encoded, otherwise try converting to - # Unicode. - if c and c <> 'us-ascii' and c <> self.charset: - mustunicode = 1 - break - if mustunicode: - # Use __unicode__() until we can guarantee Python 2.2 - try: - # Use a large number for maxlinelen so it won't get wrapped - h = make_header(pairs, 99999) - return h.__unicode__(), None - except (UnicodeError, LookupError): - # Unknown encoding - return None, None + # Use __unicode__() until we can guarantee Python 2.2 + try: + # Use a large number for maxlinelen so it won't get wrapped + h = make_header(pairs, 99999) + return h.__unicode__() + except (UnicodeError, LookupError): + # Unknown encoding + return None # The last value for c will have the proper charset in it - return EMPTYSTRING.join([s for s, c in pairs]), c + return EMPTYSTRING.join([s for s, c in pairs]) def as_html(self): d = self.__dict__.copy() @@ -461,17 +445,7 @@ class Article(pipermail.Article): If the charset of the current message and art match and the article's subject is encoded, decode it. """ - subj = art.decoded.get('subject') - if not subj: - return art.subject - if isinstance(subj, types.UnicodeType): - return subj - if self.charset and self.charset == art.charset: - return subj - try: - return unicode(subj, art.charset) - except (UnicodeError, LookupError): - return art.subject + return art.decoded.get('subject', art.subject) def _get_next(self): """Return the href and subject for the previous message""" @@ -496,33 +470,7 @@ class Article(pipermail.Article): body = self.html_body except AttributeError: body = self.body - if self.charset is None or self.cenc != "quoted-printable": - return null_to_space(EMPTYSTRING.join(body)) - # the charset is specified and the body is quoted-printable - # first get rid of soft line breaks, then decode literals - lines = [] - rx = self._rx_softline - for line in body: - mo = rx.search(line) - if mo: - i = line.rfind("=") - line = line[:i] - lines.append(line) - buf = EMPTYSTRING.join(lines) - - chunks = [] - offset = 0 - rx = self._rx_quote - while 1: - mo = rx.search(buf, offset) - if not mo: - chunks.append(buf[offset:]) - break - i = mo.start() - chunks.append(buf[offset:i]) - offset = i + 3 - chunks.append(chr(int(mo.group(1), 16))) - return null_to_space(EMPTYSTRING.join(chunks)) + return null_to_space(EMPTYSTRING.join(body)) def _add_decoded(self, d): """Add encoded-word keys to HTML output""" @@ -554,9 +502,10 @@ class Article(pipermail.Article): headers.append('References: %(_references)s') if d['_message_id']: headers.append('Message-ID: %(_message_id)s') - return NL.join(headers) % d + \ - '\n\n' + \ - EMPTYSTRING.join(self.body) + body = EMPTYSTRING.join(self.body) + if isinstance(body, types.UnicodeType): + body = body.encode(Utils.GetCharSet(self._lang), 'replace') + return NL.join(headers) % d + '\n\n' + body def _set_date(self, message): self.__super_set_date(message) @@ -1030,16 +979,8 @@ class HyperArchive(pipermail.T): result = article.decoded.get(field) if result is None: return getattr(article, field) - # if the encodings match, use the result - if self.charset == article.charset: - return result - # otherwise, try to return a Unicode result - if isinstance(result, types.UnicodeType): - return result - try: - return unicode(result, article.charset) - except (UnicodeError, LookupError): - return getattr(article, field) + # otherwise, the decoded one will be Unicode + return result def write_threadindex_entry(self, article, depth): if depth < 0: @@ -1193,17 +1134,16 @@ class HyperArchive(pipermail.T): #self.message("URL: %s %s %s \n" # % (CGIescape(L[:pos]), URL, CGIescape(text))) L2 += '%s<A HREF="%s">%s</A>' % ( - CGIescape(L[:pos], self.lang), - URL, CGIescape(text, self.lang)) + CGIescape(L[:pos], self.lang), + html_quote(URL), CGIescape(text, self.lang)) L = L[pos+length:] jr = emailpat.search(L) kr = urlpat.search(L) if jr is None and kr is None: L = CGIescape(L, self.lang) L = prefix + L2 + L + suffix - if L != Lorig: - source[i] = None - dest[i] = L + source[i] = None + dest[i] = L # Perform Hypermail-style processing of <HTML></HTML> directives # in message bodies. Lines between <HTML> and </HTML> will be written |
