diff options
| -rw-r--r-- | Mailman/Utils.py | 22 |
1 file changed, 17 insertions, 5 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py index e5be24c0c..267f85f71 100644 --- a/Mailman/Utils.py +++ b/Mailman/Utils.py @@ -721,7 +721,11 @@ def canonstr(s, lang=None): break ref = parts.pop(0) if ref.startswith('#'): - appchr(int(ref[1:])) + try: + appchr(int(ref[1:])) + except ValueError: + # Non-convertable, stick with what we got + newparts.append('&'+ref+';') else: c = htmlentitydefs.entitydefs.get(ref, '?') if c.startswith('#') and c.endswith(';'): @@ -746,8 +750,9 @@ def canonstr(s, lang=None): # The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the -# charset of the given language, and failing that, replaces non-ASCII -# characters with their html references. +# charset of the given language, which is the character set that the page will +# be rendered in, and failing that, replaces non-ASCII characters with their +# html references. It always returns a byte string. def uncanonstr(s, lang=None): if s is None: s = u'' @@ -755,10 +760,17 @@ def uncanonstr(s, lang=None): charset = 'us-ascii' else: charset = GetCharSet(lang) - # BAW should change this to a type types of s + # See if the string contains characters only in the desired character + # set. If so, return it unchanged, except for coercing it to a byte + # string. try: - return s.encode(charset, 'strict') + if isinstance(s, UnicodeType): + return s.encode(charset) + else: + u = unicode(s, charset) + return s except UnicodeError: + # Nope, it contains funny characters, so html-ref it a = [] for c in s: o = ord(c) |
