diff options
| author | bwarsaw | 2002-10-04 20:45:53 +0000 |
|---|---|---|
| committer | bwarsaw | 2002-10-04 20:45:53 +0000 |
| commit | 9447ff68aa82ea696844e3a73f5ae79373dd5a9d (patch) | |
| tree | 4d1f09040a2bfa3901a894ac61683548539f44c2 | |
| parent | b1f0a5b8f5108fea2466ed04a4de9286b05198b9 (diff) | |
| download | mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.gz mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.zst mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.zip | |
uncanonstr(): We always want this to return a byte string, but it
needs to be "safe" for the given charset, which is both the charset
of the list's language and the charset the page will be rendered
in.
If it's a Unicode string and can be encoded in the desired charset,
fine, just return the encoded byte string. If it's already a byte
string encoded in that charset, that's fine too: just return the
string unchanged. Otherwise, html-ify the string (replace the
non-ASCII characters with their HTML references) and return it as a
byte string.
| -rw-r--r-- | Mailman/Utils.py | 22 |
1 files changed, 17 insertions, 5 deletions
```diff
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index e5be24c0c..267f85f71 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -721,7 +721,11 @@ def canonstr(s, lang=None):
             break
         ref = parts.pop(0)
         if ref.startswith('#'):
-            appchr(int(ref[1:]))
+            try:
+                appchr(int(ref[1:]))
+            except ValueError:
+                # Non-convertable, stick with what we got
+                newparts.append('&'+ref+';')
         else:
             c = htmlentitydefs.entitydefs.get(ref, '?')
             if c.startswith('#') and c.endswith(';'):
@@ -746,8 +750,9 @@ def canonstr(s, lang=None):
 
 
 # The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the
-# charset of the given language, and failing that, replaces non-ASCII
-# characters with their html references.
+# charset of the given language, which is the character set that the page will
+# be rendered in, and failing that, replaces non-ASCII characters with their
+# html references. It always returns a byte string.
 def uncanonstr(s, lang=None):
     if s is None:
         s = u''
@@ -755,10 +760,17 @@ def uncanonstr(s, lang=None):
         charset = 'us-ascii'
     else:
         charset = GetCharSet(lang)
-    # BAW should change this to a type types of s
+    # See if the string contains characters only in the desired character
+    # set. If so, return it unchanged, except for coercing it to a byte
+    # string.
     try:
-        return s.encode(charset, 'strict')
+        if isinstance(s, UnicodeType):
+            return s.encode(charset)
+        else:
+            u = unicode(s, charset)
+            return s
     except UnicodeError:
+        # Nope, it contains funny characters, so html-ref it
         a = []
         for c in s:
             o = ord(c)
```
