uncanonstr(): We always want this to return a byte string, but it needs to be "safe" for the given charset, which is both the charset for the list's language and the charset the page will be rendered in. If it's a Unicode string and can be encoded in the desired charset, fine, just return the encoded byte string. If it's already a byte string encoded in that charset, again that's fine, just return the string unchanged. Otherwise, html-ify the string (replacing non-ASCII characters with their HTML references) and return it as a byte string.
author: bwarsaw 2002-10-04 20:45:53 +0000
committer: bwarsaw 2002-10-04 20:45:53 +0000
commit: 9447ff68aa82ea696844e3a73f5ae79373dd5a9d (patch)
tree: 4d1f09040a2bfa3901a894ac61683548539f44c2
parent: b1f0a5b8f5108fea2466ed04a4de9286b05198b9 (diff)
download: mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.gz
mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.zst
mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.zip
1 files changed, 17 insertions, 5 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index e5be24c0c..267f85f71 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -721,7 +721,11 @@ def canonstr(s, lang=None):
             break
         ref = parts.pop(0)
         if ref.startswith('#'):
-            appchr(int(ref[1:]))
+            try:
+                appchr(int(ref[1:]))
+            except ValueError:
+                # Non-convertable, stick with what we got
+                newparts.append('&'+ref+';')
         else:
             c = htmlentitydefs.entitydefs.get(ref, '?')
             if c.startswith('#') and c.endswith(';'):
@@ -746,8 +750,9 @@ def canonstr(s, lang=None):
 
 
 # The opposite of canonstr() -- sorta.  I.e. it attempts to encode s in the
-# charset of the given language, and failing that, replaces non-ASCII
-# characters with their html references.
+# charset of the given language, which is the character set that the page will
+# be rendered in, and failing that, replaces non-ASCII characters with their
+# html references.  It always returns a byte string.
 def uncanonstr(s, lang=None):
     if s is None:
         s = u''
@@ -755,10 +760,17 @@ def uncanonstr(s, lang=None):
         charset = 'us-ascii'
     else:
         charset = GetCharSet(lang)
-    # BAW should change this to a type types of s
+    # See if the string contains characters only in the desired character
+    # set.  If so, return it unchanged, except for coercing it to a byte
+    # string.
     try:
-        return s.encode(charset, 'strict')
+        if isinstance(s, UnicodeType):
+            return s.encode(charset)
+        else:
+            u = unicode(s, charset)
+            return s
     except UnicodeError:
+        # Nope, it contains funny characters, so html-ref it
         a = []
         for c in s:
             o = ord(c)
author	bwarsaw	2002-10-04 20:45:53 +0000
committer	bwarsaw	2002-10-04 20:45:53 +0000
commit	9447ff68aa82ea696844e3a73f5ae79373dd5a9d (patch)
tree	4d1f09040a2bfa3901a894ac61683548539f44c2
parent	b1f0a5b8f5108fea2466ed04a4de9286b05198b9 (diff)
download	mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.gz mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.zst mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.zip