From 0efe92058114a180ea14d254049e79b4f4bea057 Mon Sep 17 00:00:00 2001
From: bwarsaw
Date: Wed, 18 Sep 2002 05:38:51 +0000
Subject: canonstr(): With help from Tokio Kikuchi, this patch should make for
 better Japanese support.  If we're converting from &#XYZ; form, use chr() if
 the integer is < 256, otherwise use unichr().  If the resulting joined string
 is already a Unicode string, don't convert it.

uncanonstr(): Use u'' instead of '' as the string when s is None.  The
Japanese codecs require a Unicode string as their argument, not an 8-bit
string.
---
 Mailman/Utils.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'Mailman/Utils.py')
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 12141c2c8..3443393fa 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -23,6 +23,8 @@ the mailing lists, and whatever else doesn't belong elsewhere.
 
 """
 
+from __future__ import nested_scopes
+
 import os
 import re
 import random
@@ -33,6 +35,7 @@ import time
 import cgi
 import htmlentitydefs
 import email.Iterators
+from types import UnicodeType
 from string import whitespace, digits
 try:
     # Python 2.2
@@ -703,20 +706,27 @@ def percent_identifiers(s):
 def canonstr(s, lang=None):
     newparts = []
     parts = re.split(r'&(?P<ref>[^;]+);', s)
+    def appchr(i):
+        if i < 256:
+            newparts.append(chr(i))
+        else:
+            newparts.append(unichr(i))
     while 1:
         newparts.append(parts.pop(0))
         if not parts:
             break
         ref = parts.pop(0)
         if ref.startswith('#'):
-            newparts.append(chr(int(ref[1:])))
+            appchr(int(ref[1:]))
         else:
             c = htmlentitydefs.entitydefs.get(ref, '?')
             if c.startswith('#') and c.endswith(';'):
-                newparts.append(chr(ref[1:-1]))
+                appchr(int(ref[1:-1]))
             else:
                 newparts.append(c)
     newstr = EMPTYSTRING.join(newparts)
+    if isinstance(newstr, UnicodeType):
+        return newstr
     # We want the default fallback to be iso-8859-1 even if the language is
     # English (us-ascii).  This seems like a practical compromise so that
     # non-ASCII characters in names can be used in English lists w/o having to
@@ -736,7 +746,7 @@ def canonstr(s, lang=None):
 # characters with their html references.
 def uncanonstr(s, lang=None):
     if s is None:
-        s = ''
+        s = u''
     if lang is None:
         charset = 'us-ascii'
     else:
-- 
cgit v1.2.3-70-g09d2