[PATCH 2 of 2 STABLE] i18n: fix case folding problem in some problematic encodings
FUJIWARA Katsunori
foozy at lares.dti.ne.jp
Thu Nov 24 06:19:20 UTC 2011
# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1322114737 -32400
# Branch stable
# Node ID ccae15e928bfd34266aca2ecf257207597af832a
# Parent 8b3cfad9c07307cabb8a49861563bc57ddf37fbd
i18n: fix case folding problem in some problematic encodings
changeset 28e98a8b173d, which addressed the case folding problem with
ambiguous encodings, was not enough.
this patch fixes the cases that changeset missed.
this patch switches:
- from "str.lower()" to "encoding.lower()"
"str.lower()" on a byte sequence may corrupt the string in some character
encodings (e.g.: cp932 for Japanese)
for future maintenance, this patch also adds comments to the lines where
"str.lower()" on filenames is kept because it is sufficient there.
- from "os.path.normcase()" to "util.normcase()"
in changesets 6eff984d8e76 and b2fd4746414a, some
"os.path.normcase()" invocations were replaced by
"util.normcase()", but not all.
for consistency, this patch replaces all remaining "os.path.normcase()"
invocations with "util.normcase()", except the ones in "windows.py".
the patch hunk that catches LookupError in "encoding.lower()" is needed to
pass test-encoding.t, because switching to "encoding.lower()" causes the
configuration error to be detected here.
diff -r 8b3cfad9c073 -r ccae15e928bf hgext/win32mbcs.py
--- a/hgext/win32mbcs.py Thu Nov 24 14:59:26 2011 +0900
+++ b/hgext/win32mbcs.py Thu Nov 24 15:05:37 2011 +0900
@@ -130,7 +130,8 @@
os.path.splitunc os.path.normpath os.path.normcase os.makedirs
mercurial.util.endswithsep mercurial.util.splitpath mercurial.util.checkcase
mercurial.util.fspath mercurial.util.pconvert mercurial.util.normpath
- mercurial.util.checkwinfilename mercurial.util.checkosfilename'''
+ mercurial.util.checkwinfilename mercurial.util.checkosfilename
+ mercurial.util.normcase'''
# codec and alias names of sjis and big5 to be faked.
problematic_encodings = '''big5 big5-tw csbig5 big5hkscs big5-hkscs
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/encoding.py
--- a/mercurial/encoding.py Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/encoding.py Thu Nov 24 15:05:37 2011 +0900
@@ -171,3 +171,5 @@
return lu.encode(encoding)
except UnicodeError:
return s.lower() # we don't know how to fold this except in ASCII
+ except LookupError, k:
+ raise error.Abort("%s, please check your locale settings" % k)
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/posix.py
--- a/mercurial/posix.py Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/posix.py Thu Nov 24 15:05:37 2011 +0900
@@ -6,6 +6,7 @@
# GNU General Public License version 2 or any later version.
from i18n import _
+import encoding
import os, sys, errno, stat, getpass, pwd, grp, tempfile, unicodedata
posixfile = open
@@ -166,7 +167,8 @@
# os.path.normcase is a no-op, which doesn't help us on non-native filesystems
def normcase(path):
- return path.lower()
+ # 'path' may be byte sequence in problematic character encoding
+ return encoding.lower(path)
if sys.platform == 'darwin':
import fcntl # only needed on darwin, missing on jython
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/scmutil.py
--- a/mercurial/scmutil.py Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/scmutil.py Thu Nov 24 15:05:37 2011 +0900
@@ -86,8 +86,11 @@
# AIX ignores "/" at end of path, others raise EISDIR.
if util.endswithsep(path):
raise util.Abort(_("path ends in directory separator: %s") % path)
- normpath = os.path.normcase(path)
+ normpath = util.normcase(path)
parts = util.splitpath(normpath)
+ # below 'lower()'s are for comparison only with ASCII strings,
+ # so encoding.lower() is not needed
+ # even if encoding of 'path' is problematic one
if (os.path.splitdrive(path)[0]
or parts[0].lower() in ('.hg', '.hg.', '')
or os.pardir in parts):
@@ -451,6 +454,9 @@
return rcpath
value = value.replace('/', os.sep)
for p in value.split(os.pathsep):
+ # below 'lower()' is for comparison only with ASCII string,
+ # so encoding.lower() is not needed
+ # even if encoding of 'path' is problematic one
if p.lower().endswith('mercurial.ini'):
rcpath.append(p)
elif os.path.isdir(p):
diff -r 8b3cfad9c073 -r ccae15e928bf mercurial/util.py
--- a/mercurial/util.py Thu Nov 24 14:59:26 2011 +0900
+++ b/mercurial/util.py Thu Nov 24 15:05:37 2011 +0900
@@ -593,6 +593,10 @@
Requires a path (like /foo/.hg) ending with a foldable final
directory component.
"""
+ # this is invoked with 'path' of which last element consists of
+ # only ASCII characters (e.g.: '.hg'),
+ # so upper()/lower() never cause problem,
+ # even if encoding of 'path' is problematic one
s1 = os.stat(path)
d, b = os.path.split(path)
p2 = os.path.join(d, b.upper())
@@ -614,8 +618,17 @@
with root. Note that this function is unnecessary, and should not be
called, for case-sensitive filesystems (simply because it's expensive).
'''
+
+ # 'name' may be byte sequence in problematic character encoding,
+ # because win32mbcs is enabled only in Windows native environment.
+ def lower(x):
+ if isinstance(x, unicode): # win32mbcs enabled
+ return x.lower()
+ else:
+ return encoding.lower(x)
+
# If name is absolute, make it relative
- if name.lower().startswith(root.lower()):
+ if lower(name).startswith(lower(root)):
l = len(root)
if name[l] == os.sep or name[l] == os.altsep:
l = l + 1
@@ -630,8 +643,19 @@
# Protect backslashes. This gets silly very quickly.
seps.replace('\\','\\\\')
pattern = re.compile(r'([^%s]+)|([%s]+)' % (seps, seps))
- dir = os.path.normcase(os.path.normpath(root))
+ dir = normcase(os.path.normpath(root))
result = []
+
+ # encoding.fromlocal() on 'name' is not needed because:
+ #
+ # - Windows ('\\' as path separator) + problematic encoding
+ # should enable win32mbcs which wraps this function for
+ # unicode-nize of 'name'
+ #
+ # - other posix environment (e.g.: cygwin) uses '/' as path
+ # separator, so re.findall() does not break byte sequence,
+ # even though it uses problematic encoding
+
for part, sep in pattern.findall(name):
if sep:
result.append(sep)
@@ -641,10 +665,10 @@
_fspathcache[dir] = os.listdir(dir)
contents = _fspathcache[dir]
- lpart = part.lower()
+ lpart = lower(part)
lenp = len(part)
for n in contents:
- if lenp == len(n) and n.lower() == lpart:
+ if lenp == len(n) and lower(n) == lpart:
result.append(n)
break
else:
More information about the Mercurial-devel
mailing list