[PATCH 1 of 3] encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara
yuya at tcha.org
Mon May 7 13:17:14 UTC 2018
# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1524364733 -32400
# Sun Apr 22 11:38:53 2018 +0900
# Node ID bcf0435282e041532b2c66c131d34180ac85bf94
# Parent a7e53b70e5026bac9772f9869e754a2a6f530587
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
If 's' is a localstr, 's._utf8' must be returned to get the original UTF-8
sequence back. Because of this, it was wrong to gate that return on
'"\xed" not in s'; the test should have been either '"\xed" not in s._utf8'
or omitted entirely.
This patch moves the localstr handling to the top of the function, since the
validity of 's._utf8' should already have been checked by encoding.tolocal().
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -504,11 +504,13 @@ def toutf8b(s):
internal surrogate encoding as a UTF-8 string.)
'''
- if not isinstance(s, localstr) and isasciistr(s):
+ if isinstance(s, localstr):
+ # assume that the original UTF-8 sequence would never contain
+ # invalid characters in U+DCxx range
+ return s._utf8
+ elif isasciistr(s):
return s
if "\xed" not in s:
- if isinstance(s, localstr):
- return s._utf8
try:
s.decode('utf-8', _utf8strict)
return s
diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py
--- a/tests/test-encoding-func.py
+++ b/tests/test-encoding-func.py
@@ -35,11 +35,32 @@ class LocalEncodingTest(unittest.TestCas
self.assertTrue(s is encoding.fromlocal(s))
class Utf8bEncodingTest(unittest.TestCase):
+ def setUp(self):
+ self.origencoding = encoding.encoding
+
+ def tearDown(self):
+ encoding.encoding = self.origencoding
+
def testasciifastpath(self):
s = b'\0' * 100
self.assertTrue(s is encoding.toutf8b(s))
self.assertTrue(s is encoding.fromutf8b(s))
+ def testlossylatin(self):
+ encoding.encoding = b'ascii'
+ s = u'\xc0'.encode('utf-8')
+ l = encoding.tolocal(s)
+ self.assertEqual(l, b'?') # lossy
+ self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
+
+ def testlossy0xed(self):
+ encoding.encoding = b'euc-kr' # U+Dxxx Hangul
+ s = u'\ud1bc\xc0'.encode('utf-8')
+ l = encoding.tolocal(s)
+ self.assertIn(b'\xed', l)
+ self.assertTrue(l.endswith(b'?')) # lossy
+ self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
+
if __name__ == '__main__':
import silenttestrunner
silenttestrunner.main(__name__)
More information about the Mercurial-devel
mailing list