[PATCH 4 of 5] encoding: use getutf8char in toutf8b
Matt Mackall
mpm at selenic.com
Fri Nov 6 21:48:49 UTC 2015
# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1446765703 21600
# Thu Nov 05 17:21:43 2015 -0600
# Node ID aec423dc477c9e9f0eecde6ae436c6f8ef7d9999
# Parent ac268a2bea78b28e3317b9bebf204ad58a1ff62e
encoding: use getutf8char in toutf8b
This correctly avoids the ambiguity of U+FFFD characters already present in
the input, and similar confusion, by processing the input one character at a time.
diff -r ac268a2bea78 -r aec423dc477c mercurial/encoding.py
--- a/mercurial/encoding.py Thu Nov 05 17:11:50 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 17:21:43 2015 -0600
@@ -470,17 +470,20 @@
s.decode('utf-8')
return s
except UnicodeDecodeError:
- # surrogate-encode any characters that don't round-trip
- s2 = s.decode('utf-8', 'ignore').encode('utf-8')
- r = ""
- pos = 0
- for c in s:
- if s2[pos:pos + 1] == c:
- r += c
- pos += 1
- else:
- r += unichr(0xdc00 + ord(c)).encode('utf-8')
- return r
+ pass
+
+ r = ""
+ pos = 0
+ l = len(s)
+ while pos < l:
+ try:
+ c = getutf8char(s, pos)
+ pos += len(c)
+ except UnicodeDecodeError:
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ pos += 1
+ r += c
+ return r
def fromutf8b(s):
'''Given a UTF-8b string, return a local, possibly-binary string.
More information about the Mercurial-devel mailing list