[PATCH 4 of 5] encoding: use getutf8char in toutf8b

Matt Mackall mpm at selenic.com
Fri Nov 6 21:48:49 UTC 2015


# HG changeset patch
# User Matt Mackall <mpm at selenic.com>
# Date 1446765703 21600
#      Thu Nov 05 17:21:43 2015 -0600
# Node ID aec423dc477c9e9f0eecde6ae436c6f8ef7d9999
# Parent  ac268a2bea78b28e3317b9bebf204ad58a1ff62e
encoding: use getutf8char in toutf8b

Working a character at a time correctly avoids the ambiguity that arises
when U+FFFD is already present in the input, as well as similar confusion.

diff -r ac268a2bea78 -r aec423dc477c mercurial/encoding.py
--- a/mercurial/encoding.py	Thu Nov 05 17:11:50 2015 -0600
+++ b/mercurial/encoding.py	Thu Nov 05 17:21:43 2015 -0600
@@ -470,17 +470,20 @@
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
+        pass
+
+    r = ""
+    pos = 0
+    l = len(s)
+    while pos < l:
+        try:
+            c = getutf8char(s, pos)
+            pos += len(c)
+        except UnicodeDecodeError:
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            pos += 1
+        r += c
+    return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.



More information about the Mercurial-devel mailing list