[PATCH 1 of 2] encoding: add BOM (byte-order-mark) handling function (issue2162)
Yuya Nishihara
yuya at tcha.org
Sat Jun 19 13:56:34 UTC 2010
# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1276951076 -32400
# Node ID 19dc05994d6cc2727efbde32c53bcbb1dfbdc83b
# Parent e5a2134c083b223bd2998b6694e430e4999caee3
encoding: add BOM (byte-order-mark) handling function (issue2162)
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -6,7 +6,7 @@
# GNU General Public License version 2 or any later version.
import error
-import sys, unicodedata, locale, os
+import sys, unicodedata, locale, os, codecs
_encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
@@ -75,3 +75,39 @@ def colwidth(s):
return sum([w(c) in 'WFA' and 2 or 1 for c in d])
return len(d)
+# Ordered (codec name, BOM byte string) pairs. detectbom() walks this list
+# front to back and returns on the first prefix match, so order matters.
+_bommap = [
+ ('utf_8', codecs.BOM_UTF8),
+ ('utf_32_le', codecs.BOM_UTF32_LE), ('utf_32_be', codecs.BOM_UTF32_BE),
+ # The UTF-16 entries must come after the UTF-32 ones: the UTF-32 LE BOM
+ # begins with the same bytes as the UTF-16 LE BOM, so testing UTF-16
+ # first would misreport UTF-32 LE input as UTF-16 LE.
+ ('utf_16_le', codecs.BOM_UTF16_LE), ('utf_16_be', codecs.BOM_UTF16_BE)]
+
+def detectbom(s):
+    """Split a leading BOM off s.
+
+    Returns a (rest, encoding) pair. When s starts with one of the
+    byte-order marks listed in _bommap, rest is s without that mark and
+    encoding is the matching codec name. Otherwise s is returned
+    unchanged and encoding is None.
+    """
+    if s:
+        first = ord(s[0])
+        # Cheap pre-filter: only a NUL or a byte >= 0x80 is treated as a
+        # possible BOM start; any other leading byte skips the table scan.
+        if first == 0x00 or first >= 0x80:
+            for name, mark in _bommap:
+                if s.startswith(mark):
+                    return s[len(mark):], name
+
+    return s, None
+
+def bomtolocal(s):
+    """Convert a string to local encoding if BOM detected
+
+    UTF-16/32 text, and sometimes UTF-8 text, starts with a BOM
+    (byte-order-mark). This function uses detectbom() to recognize the
+    BOM, decodes the remainder with the matching codec and re-encodes it
+    to the module-level ``encoding`` with the 'replace' error handler.
+
+    Returns the converted string (without the BOM). If no BOM is found,
+    or if the bytes after a BOM-like prefix fail to decode, the original
+    string is returned untouched, leading bytes included.
+
+    Raises error.Abort if a codec lookup fails (LookupError).
+    """
+    # Keep the caller's string intact: the stripped form is only valid
+    # once decoding has actually succeeded.
+    stripped, enc = detectbom(s)
+    if not enc:
+        return s
+
+    try:
+        return stripped.decode(enc).encode(encoding, 'replace')
+    except LookupError, k:
+        raise error.Abort("%s, please check your locale settings" % k)
+    except UnicodeDecodeError:
+        # BOM-like prefix detected, but the payload is not valid in that
+        # encoding, so the prefix was ordinary data. Return the original
+        # input rather than the stripped one to avoid dropping bytes.
+        return s
More information about the Mercurial-devel
mailing list