[PATCH v2] lazymanifest: write a more efficient, pypy friendly version of lazymanifest
Augie Fackler
raf at durin42.com
Tue Sep 20 14:39:05 UTC 2016
On Sat, Sep 17, 2016 at 08:23:36PM +0200, Maciej Fijalkowski wrote:
> This should fix the issues presented.
>
> There is one problem which is that the hash in test-rebase-detach
> changes. The way I see it is just an RNG phase-shift, but it might be
> a real bug. Some actual input will be appreciated.
It's not possibly RNG related - there's no RNG that goes into the sha1
of a node. I've added --debug to the failure in test-rebase-detach and
compared them between cpython and pypy:
cpython:
@@ -366,11 +366,24 @@
$ hg log --rev tip --debug
- changeset: 8:9472f4b1d736
+ changeset: 8:9472f4b1d7366902d0098aa1401e3f170000cc56
tag: tip
+ phase: draft
+ parent: 7:02de42196ebee42ef284b6780a87cdc96e8eaab6
+ parent: -1:0000000000000000000000000000000000000000
+ manifest: 8:8dbdcf066a97239fde2d0dba19fcb1e699fa66d2
user: test
date: Thu Jan 01 00:00:00 1970 +0000
- summary: Collapsed revision
+ files: F
+ files+: E
+ extra: branch=default
+ extra: rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
+ description:
+ Collapsed revision
+ * I
+ * Merge
+ * J
+
pypy:
@@ -366,11 +366,24 @@
$ hg log --rev tip --debug
- changeset: 8:9472f4b1d736
+ changeset: 8:122ceff3b303b13e5ca323742a8ebe589b9f8066
tag: tip
+ phase: draft
+ parent: 7:02de42196ebee42ef284b6780a87cdc96e8eaab6
+ parent: -1:0000000000000000000000000000000000000000
+ manifest: 8:09f1dc369d2667dcaeb9cf828b0fcb76bda29f33
user: test
date: Thu Jan 01 00:00:00 1970 +0000
- summary: Collapsed revision
+ files: F
+ files+: E
+ extra: branch=default
+ extra: rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
+ description:
+ Collapsed revision
+ * I
+ * Merge
+ * J
+
Which means the manifest is hashing out differently. That suggests
there's a subtle bug in how the manifest is hashing out in the pypy
version. Adding a `hg debugdata -m 8` (which prints raw manifest
data), and this is what I get on cpython:
$ hg debugdata -m 8
+ A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
+ E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
+ F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
+ H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
on pypy:
$ hg debugdata -m 8
+ A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
+ F\x0022bfcfd62a21a3287edbd4d656218d0f525ed76a (esc)
+ E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
+ F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
+ H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
So you've got a subtle bug here where the compaction of the
lazymanifest isn't working right - probably around merging manifests
somehow. Does this give you enough to get started?
(I'd probably start debugging this by littering _lazymanifest._compact
and _lazymanifest.text with asserts until I managed to trip over the
un-sorted lines, and then backtrack from there to figure out how they
managed to survive.)
>
>
>
> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall at gmail.com> wrote:
> > # HG changeset patch
> > # User Maciej Fijalkowski <fijall at gmail.com>
> > # Date 1473680234 -7200
> > # Mon Sep 12 13:37:14 2016 +0200
> > # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
> > # Parent df05c43bd1e64f1620d0b2e502f4603c1e5a8341
> > lazymanifest: write a more efficient, pypy friendly version of lazymanifest
> >
> > diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> > --- a/mercurial/manifest.py
> > +++ b/mercurial/manifest.py
> > @@ -104,69 +104,297 @@
> > _checkforbidden(files)
> > return ''.join(lines)
> >
> > -class _lazymanifest(dict):
> > - """This is the pure implementation of lazymanifest.
> > -
> > - It has not been optimized *at all* and is not lazy.
> > - """
> > -
> > - def __init__(self, data):
> > - dict.__init__(self)
> > - for f, n, fl in _parse(data):
> > - self[f] = n, fl
> > -
> > - def __setitem__(self, k, v):
> > - node, flag = v
> > - assert node is not None
> > - if len(node) > 21:
> > - node = node[:21] # match c implementation behavior
> > - dict.__setitem__(self, k, (node, flag))
> > +class lazymanifestiter(object):
> > + def __init__(self, lm):
> > + self.pos = 0
> > + self.lm = lm
> >
> > def __iter__(self):
> > - return iter(sorted(dict.keys(self)))
> > + return self
> >
> > - def iterkeys(self):
> > - return iter(sorted(dict.keys(self)))
> > + def next(self):
> > + try:
> > + data, pos = self.lm._get(self.pos)
> > + except IndexError:
> > + raise StopIteration
> > + if pos == -1:
> > + self.pos += 1
> > + return data[0]
> > + self.pos += 1
> > + zeropos = data.find('\x00', pos)
> > + return data[pos:zeropos]
> >
> > - def iterentries(self):
> > - return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
> > +class lazymanifestiterentries(object):
> > + def __init__(self, lm):
> > + self.lm = lm
> > + self.pos = 0
> > +
> > + def __iter__(self):
> > + return self
> > +
> > + def next(self):
> > + try:
> > + data, pos = self.lm._get(self.pos)
> > + except IndexError:
> > + raise StopIteration
> > + if pos == -1:
> > + self.pos += 1
> > + return data
> > + zeropos = data.find('\x00', pos)
> > + hashval = unhexlify(data, self.lm.extrainfo[self.pos],
> > + zeropos + 1, 40)
> > + flags = self.lm._getflags(data, self.pos, zeropos)
> > + self.pos += 1
> > + return (data[pos:zeropos], hashval, flags)
> > +
> > +def unhexlify(data, extra, pos, length):
> > + s = data[pos:pos + length].decode('hex')
> > + if extra:
> > + s += chr(extra & 0xff)
> > + return s
> > +
> > +def _cmp(a, b):
> > + return (a > b) - (a < b)
> > +
> > +class _lazymanifest(object):
> > + def __init__(self, data, positions=None, extrainfo=None, extradata=None):
> > + if positions is None:
> > + self.positions = self.findlines(data)
> > + self.extrainfo = [0] * len(self.positions)
> > + self.data = data
> > + self.extradata = []
> > + else:
> > + self.positions = positions[:]
> > + self.extrainfo = extrainfo[:]
> > + self.extradata = extradata[:]
> > + self.data = data
> > +
> > + def findlines(self, data):
> > + if not data:
> > + return []
> > + pos = data.find("\n")
> > + positions = [0]
> > + while pos < len(data) - 1 and pos != -1:
> > + positions.append(pos + 1)
> > + pos = data.find("\n", pos + 1)
> > + return positions
> > +
> > + def _get(self, index):
> > + # get the position encoded in pos:
> > + # positive number is an index in 'data'
> > + # negative number is in extrapieces
> > + pos = self.positions[index]
> > + if pos >= 0:
> > + return self.data, pos
> > + return self.extradata[-pos - 1], -1
> > +
> > + def _getkey(self, pos):
> > + if pos >= 0:
> > + return self.data[pos:self.data.find('\x00', pos + 1)]
> > + return self.extradata[-pos - 1][0]
> > +
> > + def bsearch(self, key):
> > + first = 0
> > + last = len(self.positions) - 1
> > +
> > + while first <= last:
> > + midpoint = (first + last)//2
> > + nextpos = self.positions[midpoint]
> > + candidate = self._getkey(nextpos)
> > + r = _cmp(key, candidate)
> > + if r == 0:
> > + return midpoint
> > + else:
> > + if r < 0:
> > + last = midpoint - 1
> > + else:
> > + first = midpoint + 1
> > + return -1
> > +
> > + def bsearch2(self, key):
> > + # same as the above, but will always return the position
> > + # done for performance reasons
> > + first = 0
> > + last = len(self.positions) - 1
> > +
> > + while first <= last:
> > + midpoint = (first + last)//2
> > + nextpos = self.positions[midpoint]
> > + candidate = self._getkey(nextpos)
> > + r = _cmp(key, candidate)
> > + if r == 0:
> > + return (midpoint, True)
> > + else:
> > + if r < 0:
> > + last = midpoint - 1
> > + else:
> > + first = midpoint + 1
> > + return (first, False)
> > +
> > + def __contains__(self, key):
> > + return self.bsearch(key) != -1
> > +
> > + def _getflags(self, data, needle, pos):
> > + start = pos + 41
> > + end = data.find("\n", start)
> > + if end == -1:
> > + end = len(data) - 1
> > + if start == end:
> > + return ''
> > + return self.data[start:end]
> > +
> > + def __getitem__(self, key):
> > + if not isinstance(key, str):
> > + raise TypeError("getitem: manifest keys must be a string.")
> > + needle = self.bsearch(key)
> > + if needle == -1:
> > + raise KeyError
> > + data, pos = self._get(needle)
> > + if pos == -1:
> > + return (data[1], data[2])
> > + zeropos = data.find('\x00', pos)
> > + assert 0 <= needle <= len(self.positions)
> > + assert len(self.extrainfo) == len(self.positions)
> > + hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
> > + flags = self._getflags(data, needle, zeropos)
> > + return (hashval, flags)
> > +
> > + def __delitem__(self, key):
> > + needle, found = self.bsearch2(key)
> > + if not found:
> > + raise KeyError
> > + cur = self.positions[needle]
> > + self.positions = self.positions[:needle] + self.positions[needle + 1:]
> > + self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
> > + if cur >= 0:
> > + self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
> > +
> > + def __setitem__(self, key, value):
> > + if not isinstance(key, str):
> > + raise TypeError("setitem: manifest keys must be a string.")
> > + if not isinstance(value, tuple) or len(value) != 2:
> > + raise TypeError("Manifest values must be a tuple of (node, flags).")
> > + hashval = value[0]
> > + if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
> > + raise TypeError("node must be a 20-byte string")
> > + flags = value[1]
> > + if not isinstance(flags, str) or len(flags) > 1:
> > + raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
> > + needle, found = self.bsearch2(key)
> > + if found:
> > + # put the item
> > + pos = self.positions[needle]
> > + if pos < 0:
> > + self.extradata[-pos - 1] = (key, value[0], value[1])
> > + else:
> > + # just don't bother
> > + self.extradata.append((key, value[0], value[1]))
> > + self.positions[needle] = -len(self.extradata)
> > + else:
> > + # not found, put it in with extra positions
> > + self.extradata.append((key, value[0], value[1]))
> > + self.positions = (self.positions[:needle] + [-len(self.extradata)]
> > + + self.positions[needle:])
> > + self.extrainfo = (self.extrainfo[:needle] + [0] +
> > + self.extrainfo[needle:])
> >
> > def copy(self):
> > - c = _lazymanifest('')
> > - c.update(self)
> > - return c
> > + # XXX call _compact like in C?
> > + return _lazymanifest(self.data, self.positions, self.extrainfo,
> > + self.extradata)
> > +
> > + def _compact(self):
> > + # hopefully not called TOO often
> > + def findnextposition(positions, start, lgt):
> > + i = start
> > + while i < len(positions):
> > + if positions[i] >= 0:
> > + return positions[i]
> > + i += 1
> > + return lgt
> > +
> > + if len(self.extradata) == 0:
> > + return
> > + l = []
> > + last_cut = 0
> > + i = 0
> > + offset = 0
> > + self.extrainfo = [0] * len(self.positions)
> > + while i < len(self.positions):
> > + if self.positions[i] >= 0:
> > + cur = self.positions[i]
> > + last_cut = cur
> > + while True:
> > + self.positions[i] = offset
> > + i += 1
> > + if i == len(self.positions) or self.positions[i] < 0:
> > + break
> > + offset += self.positions[i] - cur
> > + cur = self.positions[i]
> > + end_cut = findnextposition(self.positions, i, len(self.data))
> > + offset += end_cut - cur
> > + l.append(self.data[last_cut:end_cut])
> > + else:
> > + while i < len(self.positions) and self.positions[i] < 0:
> > + cur = self.positions[i]
> > + t = self.extradata[-cur - 1]
> > + l.append(self._pack(t))
> > + self.positions[i] = offset
> > + if len(t[1]) > 20:
> > + self.extrainfo[i] = ord(t[1][21])
> > + offset += len(l[-1])
> > + i += 1
> > + self.data = ''.join(l)
> > + self.extradata = []
> > +
> > + def _pack(self, d):
> > + return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
> > +
> > + def text(self):
> > + self._compact()
> > + return self.data
> >
> > def diff(self, m2, clean=False):
> > '''Finds changes between the current manifest and m2.'''
> > + # XXX think whether efficiency matters here
> > diff = {}
> >
> > - for fn, e1 in self.iteritems():
> > + for fn, e1, flags in self.iterentries():
> > if fn not in m2:
> > - diff[fn] = e1, (None, '')
> > + diff[fn] = (e1, flags), (None, '')
> > else:
> > e2 = m2[fn]
> > - if e1 != e2:
> > - diff[fn] = e1, e2
> > + if (e1, flags) != e2:
> > + diff[fn] = (e1, flags), e2
> > elif clean:
> > diff[fn] = None
> >
> > - for fn, e2 in m2.iteritems():
> > + for fn, e2, flags in m2.iterentries():
> > if fn not in self:
> > - diff[fn] = (None, ''), e2
> > + diff[fn] = (None, ''), (e2, flags)
> >
> > return diff
> >
> > + def iterentries(self):
> > + return lazymanifestiterentries(self)
> > +
> > + def iterkeys(self):
> > + return lazymanifestiter(self)
> > +
> > + def __iter__(self):
> > + return lazymanifestiter(self)
> > +
> > + def __len__(self):
> > + return len(self.positions)
> > +
> > def filtercopy(self, filterfn):
> > + # XXX should be optimized
> > c = _lazymanifest('')
> > for f, n, fl in self.iterentries():
> > if filterfn(f):
> > c[f] = n, fl
> > return c
> >
> > - def text(self):
> > - """Get the full data of this manifest as a bytestring."""
> > - return _textv1(self.iterentries())
> > -
> > try:
> > _lazymanifest = parsers.lazymanifest
> > except AttributeError:
> > diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
> > --- a/mercurial/pure/bdiff.py
> > +++ b/mercurial/pure/bdiff.py
> > @@ -111,8 +111,8 @@
> > def blocks(sa, sb):
> > a = ffi.new("struct bdiff_line**")
> > b = ffi.new("struct bdiff_line**")
> > - ac = ffi.new("char[]", sa)
> > - bc = ffi.new("char[]", sb)
> > + ac = ffi.new("char[]", str(sa))
> > + bc = ffi.new("char[]", str(sb))
> > l = ffi.new("struct bdiff_hunk*")
> > try:
> > an = lib.bdiff_splitlines(ac, len(sa), a)
> > @@ -138,8 +138,8 @@
> > def bdiff(sa, sb):
> > a = ffi.new("struct bdiff_line**")
> > b = ffi.new("struct bdiff_line**")
> > - ac = ffi.new("char[]", sa)
> > - bc = ffi.new("char[]", sb)
> > + ac = ffi.new("char[]", str(sa))
> > + bc = ffi.new("char[]", str(sb))
> > l = ffi.new("struct bdiff_hunk*")
> > try:
> > an = lib.bdiff_splitlines(ac, len(sa), a)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
More information about the Mercurial-devel
mailing list