[PATCH 1 of 2 v4] localrepo: persistent caching of branch names
Pierre-Yves David
pierre-yves.david at ens-lyon.org
Thu Oct 16 07:27:34 UTC 2014
On 10/15/2014 06:48 PM, Mads Kiilerich wrote:
> # HG changeset patch
> # User Mads Kiilerich <madski at unity3d.com>
> # Date 1413424006 -7200
> # Thu Oct 16 03:46:46 2014 +0200
> # Node ID d87009e720063a8d6d80afdbe6bb9323e2849030
> # Parent 48c0b101a9de1fdbd638daa858da845cd05a6be7
> localrepo: persistent caching of branch names
>
> It is expensive to retrieve the branch name. Very expensive when creating a
> changectx and calling .branch() - slightly less when using
> changelog.branchinfo().
>
> Now, to really speed things up, cache the results on disk. To get efficient
> lookup for revisions (constant size records) and avoid storing the same branch
> name over and ever, store the name of each branch once with a fixed ordering.
> For each repo revision, store the node hash and the index of the branch name.
> To make it 100% stable against repository mutations, always check the node hash
> before using the cache content.
>
> The code for this is kind of similar to the branchmap handling and is placed in
> the same module even though the name is not completely spot on.
>
> This new method promise to make some operations up 20 times faster once it
> actually is used.
>
> A simpler approach that didn't store and validate node hashes for every
> revision was significantly faster (x2) but could be tricked when modifying
> history. The usual worst case would be that the whole cache was invalidated
> when the repository history was modified, but when trying very hard it could be
> tricked into not noticing changes.
>
> diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
> --- a/mercurial/branchmap.py
> +++ b/mercurial/branchmap.py
> @@ -9,6 +9,7 @@ from node import bin, hex, nullid, nullr
> import encoding
> import util
> import time
> +import struct, array
>
> def _filename(repo):
> """name of a branchcache file for a given repo or repoview"""
> @@ -285,3 +286,97 @@ class branchcache(dict):
> duration = time.time() - starttime
> repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
> repo.filtername, duration)
> +
> +class revbranchcache(object):
> + """Persistent cache mapping from revision number to branch name.
> + Consistency is guaranteed by verifying the node hash."""
> +
> + filename = 'cache/branchnames'
> + formatversion = 2345164374
> + headerfmt = '>LLL' # file header: version, records start, records length
> + recfmt = '>20sH' # a record: node hash, branch name reference
> + headersize = struct.calcsize(headerfmt)
> + recsize = struct.calcsize(recfmt)
We could use a textual description of the format I've trouble
identifying how branch name are first introduced.
Your formatversion value looks temporary
> +
> + def __init__(self, repo):
> + self._repo = repo
> + self._loaded = False
> + self._dirty = False
> + self._names = [] # branch names referenced from recfmt records
> + self._records = array.array('c') # bytes with structs of type recfmt
> +
> + def _load(self):
> + """Load cached branch names."""
> + try:
> + data = self._repo.vfs.open(self.filename).read()
> + except IOError:
> + data = ''
missing the usual.
if err.errno != errno.ENOENT:
raise
> + self._dirty = True
Not sure why we default to dirty as we just read the data from disk?
> + reporecslen = len(self._repo) * self.recsize
> + if len(data) >= self.headersize:
> + # header
> + v, recsstart, recslen = struct.unpack_from(self.headerfmt, data)
> + if v == self.formatversion and len(data) == recsstart + recslen:
> + # between header and records: \0 separated branch names
> + if recsstart != self.headersize:
> + self._names = \
> + data[self.headersize:recsstart].split('\0')
Please use an intermediate variable instead of ugly \
continuation.
Now I know how you handle branch name (but should stil be documented. I
see this as a minor issue that the cache has to be wiped for every new
branch.
> + # read records, cap at repo size
> + self._records.fromstring(
> + buffer(data, recsstart, min(recslen, reporecslen)))
wrong identation, should be aligned with the opening parents. You could
use and intermediate variable.
> + # only dirty if too many records (after strip)
> + self._dirty = recslen > reporecslen
> + else:
> + self._repo.ui.debug('branch cache file was invalid\n')
Consider reversing the logic and putting the short branch of the if/else
at start. This having having to refetch the context N line higher when
meeting this else.
> + # pad to repo size
> + if len(self._records) < reporecslen:
> + self._records.extend(
> + '\xff' * (reporecslen - len(self._records)))
> +
> + self._branchnamesindex = dict((b, r)
> + for r, b in enumerate(self._names))
> + self._node = self._repo.changelog.node
> + self._branchinfo = self._repo.changelog.branchinfo
> + self._loaded = True
> +
> + def branch(self, rev):
> + """Return branch name of rev, using and updating persistent cache."""
> + if not self._loaded:
> + self._load()
> +
> + node = self._node(rev)
> + cachenode, branchidx = struct.unpack_from(self.recfmt, self._records,
> + rev * self.recsize)
The runtime packaing and depacking make me shivers a bit but it seems
fairly reasonable.
> + if cachenode == node and branchidx < len(self._names):
> + return self._names[branchidx]
> + b, _close = self._branchinfo(rev)
> + if b in self._branchnamesindex:
> + branchidx = self._branchnamesindex[b]
> + else:
> + branchidx = len(self._names)
> + self._names.append(b)
> + self._branchnamesindex[b] = branchidx
> + struct.pack_into(self.recfmt, self._records, rev * self.recsize,
> + node, branchidx)
> + self._dirty = True
> + return b
> +
> + def save(self):
> + """Save branch cache if it is dirty."""
> + if self._dirty:
> + self._repo.ui.debug('writing branch cache file\n')
> + try:
> + f = self._repo.vfs.open(self.filename, 'w', atomictemp=True)
> + s = '\0'.join(self._names)
> + f.write(struct.pack(self.headerfmt, self.formatversion,
> + self.headersize + len(s),
> + len(self._records)))
> + f.write(s)
> + f.write(self._records)
> + f.close()
> + except IOError:
Same here need to propage some of them
> + pass
> + self._dirty = False
> +
> diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
> --- a/mercurial/localrepo.py
> +++ b/mercurial/localrepo.py
> @@ -297,8 +297,11 @@ class localrepository(object):
> # - bookmark changes
> self.filteredrevcache = {}
>
> + self.revbranchcache = branchmap.revbranchcache(self)
> +
> def close(self):
> - pass
> + if self.revbranchcache:
> + self.revbranchcache.save()
Should we do that sooner instead? and check if the data we are about to
write are still valid?
>
> def _restrictcapabilities(self, caps):
> # bundle2 is not ready for prime time, drop it unless explicitly
> diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
> --- a/mercurial/statichttprepo.py
> +++ b/mercurial/statichttprepo.py
> @@ -141,6 +141,7 @@ class statichttprepository(localrepo.loc
> self._branchcaches = {}
> self.encodepats = None
> self.decodepats = None
> + self.revbranchcache = None
>
> def _restrictcapabilities(self, caps):
> caps = super(statichttprepository, self)._restrictcapabilities(caps)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
--
Pierre-Yves David
More information about the Mercurial-devel
mailing list