[PATCH 1 of 5 v6] branchcache: introduce persistent caching of branch names
Mads Kiilerich
mads at kiilerich.com
Sat Oct 18 18:43:33 UTC 2014
# HG changeset patch
# User Mads Kiilerich <madski at unity3d.com>
# Date 1413657692 -7200
# Sat Oct 18 20:41:32 2014 +0200
# Node ID a504712dc5ac07e5ddf550f47818267f783f85de
# Parent f484be02bd351fbd084d93f5ba4180e2b22cd4eb
branchcache: introduce persistent caching of branch names
It is expensive to retrieve the branch name of a changeset. Very expensive when
creating a changectx and calling .branch() - slightly less when using
changelog.branchinfo().
Now, to really speed things up, cache the results on disk. To get efficient
lookup for revisions (constant size records) and avoid storing the same branch
name over and ever, store the name of each branch once with a fixed ordering.
For each repo revision, store the node hash and the index of the branch name.
To make it 100% stable against repository mutations, always check the node hash
before using the cache content.
The code for this is kind of similar to the branchmap handling and is placed in
the same module.
The cache is owned by repo instances and will be saved when they are closed.
This new method promise to make some operations up 20 times faster once it
actually is used.
diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -9,6 +9,7 @@ from node import bin, hex, nullid, nullr
import encoding
import util
import time
+import struct, array
def _filename(repo):
"""name of a branchcache file for a given repo or repoview"""
@@ -285,3 +286,125 @@ class branchcache(dict):
duration = time.time() - starttime
repo.ui.log('branchcache', 'updated %s branch cache in %.4f seconds\n',
repo.filtername, duration)
+
+# Revision branch name cache
+
+branchcachename = 'cache/branchnames'
+bcheaderfmt = '>LL' # file header: records start, records length
+bcrecfmt = '>20sH' # a record: node hash, branch name reference
+bcheadersize = struct.calcsize(bcheaderfmt)
+bcrecsize = struct.calcsize(bcrecfmt)
+bcclosebitmask = 1 << (8 * struct.calcsize('>H') - 1)
+bcbranchidxmask = bcclosebitmask - 1
+
+class revbranchcache(object):
+ """Persistent cache mapping from revision number to branch name.
+ Consistency is guaranteed by verifying the node hash.
+
+ The file format is binary, in one file to ensure consistency:
+ First a header (bcheaderfmt) with start and length of the records.
+ After header and until records starts: UTF-8 branch names separated by 0.
+ Branch names are added to thelist on demand, with fixed indices once added.
+ Records run until end of file and is records (bcrecfmt) with the hash of
+ the corresponding node, the index of the corresponding branch name, and a
+ flag for closed.
+ """
+
+ def __init__(self, repo):
+ self._repo = repo
+ self._loaded = False
+ self._dirty = False
+ self._namesutf8 = [] # utf8 branch names referenced from the records
+ self._nameslocal = [] # local encoding of branch names, on demand
+ self._records = array.array('c') # bytes with structs of type bcrecfmt
+
+ def load(self):
+ """Load cached branch names."""
+ reporecslen = len(self._repo) * bcrecsize
+ if self._loaded and reporecslen == len(self._records):
+ return
+
+ try:
+ data = self._repo.vfs.open(branchcachename).read()
+ except (IOError, OSError, util.Abort):
+ data = ''
+
+ if len(data) >= bcheadersize:
+ # header
+ recsstart, recslen = struct.unpack_from(bcheaderfmt, data)
+ if len(data) == recsstart + recslen:
+ # between header and records: \0 separated branch names
+ if recsstart != bcheadersize:
+ self._namesutf8 = \
+ data[bcheadersize:recsstart].split('\0')
+ # read records, cap at repo size
+ self._records.fromstring(
+ buffer(data, recsstart, min(recslen, reporecslen)))
+
+ # pad to repo size
+ if len(self._records) < reporecslen:
+ self._records.extend(
+ '\xff' * (reporecslen - len(self._records)))
+
+ self._nameslocal = [None] * len(self._namesutf8)
+ self._branchnamesindex = dict((b, r)
+ for r, b in enumerate(self._namesutf8))
+ self._loaded = True
+
+ def branchinfo(self, rev):
+ """Return branch name and close flag for rev, using and updating
+ persistent cache. The cache must be loaded first."""
+ node = self._repo.changelog.node(rev)
+ cachenode, branchidx = struct.unpack_from(bcrecfmt, self._records,
+ rev * bcrecsize)
+ close = branchidx & bcclosebitmask
+ branchidx &= bcbranchidxmask
+ if cachenode == node and branchidx < len(self._nameslocal):
+ b = self._nameslocal[branchidx]
+ if b:
+ return b, close
+ b = encoding.tolocal(self._namesutf8[branchidx])
+ self._nameslocal[branchidx] = b
+ return b, close
+
+ b, close = self._repo.changelog.branchinfo(rev)
+ butf8 = encoding.fromlocal(b)
+ if butf8 in self._branchnamesindex:
+ branchidx = self._branchnamesindex[butf8]
+ else:
+ branchidx = len(self._namesutf8)
+ if branchidx >= bcbranchidxmask:
+ # fall-back to no caching if cache full
+ return b, close
+ self._namesutf8.append(butf8)
+ self._nameslocal.append(b)
+ self._branchnamesindex[butf8] = branchidx
+ if close:
+ branchidx |= bcclosebitmask
+ struct.pack_into(bcrecfmt, self._records, rev * bcrecsize,
+ node, branchidx)
+ self._dirty = True
+ return b, close
+
+ def branch(self, rev):
+ """Return branch name of rev, using and updating persistent cache.
+ The cache will be loaded on demand."""
+ if not self._loaded:
+ self.load()
+
+ return self.branchinfo(rev)[0]
+
+ def save(self):
+ """Save branch cache if it is dirty."""
+ if self._dirty:
+ try:
+ f = self._repo.vfs.open(branchcachename, 'w', atomictemp=True)
+ s = '\0'.join(self._namesutf8)
+ f.write(struct.pack(bcheaderfmt, bcheadersize + len(s),
+ len(self._records)))
+ f.write(s)
+ f.write(self._records)
+ f.close()
+ except (IOError, OSError, util.Abort):
+ pass
+ self._dirty = False
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -297,8 +297,10 @@ class localrepository(object):
# - bookmark changes
self.filteredrevcache = {}
+ self.revbranchcache = branchmap.revbranchcache(self)
+
def close(self):
- pass
+ self.revbranchcache.save()
def _restrictcapabilities(self, caps):
# bundle2 is not ready for prime time, drop it unless explicitly
diff --git a/mercurial/statichttprepo.py b/mercurial/statichttprepo.py
--- a/mercurial/statichttprepo.py
+++ b/mercurial/statichttprepo.py
@@ -90,6 +90,12 @@ class statichttppeer(localrepo.localpeer
def canpush(self):
return False
+class _fakerevbranchcache(object):
+ def load(self):
+ pass
+ def save(self):
+ pass
+
class statichttprepository(localrepo.localrepository):
supported = localrepo.localrepository._basesupported
@@ -141,6 +147,7 @@ class statichttprepository(localrepo.loc
self._branchcaches = {}
self.encodepats = None
self.decodepats = None
+ self.revbranchcache = _fakerevbranchcache(self)
def _restrictcapabilities(self, caps):
caps = super(statichttprepository, self)._restrictcapabilities(caps)
More information about the Mercurial-devel
mailing list