[PATCH 2 of 2] hgext: add a new packed repository extension, packrepo

Bryan O'Sullivan bos at serpentine.com
Wed Jun 27 17:37:57 UTC 2012


# HG changeset patch
# User Bryan O'Sullivan <bryano at fb.com>
# Date 1340818576 25200
# Node ID 989483d028d1881eb775815c9d66361f9bed4d06
# Parent  080b8d275cfe8f355fbe778a175a1c4cf1e08083
hgext: add a new packed repository extension, packrepo

diff --git a/hgext/packrepo.py b/hgext/packrepo.py
new file mode 100644
--- /dev/null
+++ b/hgext/packrepo.py
@@ -0,0 +1,404 @@
+# packrepo.py - packed repository support for mercurial
+#
+# Copyright 2012 Facebook
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+'''pack small metadata files to reduce disk seeks
+
+Mercurial's revlog storage format uses a large number of files, many
+of which are small.  In a repository that contains many thousands of
+revlog files, the disk seeks associated with reading or writing these
+files can be expensive.
+
+This extension is aimed at a narrow use case: improving the performance
+of "hg clone --uncompressed" for a repository that contains many
+revlogs over a fast LAN or WAN.
+
+In these cases, the performance of "hg clone --uncompressed" can be
+limited by the seek rate of the client's disk.  Use of this extension
+can improve performance by a factor of 10.  (Please be sure to measure
+performance in your own environment before you decide to use this
+extension.)
+
+A repository created using "hg clone" without "--uncompressed" will
+not be packed (too messy to get right, and seeks are less of a
+factor); neither will a repository created from scratch.
+
+When a repository is first cloned with "--uncompressed", this
+extension redirects writes of small revlogs to a single immutable pack
+file. Large revlog files do not benefit from being packed, and are
+written normally. Afterwards, metadata reads are transparently
+serviced from either the pack file or a normal revlog file.  When
+Mercurial needs to write to a packed revlog, it is first copied out to
+a normal revlog file.
+
+To find out whether a repository is packed (and if so, how much is
+still packed), run "hg packinfo".
+
+To repack all revlog files in an already packed repository, use "hg
+repack".  An existing repository can be converted to packed form using
+"hg repack --convert".
+'''
+
+from mercurial import error, extensions, localrepo, scmutil, store, util
+from mercurial import cmdutil
+from mercurial.i18n import _
+import errno, os, struct
+
@extensions.replaceclass(localrepo, 'localrepository')
class packrepo(localrepo.localrepository):
    """localrepository subclass that serves small revlogs from a pack file.

    When the 'packrepo' requirement is present, the store's opener is
    replaced with a packopener, so reads of packed revlogs are redirected
    to 00pack.dat/00pack.idx and writes trigger copy-out.
    """

    def __init__(self, baseui, path=None, create=False):
        super(packrepo, self).__init__(baseui, path, create)
        # Only repositories explicitly marked with the 'packrepo'
        # requirement get the pack-aware store.
        if 'packrepo' in self.requirements:
            self._replacestore()

    def _replacestore(self, newopener=None):
        # Rebuild self.store with an opener factory that installs a
        # packopener: either a freshly loaded one (index read from disk)
        # or the caller-supplied `newopener` (e.g. after a repack).
        def opener(path, audit=True):
            if newopener is None:
                op = packopener(path, audit)
                op.readindex()
            else:
                op = newopener
            self._packopener = op
            return op
        mode = self.store.createmode
        self.store = store.store(self.requirements, self.sharedpath, opener)
        self.store._packopener = self._packopener
        self.sopener = self.store.opener
        # Preserve the previous store's file-creation mode.
        self.sopener.createmode = self.store.createmode = mode

    def _makepacked(self):
        """Convert this repository to packed form (idempotent): install
        the pack-aware store and record the 'packrepo' requirement."""
        if 'packrepo' in self.requirements:
            return
        self._replacestore()
        reqs = set(self.requirements)
        reqs.add('packrepo')
        self._applyrequirements(reqs)
        self._writerequirements()

    def stream_in(self, remote, requirements):
        """Stream clone ("clone --uncompressed"), packing small revlogs
        as they arrive.  Controlled by format.pack (default True)."""
        pack = self.ui.configbool('format', 'pack', True)
        if pack:
            self._makepacked()
            self._packopener.packstart()
        try:
            return super(packrepo, self).stream_in(remote, requirements)
        finally:
            # Always finalize the pack, even if the stream failed.
            if pack:
                self._packopener.packend()

    def packiter(self, label=None):
        """Yield (name, encoded name, pack offset or None, size) for
        every data file in the store.

        If `label` is given, progress is reported under that label.
        A pack offset of None means the file is not packed.
        """
        df = list(self.store.datafiles())
        total = len(df)
        progress = self.ui.progress
        try:
            lookup = self._packopener._index.get
        except AttributeError:
            # No pack opener installed: nothing is packed.
            lookup = {}.get
        for i, (f, f2, size) in enumerate(df):
            if label:
                progress(label, i, total=total)
            sl = lookup(f2)
            if sl:
                start = sl[0]
            else:
                start = None
            yield f, f2, start, size

    def repack(self):
        """Rewrite the pack: pack every small revlog, then unlink the
        loose copies of the files that were packed."""
        if self.revs('draft()'):
            # NOTE(review): the draft-commit guard is disabled; repacking
            # with drafts present is currently allowed -- confirm intent.
            pass # raise util.Abort(_('cannot repack with draft commits present'))
        df = list(self.store.datafiles())
        total = len(df)
        read = self._packopener.read
        opener = packopener(self.store.path)
        unlink = []  # loose revlog files to remove once packed
        try:
            opener.packstart()
            packed = 0
            unpacked = 0
            for (f, f2, start, size) in self.packiter(_('repacking')):
                if size < opener._maxpacksize:
                    opener.write(f2, read(f2))
                    packed += size
                    unlink.append(self.sjoin(f))
                else:
                    unpacked += size
        except:
            # Discard the partially written pack before re-raising.
            opener.packabort()
            raise
        opener.packend()
        self._replacestore(opener)
        unlinked = 0
        progress = self.ui.progress
        unlinking = _('unlinking')
        total = len(unlink)
        for i, f in enumerate(unlink):
            progress(unlinking, i, total=total)
            try:
                os.unlink(f)
                unlinked += 1
            except OSError, err:
                # A file may already be gone; ignore only ENOENT.
                if err.errno != errno.ENOENT:
                    raise
        self.ui.status(_('%s packed, %s untouched, %d unlinked\n') %
                       (util.bytecount(packed), util.bytecount(unpacked),
                        unlinked))
+
# Advertise the 'packrepo' requirement so packed repositories can be
# opened, and so the requirement is propagated to the store opener.
cls = localrepo.localrepository
for reqs in 'supported openerreqs'.split():
    getattr(cls, reqs).add('packrepo')
+
class writefile(object):
    """File-like object that buffers every write in memory and hands
    the accumulated chunks to a callback when closed."""

    def __init__(self, path, closed):
        self.path = path
        self.written = []
        self._closed = closed

    def write(self, data):
        # Buffer the chunk; nothing touches disk until close().
        self.written.append(data)

    def close(self):
        # Deliver the path and the buffered chunks to the callback.
        self._closed(self.path, self.written)
+
class readfile(object):
    """Read-only file-like object exposing a byte range of a pack file.

    Presents the slice [start, start + len) of the shared underlying
    file object `fp` as an independent, seekable file with its own
    position.
    """

    def __init__(self, fp, path, start, len):
        self._fp = fp
        self.path = path
        self._start = start  # absolute offset of the slice within fp
        self._len = len      # length of the slice
        self._pos = 0        # current position, relative to the slice

    def read(self, count=None):
        """Read and return up to `count` bytes, advancing the position.

        A `count` of None (or any negative count, per file-object
        convention) reads all remaining bytes.  Reads never extend past
        the end of the slice, even though the underlying file continues.
        """
        self._fp.seek(self._start + self._pos)
        remaining = self._len - self._pos
        if count is None or count < 0:
            count = remaining
        data = self._fp.read(min(count, remaining))
        # Track our own position: the underlying fp is shared between
        # readfile instances, so its offset cannot be trusted between
        # calls, and callers rely on tell()/sequential reads advancing.
        self._pos += len(data)
        return data

    def seek(self, offset, whence=0):
        """Seek within the slice.

        `whence` follows the os module convention (0 = start,
        1 = current position, 2 = end).  The resulting position is
        clamped to the slice boundaries.
        """
        if whence == 1:
            offset += self._pos
        elif whence == 2:
            offset += self._len
        npos = max(0, min(offset, self._len))
        self._fp.seek(self._start + npos)
        self._pos = npos

    def tell(self):
        """Return the current position relative to the slice start."""
        return self._pos

    def close(self):
        """No-op: the shared pack file handle stays open for reuse."""
        pass
+
+_copylist = ['store/00pack.dat', 'store/00pack.idx']
+
@extensions.replaceclass(store, 'basicstore')
class packbasicstore(store.basicstore):
    """basicstore variant that knows about the pack files."""

    def copylist(self):
        # The pack data and index must travel with the store on clone.
        base = super(packbasicstore, self).copylist()
        return _copylist + base

    def _walk(self, relpath, recurse):
        entries = super(packbasicstore, self)._walk(relpath, recurse)
        try:
            index = self._packopener._index
        except AttributeError:
            # No pack opener attached: only loose files exist.
            return entries
        packed = [(store.decodedir(name), name, info[1])
                  for name, info in index.iteritems()]
        if packed:
            entries.extend(packed)
            entries.sort()
        return entries
+
@extensions.replaceclass(store, 'encodedstore')
class packencodedstore(store.encodedstore):
    """encodedstore variant that includes the pack files in clones."""

    def copylist(self):
        base = super(packencodedstore, self).copylist()
        return _copylist + base
+
@extensions.replaceclass(store, 'fncachestore')
class packfncachestore(store.fncachestore):
    """fncachestore variant aware of packed revlogs."""

    def copylist(self):
        base = super(packfncachestore, self).copylist()
        return _copylist + base

    def getsize(self, path):
        # A packed file has no inode of its own; its size lives in the
        # pack index as the second element of (start, length).
        try:
            entry = self.opener.opener._index[path]
        except (AttributeError, KeyError):
            return super(packfncachestore, self).getsize(path)
        return entry[1]
+
class packopener(scmutil.abstractopener):
    """Opener that stores small files inside a single pack file.

    On-disk format (both files live in .hg/store):
      00pack.dat -- 4-byte magic 'HGPD' followed by raw file contents,
        one after another.
      00pack.idx -- 4-byte magic 'HGPX' and a 16-bit big-endian flags
        word (must be zero), followed by records of the form >HIIH:
        (start offset high 16 bits, start offset low 32 bits, length,
        path length), each immediately followed by the path bytes.
        A record whose start offset is zero marks the deletion of an
        earlier entry (the file was copied out of the pack).
    """

    # Files of this size or larger are never packed.
    _maxpacksize = 128 * 1024
    # Index record layout; see the class docstring.
    _idxformat = '>HIIH'
    # Index header: a single 16-bit flags word after the magic.
    _hdrformat = '>H'

    def __init__(self, path, audit=True):
        self.opener = scmutil.opener(path, audit)
        self._packing = False  # True between packstart() and packend()
        self._dfp = None       # open handle on 00pack.dat, if any
        self._ifp = None       # open handle on 00pack.idx, if any
        self._index = {}       # path -> (start offset, length)

    def _getcreatemode(self):
        return self.opener.createmode

    def _setcreatemode(self, m):
        self.opener.createmode = m

    # Delegate createmode to the underlying opener.
    createmode = property(_getcreatemode, _setcreatemode)

    def readindex(self):
        """(Re)load the in-memory index from 00pack.idx, if present."""
        index = self._index = {}
        data = self.opener.tryread('00pack.idx')
        end = len(data)
        if end < 6:
            # Missing or truncated index: treat as empty.
            return
        flags, = struct.unpack_from(self._hdrformat, data, 4)
        assert data.startswith('HGPX') and flags == 0, repr(data[:6])
        n = struct.calcsize(self._idxformat)
        i = 6
        while i < end:
            hi, lo, l, lp = struct.unpack_from(self._idxformat, data, i)
            i += n
            j = i + lp
            path = data[i:j]
            # Reassemble the 48-bit start offset from its two parts.
            start = hi << 32 | lo
            if start:
                index[path] = start, l
            else:
                # Zero offset is a deletion marker for an earlier entry.
                del index[path]
            i = j

    def packstart(self):
        """Begin writing a fresh pack.  The atomictemp files only
        replace any existing pack when they are closed in packend()."""
        if self._dfp:
            self._dfp.close()
        if self._ifp:
            self._ifp.close()
        self._dfp = self.opener('00pack.dat', 'w+b', atomictemp=True)
        self._ifp = self.opener('00pack.idx', 'w+b', atomictemp=True)
        self._dfp.write('HGPD')
        self._ifp.write('HGPX')
        flags = 0
        self._ifp.write(struct.pack(self._hdrformat, flags))
        self._packing = True

    def packend(self):
        """Finish packing: close (and thereby publish) the pack files."""
        self._packing = False
        self._dfp.close()
        self._ifp.close()
        self._dfp = None
        self._ifp = None

    def packabort(self):
        """Abandon an in-progress pack, discarding the temp files and
        the in-memory index."""
        self._dfp.discard()
        self._dfp = None
        self._ifp.discard()
        self._ifp = None
        self._index = {}

    def _writeindex(self, path, start, length):
        # Append one index record (layout in the class docstring).
        self._ifp.write(struct.pack(self._idxformat, start >> 32,
                                    start & 0xffffffff, length, len(path)))
        self._ifp.write(path)

    def _written(self, path, data):
        """Callback from writefile.close: store the buffered chunks,
        either in the pack (small files) or as a normal file."""
        l = sum(map(len, data))
        if not l:
            # Nothing was written; record nothing.
            return
        if l < self._maxpacksize:
            start = self._dfp.tell()
            for d in data:
                self._dfp.write(d)
            self._index[path] = start, l
            self._writeindex(path, start, l)
        else:
            # Too large to benefit from packing: write it out normally.
            fp = self.opener(path, 'w')
            for d in data:
                fp.write(d)
            fp.close()

    # Only revlog index and data files are candidates for packing.
    packable = set(('.i', '.d'))

    def __call__(self, path, mode='r', text=False, atomictemp=False):
        # While packing, buffer writes to revlog files in memory; the
        # chunks are routed to _written when the file is closed.
        if (self._packing and mode.startswith('w') and
            path.startswith('data') and path[-2:] in self.packable):
            return writefile(path, self._written)
        ix = self._index.get(path)
        if ix and mode.startswith('r'):
            # Serve reads of a packed file straight from 00pack.dat.
            if not self._dfp:
                self._dfp = self.opener('00pack.dat', 'r')
            return readfile(self._dfp, path, ix[0], ix[1])
        fp = self.opener(path, mode, text, atomictemp)
        if ix and mode[0] in 'aw':
            # Copy-on-write: copy the packed content out to a normal
            # file, then append a deletion marker to the index.
            fp.write(self.read(path))
            if self._ifp is None:
                self._ifp = self.opener('00pack.idx', 'a+')
            self._writeindex(path, 0, 0)
            del self._index[path]
        return fp
+
# Command table populated by the @command decorators below.
cmdtable = {}
command = cmdutil.command(cmdtable)
+
@command('repack',
         [('', 'convert', None, _('convert an unpacked repo to packed'))],
         _('hg repack'))
def repack(ui, repo, **opts):
    '''repack a packed repository, or convert an unpacked repository

    Any revlog files that are packed will be deleted.'''

    # Unpacked repositories are only accepted with --convert.
    packed = 'packrepo' in repo.requirements
    if not packed and not opts['convert']:
        raise util.Abort(_('this repository is not packed '
                           '(use --convert to convert)'))
    if not packed:
        repo._makepacked()
    repo.repack()
+
def packlist(ui, repo):
    """Write one line per store file: name, pack offset (or '-'), size."""
    for f, f2, start, size in repo.packiter(_('scanning')):
        if start is None:
            offsetcol = ' ' * 9 + '-'
        else:
            offsetcol = '%10d' % start
        name = f
        if len(name) > 60:
            # Overlong names get their own line; columns follow below.
            ui.write(name + '\n')
            name = ''
        ui.write('%s %s %8d\n' % (name.ljust(60), offsetcol, size))
+
def packsummary(ui, repo):
    """Write byte/file totals for packed, packable, and too-large files."""
    bpacked = fpacked = 0
    bunpacked = funpacked = 0
    btoobig = ftoobig = 0
    for f, f2, start, size in repo.packiter(_('scanning')):
        if start is not None:
            # Already stored in the pack file.
            bpacked += size
            fpacked += 1
        elif size < packopener._maxpacksize:
            # Loose, but small enough that repacking would pack it.
            bunpacked += size
            funpacked += 1
        else:
            # Too large to benefit from packing.
            btoobig += size
            ftoobig += 1
    ui.write(_('%s packed, saving %d files\n') %
             (util.bytecount(bpacked), fpacked))
    ui.write(_('%s not packed in %d files (could be packed)\n')
             % (util.bytecount(bunpacked), funpacked))
    ui.write(_('%s large revlogs in %d files (no need to pack)\n')
             % (util.bytecount(btoobig), ftoobig))
+
@command('packinfo',
         [('l', 'list', None,
           _('list packed/unpacked files (useful for debugging)')),
          ('s', 'summary', None, _('overview of what is and is not packed'))],
         _('hg packinfo [OPTION]...'))
def packinfo(ui, repo, **opts):
    '''print pack-related information about a repository'''

    wantlist = opts['list']
    wantsummary = opts['summary']
    if not (wantlist or wantsummary):
        # With no mode requested, default to the summary view.
        wantsummary = 1

    if wantlist:
        packlist(ui, repo)
    if wantsummary:
        packsummary(ui, repo)
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -819,6 +819,7 @@ class atomictempfile(object):
 
         # delegated methods
         self.write = self._fp.write
+        self.tell = self._fp.tell
         self.fileno = self._fp.fileno
 
     def close(self):
diff --git a/tests/test-packrepo.t b/tests/test-packrepo.t
new file mode 100644
--- /dev/null
+++ b/tests/test-packrepo.t
@@ -0,0 +1,192 @@
+create a normal source repo
+
+  $ hg init a
+  $ cd a
+  $ echo a>a
+  $ hg ci -q -A -m a
+  $ hg verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+  $ echo "[extensions]" >> $HGRCPATH
+  $ echo "packrepo=" >> $HGRCPATH
+
+try repacking
+
+  $ hg clone . ../repacked
+  updating to branch default
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ cd ../repacked
+  $ cp .hg/requires .hg/oldrequires
+  $ hg packinfo
+  0 bytes packed, saving 0 files
+  67 bytes not packed in 1 files (could be packed)
+  0 bytes large revlogs in 0 files (no need to pack)
+  $ hg repack
+  abort: this repository is not packed (use --convert to convert)
+  [255]
+  $ hg repack --convert
+  67 bytes packed, 0 bytes untouched, 1 unlinked
+  $ diff -u .hg/oldrequires .hg/requires | grep '^[-+]'
+  --- .hg/oldrequires* (glob)
+  +++ .hg/requires* (glob)
+  +packrepo
+  $ hg packinfo
+  67 bytes packed, saving 1 files
+  0 bytes not packed in 0 files (could be packed)
+  0 bytes large revlogs in 0 files (no need to pack)
+  $ hg verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+repeated repack should be idempotent
+
+  $ hg repack
+  67 bytes packed, 0 bytes untouched, 0 unlinked
+  $ hg repack
+  67 bytes packed, 0 bytes untouched, 0 unlinked
+
+  $ cd ../a
+  $ hg --config server.uncompressed=True serve -p $HGPORT -d --pid-file=../hg1.pid
+
+  $ hg clone --uncompressed http://localhost:$HGPORT/ ../b
+  streaming all changes
+  3 files to transfer, 294 bytes of data
+  transferred * bytes in * seconds (*/sec) (glob)
+  updating to branch default
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+
+ensure that dest is packed and sane
+
+  $ cd ../b
+  $ ls .hg/store/00pack.*
+  .hg/store/00pack.dat
+  .hg/store/00pack.idx
+  $ ls .hg/store/data/a.[id]
+  ls: .hg/store/data/a.[id]: No such file or directory
+  [1]
+  $ grep packrepo .hg/requires
+  packrepo
+  $ hg verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+get some info
+
+  $ hg packinfo
+  67 bytes packed, saving 1 files
+  0 bytes not packed in 0 files (could be packed)
+  0 bytes large revlogs in 0 files (no need to pack)
+
+does a hardlinked clone work?
+
+  $ hg clone . ../hl
+  updating to branch default
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ hg --cwd ../hl verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+try a regular commit
+
+  $ echo b >> b
+  $ hg ci -A -m b
+  adding b
+  $ hg packinfo -l -s
+  data/b.i                                                              -       67
+  data/a.i                                                              4       67
+  67 bytes packed, saving 1 files
+  67 bytes not packed in 1 files (could be packed)
+  0 bytes large revlogs in 0 files (no need to pack)
+
+does COW work?
+
+  $ echo a >> a
+  $ hg ci -m b
+  $ ls .hg/store/data/a.[id]
+  .hg/store/data/a.i
+
+get some post-COW info
+
+  $ hg packinfo
+  0 bytes packed, saving 0 files
+  203 bytes not packed in 2 files (could be packed)
+  0 bytes large revlogs in 0 files (no need to pack)
+
+does a hardlinked clone work after COW?
+
+  $ hg clone . ../hl2
+  updating to branch default
+  2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ hg --cwd ../hl2 verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  2 files, 3 changesets, 3 total revisions
+
+disable fncache, then check functionality
+
+  $ hg --config format.usefncache=False clone --uncompressed http://localhost:$HGPORT/ ../c
+  streaming all changes
+  3 files to transfer, 294 bytes of data
+  transferred * bytes in * seconds (*/sec) (glob)
+  updating to branch default
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+
+no fncache: ensure that dest is packed and sane
+
+  $ cd ../c
+  $ ls .hg/store/00pack.*
+  .hg/store/00pack.dat
+  .hg/store/00pack.idx
+  $ ls .hg/store/data/a.[id]
+  ls: .hg/store/data/a.[id]: No such file or directory
+  [1]
+  $ sort .hg/requires
+  packrepo
+  revlogv1
+  store
+  $ hg verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+no fncache: try a clone
+
+  $ hg clone . ../nofncache
+  updating to branch default
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ hg --cwd ../nofncache verify
+  checking changesets
+  checking manifests
+  crosschecking files in changesets and manifests
+  checking files
+  1 files, 1 changesets, 1 total revisions
+
+no fncache: does COW work?
+
+  $ cd ../nofncache
+  $ echo a >> a
+  $ hg ci -m b
+  $ ls .hg/store/data/a.[id]
+  .hg/store/data/a.i
+
+all done
+
+  $ cd ..
+  $ cat hg1.pid >> $DAEMON_PIDS



More information about the Mercurial-devel mailing list