speed up relink script
Brendan Cully
brendan at kublai.com
Fri Mar 23 19:04:14 UTC 2007
On Friday, 23 March 2007 at 13:47, TK Soh wrote:
> On 3/20/07, Matt Mackall <mpm at selenic.com> wrote:
> >On Tue, Mar 20, 2007 at 06:11:26AM -0500, TK Soh wrote:
> >> Coming to think about it again, perhaps reading from back to front
> >> isn't going to help much either. Apart from the fact that it will be
> >> slow to read that way, as Bryan pointed out, most files in the repos
> >> are likely to stay unchanged over time. So it may actually slow down
> >> the comparison.
> >>
> >> I wonder if we can somehow compare the latest chunk of the index or
> >> data files checked in. Or, perhaps the last rev data in the index file
> >> will be representative? I'm not confident enough in my understanding
> >> of hg's internals to decide. Any input?
> >
> >Some notes:
> >
> >Two revlogs are identical if their indices are the same
> >Two revlogs don't match if they have different numbers of entries
> >Two revlogs are identical if their heads are the same.
> >Two revlogs may still be identical if their sizes are different, if
> >their last records are different, etc.
> >
> >The first observation says we can avoid ever reading .d files.
> >
> >This suggests the following approach:
> >
> >For each .i file in repo A:
> >  record size, MD5 hash, number of entries, and sorted list of heads
> >
> >For each .i file in repo B:
> >  if sizes match:
> >    if hashes match:
> >      relink files
> >  else:
> >    read index
> >    if counts don't match:
> >      continue
> >    find heads
> >    if heads match:
> >      relink files
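
Spelled out, mpm's pseudocode amounts to something like the sketch below.
This is just standalone Python showing the shape of the per-file decision,
not relink code; read_index is only a placeholder for however you pull the
entry count and heads out of an .i file.

import hashlib, os

CHUNKLEN = 65536

def md5_of(path):
    # stream the file through MD5 in CHUNKLEN pieces
    md = hashlib.md5()
    fp = open(path, 'rb')
    try:
        chunk = fp.read(CHUNKLEN)
        while chunk:
            md.update(chunk)
            chunk = fp.read(CHUNKLEN)
    finally:
        fp.close()
    return md.hexdigest()

def same_revlog(src_idx, dst_idx, read_index):
    # read_index(path) -> (entry count, list of heads); placeholder only
    if os.path.getsize(src_idx) == os.path.getsize(dst_idx):
        # equal sizes: identical indexes mean identical revlogs
        return md5_of(src_idx) == md5_of(dst_idx)
    # different sizes: the revlogs may still match if the entry counts
    # and the sorted heads agree
    scount, sheads = read_index(src_idx)
    dcount, dheads = read_index(dst_idx)
    if scount != dcount:
        return False
    return sorted(sheads) == sorted(dheads)
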
>
> Attached is the patch in which I rewrote the script using the algorithm
> mpm gave above. The benchmarks are:
>
> Crew rev:
> Relinked 1350 files (361915752 bytes reclaimed)
> 7.45u 9.39s 0:38.60 43.6%
>
> patched rev:
> Relinked 1350 files (361915752 bytes reclaimed)
> 4.69u 2.10s 0:13.23 51.3%
>
> Please help review the patch. The only thing I don't like much is the
> repeated os.stat() calls, but I was trying to keep the original layout
> of the code. Thanks.
It's better if collect skips .d files and you don't do any reading at
all (e.g. the md5) until after the prune phase. I doubt the md5 is
needed - directly comparing the indexes is probably enough. I don't
think the head-comparing stuff is about speed - it's about possibly
relinking even more files.
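
To make the ordering concrete, here is roughly the shape I have in mind.
This is only a sketch, not a drop-in replacement for the patch: load_index
is a placeholder for opening a revlog index and returning its entry count
and sorted heads, and a byte-for-byte index comparison stands in for the
md5.

import filecmp, os

def collect(src):
    # stat-only pass: record each .i file and its stat, read nothing
    seplen = len(os.path.sep)
    candidates = []
    for dirpath, dirnames, filenames in os.walk(src):
        relpath = dirpath[len(src) + seplen:]
        for fn in filenames:
            if fn.endswith('.i'):
                st = os.stat(os.path.join(dirpath, fn))
                candidates.append((os.path.join(relpath, fn), st))
    return candidates

def prune(candidates, dst):
    # still stat-only: drop files missing from dst or on another device
    targets = []
    for fn, st in candidates:
        try:
            ts = os.stat(os.path.join(dst, fn))
        except OSError:
            continue
        if st.st_dev != ts.st_dev:
            raise Exception('Source and destination are on different devices')
        targets.append((fn, st.st_size, ts.st_size))
    return targets

def compare(targets, src, dst, load_index):
    # the first phase that reads anything at all
    relinkable = []
    for fn, ssize, tsize in targets:
        spath, tpath = os.path.join(src, fn), os.path.join(dst, fn)
        if ssize == tsize:
            # equal sizes: compare the indexes byte for byte, no md5 needed
            if filecmp.cmp(spath, tpath, shallow=False):
                relinkable.append(fn)
        else:
            # unequal sizes: matching entry counts and heads can still
            # let us relink more files
            if load_index(spath) == load_index(tpath):
                relinkable.append(fn)
    return relinkable

relink() then only sees names that are known to match, and can hard link
each .i file and its .d twin without reading anything else.
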
> # HG changeset patch
> # User TK Soh <teekaysoh at yahoo.com>
> # Date 1174674962 18000
> # Node ID eef14cac989d39f759ca5bfb9717f23f886508eb
> # Parent fe0fe0b4d73b32f9197f386140f2eb9363a13f7f
> optimize relink script with mpm's algorithm
>
> diff -r fe0fe0b4d73b -r eef14cac989d contrib/hg-relink
> +++ b/contrib/hg-relink Fri Mar 23 13:36:02 2007 -0500
> @@ -4,8 +4,40 @@
>  #
>  # This software may be used and distributed according to the terms
>  # of the GNU General Public License, incorporated herein by reference.
> +#
> +# The script uses the following algorithm to check for files to
> +# be relinked, as suggested by Matt Mackall:
> +#
> +# Two revlogs are identical if their indices are the same
> +# Two revlogs don't match if they have different numbers of entries
> +# Two revlogs are identical if their heads are the same.
> +# Two revlogs may still be identical if their sizes are different, if
> +# their last records are different, etc.
> +#
> +# The first observation says we can avoid ever reading .d files.
> +#
> +# This suggests the following approach:
> +#
> +# For each .i file in repo A:
> +#   record size, MD5 hash, number of entries, and sorted list of heads
> +#
> +# For each .i file in repo B:
> +#   if sizes match:
> +#     if hashes match:
> +#       relink files
> +#   else:
> +#     read index
> +#     if counts don't match:
> +#       continue
> +#     find heads
> +#     if heads match:
> +#       relink files
> +#
> 
> -import os, sys
> +import os, sys, md5
> +from mercurial import revlog, util
> +
> +CHUNKLEN = 65536
> 
>  class ConfigError(Exception): pass
> 
> @@ -30,22 +62,35 @@ except ConfigError, inst:
>      usage()
>      sys.exit(1)
> 
> +def md5hash(filename):
> +    md = md5.new()
> +    sfp = file(filename)
> +    sin = sfp.read(CHUNKLEN)
> +    while sin:
> +        md.update(sin)
> +        sin = sfp.read(CHUNKLEN)
> +    return md.hexdigest()
> +
>  def collect(src):
>      seplen = len(os.path.sep)
>      candidates = []
>      for dirpath, dirnames, filenames in os.walk(src):
>          relpath = dirpath[len(src) + seplen:]
>          for filename in filenames:
> -            if not (filename.endswith('.i') or filename.endswith('.d')):
> +            if not filename.endswith('.i'):
>                  continue
> +            mhash = md5hash(os.path.join(dirpath, filename))
> +            r = revlog.revlog(util.opener(os.getcwd(),
> +                              audit=False), filename, "", 0)
>              st = os.stat(os.path.join(dirpath, filename))
> -            candidates.append((os.path.join(relpath, filename), st))
> +            candidates.append((os.path.join(relpath, filename), st,
> +                               mhash, r.count(), r.heads()))
> 
>      return candidates
> 
>  def prune(candidates, dst):
>      targets = []
> -    for fn, st in candidates:
> +    for fn, st, mh, ct, hd in candidates:
>          tgt = os.path.join(dst, fn)
>          try:
>              ts = os.stat(tgt)
> @@ -56,43 +101,50 @@ def prune(candidates, dst):
>              continue
>          if st.st_dev != ts.st_dev:
>              raise Exception('Source and destination are on different devices')
> -        if st.st_size != ts.st_size:
> -            continue
> -        targets.append((fn, ts.st_size))
> +
> +        if st.st_size == ts.st_size:
> +            mhash = md5hash(tgt)
> +            if mhash == mh:
> +                targets.append(fn)
> +        else:
> +            r = revlog.revlog(util.opener(os.getcwd(),
> +                              audit=False), tgt, "", 0)
> +            if ct != r.count():
> +                continue
> +            heads = r.heads()
> +            heads.sort()
> +            hd.sort()
> +            if heads == hd:
> +                targets.append(fn)
> 
>      return targets
> 
>  def relink(src, dst, files):
> -    CHUNKLEN = 65536
>      relinked = 0
>      savedbytes = 0
> 
> -    for f, sz in files:
> -        source = os.path.join(src, f)
> -        tgt = os.path.join(dst, f)
> -        sfp = file(source)
> -        dfp = file(tgt)
> -        sin = sfp.read(CHUNKLEN)
> -        while sin:
> -            din = dfp.read(CHUNKLEN)
> -            if sin != din:
> -                break
> -            sin = sfp.read(CHUNKLEN)
> -        if sin:
> -            continue
> -        try:
> -            os.rename(tgt, tgt + '.bak')
> +    for ifile in files:
> +        dfile = ifile[:-1] + "d"
> +        for f in (ifile, dfile):
> +            source = os.path.join(src, f)
> +            tgt = os.path.join(dst, f)
> +            if not (os.access(tgt, os.F_OK) and os.access(source, os.F_OK)):
> +                continue
> +
>              try:
> -                os.link(source, tgt)
> -            except OSError:
> -                os.rename(tgt + '.bak', tgt)
> -                raise
> -            print 'Relinked %s' % f
> -            relinked += 1
> -            savedbytes += sz
> -            os.remove(tgt + '.bak')
> -        except OSError, inst:
> -            print '%s: %s' % (tgt, str(inst))
> +                ts = os.stat(tgt)
> +                os.rename(tgt, tgt + '.bak')
> +                try:
> +                    os.link(source, tgt)
> +                except OSError:
> +                    os.rename(tgt + '.bak', tgt)
> +                    raise
> +                print 'Relinked %s' % f
> +                relinked += 1
> +                savedbytes += ts.st_size
> +                os.remove(tgt + '.bak')
> +            except OSError, inst:
> +                print '%s: %s' % (tgt, str(inst))
> 
>      print 'Relinked %d files (%d bytes reclaimed)' % (relinked, savedbytes)
> 