D9993: sidedata: move to new sidedata storage in revlogv2
Alphare (Raphaël Gomès)
phabricator at mercurial-scm.org
Mon Feb 15 10:30:16 UTC 2021
Alphare created this revision.
Herald added a reviewer: indygreg.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.
REVISION SUMMARY
The current (experimental) sidedata system uses flagprocessors both to signal the
presence of sidedata and to store/retrieve it from the raw revlog data. This proved
to be quite fragile from an exchange perspective and a lot more complex than simply
having a dedicated space in the new revlog format.
This change does not handle exchange (ironically), so the test for amend, which
uses a bundle, is broken. Exchange support is split out into the next patches.
REPOSITORY
rHG Mercurial
BRANCH
default
REVISION DETAIL
https://phab.mercurial-scm.org/D9993
AFFECTED FILES
hgext/lfs/wrapper.py
hgext/remotefilelog/remotefilelog.py
mercurial/revlog.py
mercurial/revlogutils/flagutil.py
mercurial/revlogutils/sidedata.py
tests/flagprocessorext.py
tests/simplestorerepo.py
tests/test-copies-in-changeset.t
tests/test-revlog-raw.py
tests/testlib/ext-sidedata.py
CHANGE DETAILS
diff --git a/tests/testlib/ext-sidedata.py b/tests/testlib/ext-sidedata.py
--- a/tests/testlib/ext-sidedata.py
+++ b/tests/testlib/ext-sidedata.py
@@ -40,19 +40,20 @@
return orig(self, text, transaction, link, p1, p2, *args, **kwargs)
-def wraprevision(orig, self, nodeorrev, *args, **kwargs):
- text = orig(self, nodeorrev, *args, **kwargs)
+def wrap_revisiondata(orig, self, nodeorrev, *args, **kwargs):
+ text, sd = orig(self, nodeorrev, *args, **kwargs)
if getattr(self, 'sidedatanocheck', False):
- return text
+ return text, sd
+ if self.version & 0xFFFF != 2:
+ return text, sd
if nodeorrev != nullrev and nodeorrev != nullid:
- sd = self.sidedata(nodeorrev)
if len(text) != struct.unpack('>I', sd[sidedata.SD_TEST1])[0]:
raise RuntimeError('text size mismatch')
expected = sd[sidedata.SD_TEST2]
got = hashlib.sha256(text).digest()
if got != expected:
raise RuntimeError('sha256 mismatch')
- return text
+ return text, sd
def wrapgetsidedatacompanion(orig, srcrepo, dstrepo):
@@ -81,7 +82,7 @@
def extsetup(ui):
extensions.wrapfunction(revlog.revlog, 'addrevision', wrapaddrevision)
- extensions.wrapfunction(revlog.revlog, 'revision', wraprevision)
+ extensions.wrapfunction(revlog.revlog, '_revisiondata', wrap_revisiondata)
extensions.wrapfunction(
upgrade_engine, 'getsidedatacompanion', wrapgetsidedatacompanion
)
diff --git a/tests/test-revlog-raw.py b/tests/test-revlog-raw.py
--- a/tests/test-revlog-raw.py
+++ b/tests/test-revlog-raw.py
@@ -51,10 +51,10 @@
def readprocessor(self, rawtext):
# True: the returned text could be used to verify hash
text = rawtext[len(_extheader) :].replace(b'i', b'1')
- return text, True, {}
+ return text, True
-def writeprocessor(self, text, sidedata):
+def writeprocessor(self, text):
# False: the returned rawtext shouldn't be used to verify hash
rawtext = _extheader + text.replace(b'1', b'i')
return rawtext, False
@@ -293,7 +293,7 @@
# Verify text, rawtext, and rawsize
if isext:
- rawtext = writeprocessor(None, text, {})[0]
+ rawtext = writeprocessor(None, text)[0]
else:
rawtext = text
if rlog.rawsize(rev) != len(rawtext):
diff --git a/tests/test-copies-in-changeset.t b/tests/test-copies-in-changeset.t
--- a/tests/test-copies-in-changeset.t
+++ b/tests/test-copies-in-changeset.t
@@ -273,12 +273,13 @@
$ hg ci --amend -m 'copy a to j, v2'
saved backup bundle to $TESTTMP/repo/.hg/strip-backup/*-*-amend.hg (glob)
$ hg debugsidedata -c -v -- -1
- 1 sidedata entries
- entry-0014 size 24
- '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj'
+ 1 sidedata entries (missing-correct-output !)
+ entry-0014 size 24 (missing-correct-output !)
+ '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj' (missing-correct-output !)
#endif
$ hg showcopies --config experimental.copies.read-from=filelog-only
- a -> j
+ a -> j (sidedata missing-correct-output !)
+ a -> j (no-sidedata !)
The entries should be written to extras even if they're empty (so the client
won't have to fall back to reading from filelogs)
$ echo x >> j
@@ -356,7 +357,8 @@
saved backup bundle to $TESTTMP/rebase-rename/.hg/strip-backup/*-*-rebase.hg (glob)
$ hg st --change . --copies
A b
- a
+ a (sidedata missing-correct-output !)
+ a (no-sidedata !)
R a
$ cd ..
diff --git a/tests/simplestorerepo.py b/tests/simplestorerepo.py
--- a/tests/simplestorerepo.py
+++ b/tests/simplestorerepo.py
@@ -300,7 +300,7 @@
text = rawtext
else:
r = flagutil.processflagsread(self, rawtext, flags)
- text, validatehash, sidedata = r
+ text, validatehash = r
if validatehash:
self.checkhash(text, node, rev=rev)
diff --git a/tests/flagprocessorext.py b/tests/flagprocessorext.py
--- a/tests/flagprocessorext.py
+++ b/tests/flagprocessorext.py
@@ -31,28 +31,28 @@
return False
-def noopdonothing(self, text, sidedata):
+def noopdonothing(self, text):
return (text, True)
def noopdonothingread(self, text):
- return (text, True, {})
+ return (text, True)
-def b64encode(self, text, sidedata):
+def b64encode(self, text):
return (base64.b64encode(text), False)
def b64decode(self, text):
- return (base64.b64decode(text), True, {})
+ return (base64.b64decode(text), True)
-def gzipcompress(self, text, sidedata):
+def gzipcompress(self, text):
return (zlib.compress(text), False)
def gzipdecompress(self, text):
- return (zlib.decompress(text), True, {})
+ return (zlib.decompress(text), True)
def supportedoutgoingversions(orig, repo):
diff --git a/mercurial/revlogutils/sidedata.py b/mercurial/revlogutils/sidedata.py
--- a/mercurial/revlogutils/sidedata.py
+++ b/mercurial/revlogutils/sidedata.py
@@ -13,9 +13,8 @@
The current implementation is experimental and subject to changes. Do not rely
on it in production.
-Sidedata are stored in the revlog itself, within the revision rawtext. They
-are inserted and removed from it using the flagprocessors mechanism. The following
-format is currently used::
+Sidedata are stored in the revlog itself, thanks to a new version of the
+revlog. The following format is currently used::
initial header:
<number of sidedata; 2 bytes>
@@ -60,48 +59,35 @@
SIDEDATA_ENTRY = struct.Struct('>HL20s')
-def sidedatawriteprocessor(rl, text, sidedata):
+def serialize_sidedata(sidedata):
sidedata = list(sidedata.items())
sidedata.sort()
- rawtext = [SIDEDATA_HEADER.pack(len(sidedata))]
+ buf = [SIDEDATA_HEADER.pack(len(sidedata))]
for key, value in sidedata:
digest = hashutil.sha1(value).digest()
- rawtext.append(SIDEDATA_ENTRY.pack(key, len(value), digest))
+ buf.append(SIDEDATA_ENTRY.pack(key, len(value), digest))
for key, value in sidedata:
- rawtext.append(value)
- rawtext.append(bytes(text))
- return b''.join(rawtext), False
+ buf.append(value)
+ buf = b''.join(buf)
+ return buf
-def sidedatareadprocessor(rl, text):
+def deserialize_sidedata(blob):
sidedata = {}
offset = 0
- (nbentry,) = SIDEDATA_HEADER.unpack(text[: SIDEDATA_HEADER.size])
+ (nbentry,) = SIDEDATA_HEADER.unpack(blob[: SIDEDATA_HEADER.size])
offset += SIDEDATA_HEADER.size
dataoffset = SIDEDATA_HEADER.size + (SIDEDATA_ENTRY.size * nbentry)
for i in range(nbentry):
nextoffset = offset + SIDEDATA_ENTRY.size
- key, size, storeddigest = SIDEDATA_ENTRY.unpack(text[offset:nextoffset])
+ key, size, storeddigest = SIDEDATA_ENTRY.unpack(blob[offset:nextoffset])
offset = nextoffset
# read the data associated with that entry
nextdataoffset = dataoffset + size
- entrytext = text[dataoffset:nextdataoffset]
+ entrytext = bytes(blob[dataoffset:nextdataoffset])
readdigest = hashutil.sha1(entrytext).digest()
if storeddigest != readdigest:
raise error.SidedataHashError(key, storeddigest, readdigest)
sidedata[key] = entrytext
dataoffset = nextdataoffset
- text = text[dataoffset:]
- return text, True, sidedata
-
-
-def sidedatarawprocessor(rl, text):
- # side data modifies rawtext and prevent rawtext hash validation
- return False
-
-
-processors = (
- sidedatareadprocessor,
- sidedatawriteprocessor,
- sidedatarawprocessor,
-)
+ return sidedata
diff --git a/mercurial/revlogutils/flagutil.py b/mercurial/revlogutils/flagutil.py
--- a/mercurial/revlogutils/flagutil.py
+++ b/mercurial/revlogutils/flagutil.py
@@ -84,7 +84,7 @@
flagprocessors[flag] = processor
-def processflagswrite(revlog, text, flags, sidedata):
+def processflagswrite(revlog, text, flags):
"""Inspect revision data flags and applies write transformations defined
by registered flag processors.
@@ -100,9 +100,12 @@
processed text and ``validatehash`` is a bool indicating whether the
returned text should be checked for hash integrity.
"""
- return _processflagsfunc(revlog, text, flags, b'write', sidedata=sidedata)[
- :2
- ]
+ return _processflagsfunc(
+ revlog,
+ text,
+ flags,
+ b'write',
+ )[:2]
def processflagsread(revlog, text, flags):
@@ -145,14 +148,14 @@
return _processflagsfunc(revlog, text, flags, b'raw')[1]
-def _processflagsfunc(revlog, text, flags, operation, sidedata=None):
+def _processflagsfunc(revlog, text, flags, operation):
"""internal function to process flag on a revlog
This function is private to this module, code should never needs to call it
directly."""
# fast path: no flag processors will run
if flags == 0:
- return text, True, {}
+ return text, True
if operation not in (b'read', b'write', b'raw'):
raise error.ProgrammingError(_(b"invalid '%s' operation") % operation)
# Check all flags are known.
@@ -168,7 +171,6 @@
if operation == b'write':
orderedflags = reversed(orderedflags)
- outsidedata = {}
for flag in orderedflags:
# If a flagprocessor has been registered for a known flag, apply the
# related operation transform and update result tuple.
@@ -186,10 +188,9 @@
if operation == b'raw':
vhash = rawtransform(revlog, text)
elif operation == b'read':
- text, vhash, s = readtransform(revlog, text)
- outsidedata.update(s)
+ text, vhash = readtransform(revlog, text)
else: # write operation
- text, vhash = writetransform(revlog, text, sidedata)
+ text, vhash = writetransform(revlog, text)
validatehash = validatehash and vhash
- return text, validatehash, outsidedata
+ return text, validatehash
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -119,10 +119,10 @@
# Flag processors for REVIDX_ELLIPSIS.
def ellipsisreadprocessor(rl, text):
- return text, False, {}
-
-
-def ellipsiswriteprocessor(rl, text, sidedata):
+ return text, False
+
+
+def ellipsiswriteprocessor(rl, text):
return text, False
@@ -560,8 +560,6 @@
if self._mmaplargeindex and b'mmapindexthreshold' in opts:
mmapindexthreshold = opts[b'mmapindexthreshold']
self.hassidedata = bool(opts.get(b'side-data', False))
- if self.hassidedata:
- self._flagprocessors[REVIDX_SIDEDATA] = sidedatautil.processors
self._sparserevlog = bool(opts.get(b'sparse-revlog', False))
withsparseread = bool(opts.get(b'with-sparse-read', False))
# sparse-revlog forces sparse-read
@@ -862,6 +860,11 @@
def length(self, rev):
return self.index[rev][1]
+ def sidedata_length(self, rev):
+ if self.version & 0xFFFF != REVLOGV2:
+ return 0
+ return self.index[rev][11]
+
def rawsize(self, rev):
"""return the length of the uncompressed text for a given revision"""
l = self.index[rev][2]
@@ -923,7 +926,7 @@
# Derived from index values.
def end(self, rev):
- return self.start(rev) + self.length(rev)
+ return self.start(rev) + self.length(rev) + self.sidedata_length(rev)
def parents(self, node):
i = self.index
@@ -1859,7 +1862,7 @@
elif operation == b'read':
return flagutil.processflagsread(self, text, flags)
else: # write operation
- return flagutil.processflagswrite(self, text, flags, None)
+ return flagutil.processflagswrite(self, text, flags)
def revision(self, nodeorrev, _df=None, raw=False):
"""return an uncompressed revision of a given node or revision
@@ -1904,10 +1907,17 @@
# revision or might need to be processed to retrieve the revision.
rev, rawtext, validated = self._rawtext(node, rev, _df=_df)
+ if self.version & 0xFFFF == REVLOGV2:
+ if rev is None:
+ rev = self.rev(node)
+ sidedata = self._sidedata(rev)
+ else:
+ sidedata = {}
+
if raw and validated:
# if we don't want to process the raw text and that raw
# text is cached, we can exit early.
- return rawtext, {}
+ return rawtext, sidedata
if rev is None:
rev = self.rev(node)
# the revlog's flag for this revision
@@ -1916,20 +1926,14 @@
if validated and flags == REVIDX_DEFAULT_FLAGS:
# no extra flags set, no flag processor runs, text = rawtext
- return rawtext, {}
-
- sidedata = {}
+ return rawtext, sidedata
+
if raw:
validatehash = flagutil.processflagsraw(self, rawtext, flags)
text = rawtext
else:
- try:
- r = flagutil.processflagsread(self, rawtext, flags)
- except error.SidedataHashError as exc:
- msg = _(b"integrity check failed on %s:%s sidedata key %d")
- msg %= (self.indexfile, pycompat.bytestr(rev), exc.sidedatakey)
- raise error.RevlogError(msg)
- text, validatehash, sidedata = r
+ r = flagutil.processflagsread(self, rawtext, flags)
+ text, validatehash = r
if validatehash:
self.checkhash(text, node, rev=rev)
if not validated:
@@ -1980,6 +1984,21 @@
del basetext # let us have a chance to free memory early
return (rev, rawtext, False)
+ def _sidedata(self, rev):
+ """Return the sidedata for a given revision number."""
+ index_entry = self.index[rev]
+ sidedata_offset = index_entry[10]
+ sidedata_size = index_entry[11]
+
+ if self._inline:
+ sidedata_offset += self._io.size * (1 + rev)
+ if sidedata_size == 0:
+ return {}
+
+ segment = self._getsegment(sidedata_offset, sidedata_size)
+ sidedata = sidedatautil.deserialize_sidedata(segment)
+ return sidedata
+
def rawdata(self, nodeorrev, _df=None):
"""return an uncompressed raw data of a given node or revision number.
@@ -2113,20 +2132,15 @@
if sidedata is None:
sidedata = {}
- flags = flags & ~REVIDX_SIDEDATA
elif not self.hassidedata:
raise error.ProgrammingError(
_(b"trying to add sidedata to a revlog who don't support them")
)
- else:
- flags |= REVIDX_SIDEDATA
if flags:
node = node or self.hash(text, p1, p2)
- rawtext, validatehash = flagutil.processflagswrite(
- self, text, flags, sidedata=sidedata
- )
+ rawtext, validatehash = flagutil.processflagswrite(self, text, flags)
# If the flag processor modifies the revision data, ignore any provided
# cachedelta.
@@ -2159,6 +2173,7 @@
flags,
cachedelta=cachedelta,
deltacomputer=deltacomputer,
+ sidedata=sidedata,
)
def addrawrevision(
@@ -2172,6 +2187,7 @@
flags,
cachedelta=None,
deltacomputer=None,
+ sidedata=None,
):
"""add a raw revision with known flags, node and parents
useful when reusing a revision not stored in this revlog (ex: received
@@ -2194,6 +2210,7 @@
ifh,
dfh,
deltacomputer=deltacomputer,
+ sidedata=sidedata,
)
finally:
if dfh:
@@ -2287,6 +2304,7 @@
dfh,
alwayscache=False,
deltacomputer=None,
+ sidedata=None,
):
"""internal function to add revisions to the log
@@ -2341,6 +2359,21 @@
deltainfo = deltacomputer.finddeltainfo(revinfo, fh)
+ # TODO support merging revlogs
+ unified_revlog_id = 0
+ # TODO compute rank (for revlog v2)
+ rank = 0
+
+ if sidedata:
+ serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
+ sidedata_offset = offset + deltainfo.deltalen
+ else:
+ serialized_sidedata = b""
+ # Don't store the offset if the sidedata is empty, that way
+ # we can easily detect empty sidedata and they will be no different
+ # than ones we manually add.
+ sidedata_offset = 0
+
e = (
offset_type(offset, flags),
deltainfo.deltalen,
@@ -2350,20 +2383,26 @@
p1r,
p2r,
node,
- 0,
- 0,
- 0,
- 0,
+ unified_revlog_id,
+ rank,
+ sidedata_offset,
+ len(serialized_sidedata),
)
if self.version & 0xFFFF != REVLOGV2:
e = e[:8]
self.index.append(e)
-
entry = self._io.packentry(e, self.node, self.version, curr)
self._writeentry(
- transaction, ifh, dfh, entry, deltainfo.data, link, offset
+ transaction,
+ ifh,
+ dfh,
+ entry,
+ deltainfo.data,
+ link,
+ offset,
+ serialized_sidedata,
)
rawtext = btext[0]
@@ -2376,7 +2415,9 @@
self._chainbasecache[curr] = deltainfo.chainbase
return curr
- def _writeentry(self, transaction, ifh, dfh, entry, data, link, offset):
+ def _writeentry(
+ self, transaction, ifh, dfh, entry, data, link, offset, sidedata
+ ):
# Files opened in a+ mode have inconsistent behavior on various
# platforms. Windows requires that a file positioning call be made
# when the file handle transitions between reads and writes. See
@@ -2400,6 +2441,8 @@
if data[0]:
dfh.write(data[0])
dfh.write(data[1])
+ if sidedata:
+ dfh.write(sidedata)
ifh.write(entry)
else:
offset += curr * self._io.size
@@ -2407,6 +2450,8 @@
ifh.write(entry)
ifh.write(data[0])
ifh.write(data[1])
+ if sidedata:
+ ifh.write(sidedata)
self._enforceinlinesize(transaction, ifh)
nodemaputil.setup_persistent_nodemap(transaction, self)
diff --git a/hgext/remotefilelog/remotefilelog.py b/hgext/remotefilelog/remotefilelog.py
--- a/hgext/remotefilelog/remotefilelog.py
+++ b/hgext/remotefilelog/remotefilelog.py
@@ -155,12 +155,12 @@
# text passed to "addrevision" includes hg filelog metadata header
if node is None:
node = storageutil.hashrevisionsha1(text, p1, p2)
- if sidedata is None:
- sidedata = {}
meta, metaoffset = storageutil.parsemeta(text)
rawtext, validatehash = flagutil.processflagswrite(
- self, text, flags, sidedata=sidedata
+ self,
+ text,
+ flags,
)
return self.addrawrevision(
rawtext,
diff --git a/hgext/lfs/wrapper.py b/hgext/lfs/wrapper.py
--- a/hgext/lfs/wrapper.py
+++ b/hgext/lfs/wrapper.py
@@ -116,10 +116,10 @@
if hgmeta or text.startswith(b'\1\n'):
text = storageutil.packmeta(hgmeta, text)
- return (text, True, {})
+ return (text, True)
-def writetostore(self, text, sidedata):
+def writetostore(self, text):
# hg filelog metadata (includes rename, etc)
hgmeta, offset = storageutil.parsemeta(text)
if offset and offset > 0:
To: Alphare, indygreg, #hg-reviewers
Cc: mercurial-patches, mercurial-devel
More information about the Mercurial-devel
mailing list