D11098: dirstate-v2: Move fixed-size tree metadata into the docket file
SimonSapin
phabricator at mercurial-scm.org
Thu Jul 15 21:04:49 UTC 2021
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.
REVISION SUMMARY
Before this changeset, the dirstate-v2 data file contained not only nodes
and paths that may be reused when appending to an existing file,
but also some fixed-size metadata that applies to the entire tree
and was added at the end of the data file for every append.
This moves that metadata into the docket file, so that repeated "append"
operations without meaningful changes don't actually need to grow any file.
REPOSITORY
rHG Mercurial
BRANCH
default
REVISION DETAIL
https://phab.mercurial-scm.org/D11098
AFFECTED FILES
mercurial/debugcommands.py
mercurial/dirstatemap.py
mercurial/dirstateutils/docket.py
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/dispatch.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/operations/list_tracked_files.rs
rust/hg-cpython/src/dirstate/dirstate_map.rs
rust/hg-cpython/src/dirstate/dispatch.rs
rust/hg-cpython/src/dirstate/owning.rs
rust/rhg/src/commands/status.rs
CHANGE DETAILS
diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs
+++ b/rust/rhg/src/commands/status.rs
@@ -168,13 +168,16 @@
let repo = invocation.repo?;
let dirstate_data_mmap;
let (mut dmap, parents) = if repo.has_dirstate_v2() {
+ let docket_data =
+ repo.hg_vfs().read("dirstate").io_not_found_as_none()?;
let parents;
let dirstate_data;
let data_size;
- if let Some(docket_data) =
- repo.hg_vfs().read("dirstate").io_not_found_as_none()?
- {
- let docket = on_disk::read_docket(&docket_data)?;
+ let docket;
+ let tree_metadata;
+ if let Some(docket_data) = &docket_data {
+ docket = on_disk::read_docket(docket_data)?;
+ tree_metadata = docket.tree_metadata();
parents = Some(docket.parents());
data_size = docket.data_size();
dirstate_data_mmap = repo
@@ -184,10 +187,12 @@
dirstate_data = dirstate_data_mmap.as_deref().unwrap_or(b"");
} else {
parents = None;
+ tree_metadata = b"";
data_size = 0;
dirstate_data = b"";
}
- let dmap = DirstateMap::new_v2(dirstate_data, data_size)?;
+ let dmap =
+ DirstateMap::new_v2(dirstate_data, data_size, tree_metadata)?;
(dmap, parents)
} else {
dirstate_data_mmap =
diff --git a/rust/hg-cpython/src/dirstate/owning.rs b/rust/hg-cpython/src/dirstate/owning.rs
--- a/rust/hg-cpython/src/dirstate/owning.rs
+++ b/rust/hg-cpython/src/dirstate/owning.rs
@@ -49,9 +49,11 @@
py: Python,
on_disk: PyBytes,
data_size: usize,
+ tree_metadata: PyBytes,
) -> Result<Self, DirstateError> {
let bytes: &'_ [u8] = on_disk.data(py);
- let map = DirstateMap::new_v2(bytes, data_size)?;
+ let map =
+ DirstateMap::new_v2(bytes, data_size, tree_metadata.data(py))?;
// Like in `bytes` above, this `'_` lifetime parameter borrows from
// the bytes buffer owned by `on_disk`.
diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs
--- a/rust/hg-cpython/src/dirstate/dispatch.rs
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs
@@ -128,7 +128,7 @@
&mut self,
now: Timestamp,
can_append: bool,
- ) -> Result<(Vec<u8>, bool), DirstateError> {
+ ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
self.get_mut().pack_v2(now, can_append)
}
diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs
@@ -84,12 +84,14 @@
def new_v2(
on_disk: PyBytes,
data_size: usize,
+ tree_metadata: PyBytes,
) -> PyResult<PyObject> {
let dirstate_error = |e: DirstateError| {
PyErr::new::<exc::OSError, _>(py, format!("Dirstate error: {:?}", e))
};
- let inner = OwningDirstateMap::new_v2(py, on_disk, data_size)
- .map_err(dirstate_error)?;
+ let inner = OwningDirstateMap::new_v2(
+ py, on_disk, data_size, tree_metadata,
+ ).map_err(dirstate_error)?;
let map = Self::create_instance(py, Box::new(inner))?;
Ok(map.into_object())
}
@@ -353,9 +355,11 @@
let mut inner = self.inner(py).borrow_mut();
let result = inner.pack_v2(now, can_append);
match result {
- Ok((packed, append)) => {
+ Ok((packed, tree_metadata, append)) => {
let packed = PyBytes::new(py, &packed);
- Ok((packed, append).to_py_object(py).into_object())
+ let tree_metadata = PyBytes::new(py, &tree_metadata);
+ let tuple = (packed, tree_metadata, append);
+ Ok(tuple.to_py_object(py).into_object())
},
Err(_) => Err(PyErr::new::<exc::OSError, _>(
py,
diff --git a/rust/hg-core/src/operations/list_tracked_files.rs b/rust/hg-core/src/operations/list_tracked_files.rs
--- a/rust/hg-core/src/operations/list_tracked_files.rs
+++ b/rust/hg-core/src/operations/list_tracked_files.rs
@@ -22,27 +22,33 @@
pub struct Dirstate {
/// The `dirstate` content.
content: Vec<u8>,
- dirstate_v2: bool,
+ v2_metadata: Option<Vec<u8>>,
}
impl Dirstate {
pub fn new(repo: &Repo) -> Result<Self, HgError> {
let mut content = repo.hg_vfs().read("dirstate")?;
- if repo.has_dirstate_v2() {
+ let v2_metadata = if repo.has_dirstate_v2() {
let docket = read_docket(&content)?;
+ let meta = docket.tree_metadata().to_vec();
content = repo.hg_vfs().read(docket.data_filename())?;
- }
+ Some(meta)
+ } else {
+ None
+ };
Ok(Self {
content,
- dirstate_v2: repo.has_dirstate_v2(),
+ v2_metadata,
})
}
pub fn tracked_files(&self) -> Result<Vec<&HgPath>, DirstateError> {
let mut files = Vec::new();
if !self.content.is_empty() {
- if self.dirstate_v2 {
- for_each_tracked_path(&self.content, |path| files.push(path))?
+ if let Some(meta) = &self.v2_metadata {
+ for_each_tracked_path(&self.content, meta, |path| {
+ files.push(path)
+ })?
} else {
let _parents = parse_dirstate_entries(
&self.content,
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -47,6 +47,18 @@
pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
+/// Must match the constant of the same name in
+/// `mercurial/dirstateutils/docket.py`
+const TREE_METADATA_SIZE: usize = 40;
+
+/// Make sure that size-affecting changes are made knowingly
+#[allow(unused)]
+fn static_assert_size_of() {
+ let _ = std::mem::transmute::<DocketHeader, [u8; 121]>;
+ let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
+ let _ = std::mem::transmute::<Node, [u8; 43]>;
+}
+
// Must match `HEADER` in `mercurial/dirstateutils/docket.py`
#[derive(BytesCast)]
#[repr(C)]
@@ -58,6 +70,8 @@
/// Counted in bytes
data_size: Size,
+ metadata: TreeMetadata,
+
uuid_size: u8,
}
@@ -68,7 +82,7 @@
#[derive(BytesCast)]
#[repr(C)]
-struct Root {
+struct TreeMetadata {
root_nodes: ChildNodes,
nodes_with_entry_count: Size,
nodes_with_copy_source_count: Size,
@@ -134,7 +148,7 @@
/// - All direct children of this directory (as returned by
/// `std::fs::read_dir`) either have a corresponding dirstate node, or
/// are ignored by ignore patterns whose hash is in
- /// `Root::ignore_patterns_hash`.
+ /// `TreeMetadata::ignore_patterns_hash`.
///
/// This means that if `std::fs::symlink_metadata` later reports the
/// same modification time and ignored patterns haven’t changed, a run
@@ -205,13 +219,6 @@
/// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
type OptPathSlice = PathSlice;
-/// Make sure that size-affecting changes are made knowingly
-fn _static_assert_size_of() {
- let _ = std::mem::transmute::<DocketHeader, [u8; 81]>;
- let _ = std::mem::transmute::<Root, [u8; 40]>;
- let _ = std::mem::transmute::<Node, [u8; 43]>;
-}
-
/// Unexpected file format found in `.hg/dirstate` with the "v2" format.
///
/// This should only happen if Mercurial is buggy or a repository is corrupted.
@@ -242,6 +249,10 @@
DirstateParents { p1, p2 }
}
+ pub fn tree_metadata(&self) -> &[u8] {
+ self.header.metadata.as_bytes()
+ }
+
pub fn data_size(&self) -> usize {
// This `unwrap` could only panic on a 16-bit CPU
self.header.data_size.get().try_into().unwrap()
@@ -265,40 +276,25 @@
}
}
-fn read_root<'on_disk>(
- on_disk: &'on_disk [u8],
-) -> Result<&'on_disk Root, DirstateV2ParseError> {
- // Find the `Root` at the end of the given slice
- let root_offset = on_disk
- .len()
- .checked_sub(std::mem::size_of::<Root>())
- // A non-empty slice too short is an error
- .ok_or(DirstateV2ParseError)?;
- let (root, _) = Root::from_bytes(&on_disk[root_offset..])
- .map_err(|_| DirstateV2ParseError)?;
- Ok(root)
-}
-
pub(super) fn read<'on_disk>(
on_disk: &'on_disk [u8],
+ metadata: &[u8],
) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
if on_disk.is_empty() {
return Ok(DirstateMap::empty(on_disk));
}
- let root = read_root(on_disk)?;
- let mut unreachable_bytes = root.unreachable_bytes.get();
- // Each append writes a new `Root`, so it’s never reused
- unreachable_bytes += std::mem::size_of::<Root>() as u32;
+ let (meta, _) = TreeMetadata::from_bytes(metadata)
+ .map_err(|_| DirstateV2ParseError)?;
let dirstate_map = DirstateMap {
on_disk,
root: dirstate_map::ChildNodes::OnDisk(read_nodes(
on_disk,
- root.root_nodes,
+ meta.root_nodes,
)?),
- nodes_with_entry_count: root.nodes_with_entry_count.get(),
- nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(),
- ignore_patterns_hash: root.ignore_patterns_hash,
- unreachable_bytes,
+ nodes_with_entry_count: meta.nodes_with_entry_count.get(),
+ nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(),
+ ignore_patterns_hash: meta.ignore_patterns_hash,
+ unreachable_bytes: meta.unreachable_bytes.get(),
};
Ok(dirstate_map)
}
@@ -530,9 +526,11 @@
pub(crate) fn for_each_tracked_path<'on_disk>(
on_disk: &'on_disk [u8],
+ metadata: &[u8],
mut f: impl FnMut(&'on_disk HgPath),
) -> Result<(), DirstateV2ParseError> {
- let root = read_root(on_disk)?;
+ let (meta, _) = TreeMetadata::from_bytes(metadata)
+ .map_err(|_| DirstateV2ParseError)?;
fn recur<'on_disk>(
on_disk: &'on_disk [u8],
nodes: ChildNodes,
@@ -548,23 +546,23 @@
}
Ok(())
}
- recur(on_disk, root.root_nodes, &mut f)
+ recur(on_disk, meta.root_nodes, &mut f)
}
-/// Returns new data together with whether that data should be appended to the
-/// existing data file whose content is at `dirstate_map.on_disk` (true),
-/// instead of written to a new data file (false).
+/// Returns new data and metadata, together with whether that data should be
+/// appended to the existing data file whose content is at
+/// `dirstate_map.on_disk` (true), instead of written to a new data file
+/// (false).
pub(super) fn write(
dirstate_map: &mut DirstateMap,
can_append: bool,
-) -> Result<(Vec<u8>, bool), DirstateError> {
+) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
let append = can_append && dirstate_map.write_should_append();
// This ignores the space for paths, and for nodes without an entry.
// TODO: better estimate? Skip the `Vec` and write to a file directly?
- let size_guess = std::mem::size_of::<Root>()
- + std::mem::size_of::<Node>()
- * dirstate_map.nodes_with_entry_count as usize;
+ let size_guess = std::mem::size_of::<Node>()
+ * dirstate_map.nodes_with_entry_count as usize;
let mut writer = Writer {
dirstate_map,
@@ -574,7 +572,7 @@
let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
- let root = Root {
+ let meta = TreeMetadata {
root_nodes,
nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
nodes_with_copy_source_count: dirstate_map
@@ -583,8 +581,7 @@
unreachable_bytes: dirstate_map.unreachable_bytes.into(),
ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
};
- writer.out.extend(root.as_bytes());
- Ok((writer.out, append))
+ Ok((writer.out, meta.as_bytes().to_vec(), append))
}
struct Writer<'dmap, 'on_disk> {
diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs
@@ -182,16 +182,17 @@
/// serialize bytes to write a dirstate data file to disk in dirstate-v2
/// format.
///
- /// Returns new data together with whether that data should be appended to
- /// the existing data file whose content is at `self.on_disk` (true),
- /// instead of written to a new data file (false).
+ /// Returns new data and metadata together with whether that data should be
+ /// appended to the existing data file whose content is at
+ /// `self.on_disk` (true), instead of written to a new data file
+ /// (false).
///
/// Note: this is only supported by the tree dirstate map.
fn pack_v2(
&mut self,
now: Timestamp,
can_append: bool,
- ) -> Result<(Vec<u8>, bool), DirstateError>;
+ ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError>;
/// Run the status algorithm.
///
@@ -395,7 +396,7 @@
&mut self,
_now: Timestamp,
_can_append: bool,
- ) -> Result<(Vec<u8>, bool), DirstateError> {
+ ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
panic!(
"should have used dirstate_tree::DirstateMap to use the v2 format"
)
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -420,9 +420,10 @@
pub fn new_v2(
on_disk: &'on_disk [u8],
data_size: usize,
+ metadata: &[u8],
) -> Result<Self, DirstateError> {
if let Some(data) = on_disk.get(..data_size) {
- Ok(on_disk::read(data)?)
+ Ok(on_disk::read(data, metadata)?)
} else {
Err(DirstateV2ParseError.into())
}
@@ -1090,15 +1091,16 @@
Ok(packed)
}
- /// Returns new data together with whether that data should be appended to
- /// the existing data file whose content is at `self.on_disk` (true),
- /// instead of written to a new data file (false).
+ /// Returns new data and metadata together with whether that data should be
+ /// appended to the existing data file whose content is at
+ /// `self.on_disk` (true), instead of written to a new data file
+ /// (false).
#[timed]
fn pack_v2(
&mut self,
now: Timestamp,
can_append: bool,
- ) -> Result<(Vec<u8>, bool), DirstateError> {
+ ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
// TODO: how do we want to handle this in 2038?
let now: i32 = now.0.try_into().expect("time overflow");
let mut paths = Vec::new();
diff --git a/mercurial/dirstateutils/docket.py b/mercurial/dirstateutils/docket.py
--- a/mercurial/dirstateutils/docket.py
+++ b/mercurial/dirstateutils/docket.py
@@ -14,47 +14,60 @@
V2_FORMAT_MARKER = b"dirstate-v2\n"
+# Must match the constant of the same name in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`
+TREE_METADATA_SIZE = 40
+
# * 12 bytes: format marker
# * 32 bytes: node ID of the working directory's first parent
# * 32 bytes: node ID of the working directory's second parent
# * 4 bytes: big-endian used size of the data file
+# * {TREE_METADATA_SIZE} bytes: tree metadata, parsed separately
# * 1 byte: length of the data file's UUID
# * variable: data file's UUID
#
# Node IDs are null-padded if shorter than 32 bytes.
# A data file shorter than the specified used size is corrupted (truncated)
-HEADER = struct.Struct(">{}s32s32sLB".format(len(V2_FORMAT_MARKER)))
+HEADER = struct.Struct(
+ ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
+)
class DirstateDocket(object):
data_filename_pattern = b'dirstate.%s.d'
- def __init__(self, parents, data_size, uuid):
+ def __init__(self, parents, data_size, tree_metadata, uuid):
self.parents = parents
self.data_size = data_size
+ self.tree_metadata = tree_metadata
self.uuid = uuid
@classmethod
- def with_new_uuid(cls, parents, data):
- return cls(parents, data, docket_mod.make_uid())
+ def with_new_uuid(cls, parents, data_size, tree_metadata):
+ return cls(parents, data_size, tree_metadata, docket_mod.make_uid())
@classmethod
def parse(cls, data, nodeconstants):
if not data:
parents = (nodeconstants.nullid, nodeconstants.nullid)
- return cls(parents, 0, None)
- marker, p1, p2, data_size, uuid_size = HEADER.unpack_from(data)
+ return cls(parents, 0, b'', None)
+ marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(data)
if marker != V2_FORMAT_MARKER:
raise ValueError("expected dirstate-v2 marker")
uuid = data[HEADER.size : HEADER.size + uuid_size]
p1 = p1[: nodeconstants.nodelen]
p2 = p2[: nodeconstants.nodelen]
- return cls((p1, p2), data_size, uuid)
+ return cls((p1, p2), data_size, meta, uuid)
def serialize(self):
p1, p2 = self.parents
header = HEADER.pack(
- V2_FORMAT_MARKER, p1, p2, self.data_size, len(self.uuid)
+ V2_FORMAT_MARKER,
+ p1,
+ p2,
+ self.data_size,
+ self.tree_metadata,
+ len(self.uuid),
)
return header + self.uuid
diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py
--- a/mercurial/dirstatemap.py
+++ b/mercurial/dirstatemap.py
@@ -640,7 +640,7 @@
else:
data = b''
self._rustmap = rustmod.DirstateMap.new_v2(
- data, self.docket.data_size
+ data, self.docket.data_size, self.docket.tree_metadata
)
parents = self.docket.parents
else:
@@ -666,7 +666,7 @@
# We can only append to an existing data file if there is one
can_append = self.docket.uuid is not None
- packed, append = self._rustmap.write_v2(now, can_append)
+ packed, meta, append = self._rustmap.write_v2(now, can_append)
if append:
docket = self.docket
with self._opener(docket.data_filename(), b'ab') as fp:
@@ -678,12 +678,13 @@
assert written == len(packed)
docket.data_size += len(packed)
docket.parents = self.parents()
+ docket.tree_metadata = meta
st.write(docket.serialize())
st.close()
else:
old_docket = self.docket
new_docket = docketmod.DirstateDocket.with_new_uuid(
- self.parents(), len(packed)
+ self.parents(), len(packed), meta
)
self._opener.write(new_docket.data_filename(), packed)
# Write the new docket after the new data file has been
diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
--- a/mercurial/debugcommands.py
+++ b/mercurial/debugcommands.py
@@ -999,11 +999,7 @@
if repo.dirstate._use_dirstate_v2:
docket = repo.dirstate._map.docket
hash_len = 20 # 160 bits for SHA-1
- hash_offset = docket.data_size - hash_len # hash is at the end
- data_filename = docket.data_filename()
- with repo.vfs(data_filename) as f:
- f.seek(hash_offset)
- hash_bytes = f.read(hash_len)
+ hash_bytes = docket.tree_metadata[-hash_len:]
print(binascii.hexlify(hash_bytes).decode())
To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel
More information about the Mercurial-devel
mailing list