D11098: dirstate-v2: Move fixed-size tree metadata into the docket file

SimonSapin phabricator at mercurial-scm.org
Thu Jul 15 21:04:49 UTC 2021


SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  Before this changeset, the dirstate-v2 data file contained not only nodes
  and paths that may be reused when appending to an existing file,
  but also some fixed-size metadata that applies to the entire tree
  and was added at the end of the data file for every append.
  
  This changeset moves that metadata into the docket file, so that repeated
  "append" operations without meaningful changes no longer need to grow any
  file.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11098

AFFECTED FILES
  mercurial/debugcommands.py
  mercurial/dirstatemap.py
  mercurial/dirstateutils/docket.py
  rust/hg-core/src/dirstate_tree/dirstate_map.rs
  rust/hg-core/src/dirstate_tree/dispatch.rs
  rust/hg-core/src/dirstate_tree/on_disk.rs
  rust/hg-core/src/operations/list_tracked_files.rs
  rust/hg-cpython/src/dirstate/dirstate_map.rs
  rust/hg-cpython/src/dirstate/dispatch.rs
  rust/hg-cpython/src/dirstate/owning.rs
  rust/rhg/src/commands/status.rs

CHANGE DETAILS

diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs
+++ b/rust/rhg/src/commands/status.rs
@@ -168,13 +168,16 @@
     let repo = invocation.repo?;
     let dirstate_data_mmap;
     let (mut dmap, parents) = if repo.has_dirstate_v2() {
+        let docket_data =
+            repo.hg_vfs().read("dirstate").io_not_found_as_none()?;
         let parents;
         let dirstate_data;
         let data_size;
-        if let Some(docket_data) =
-            repo.hg_vfs().read("dirstate").io_not_found_as_none()?
-        {
-            let docket = on_disk::read_docket(&docket_data)?;
+        let docket;
+        let tree_metadata;
+        if let Some(docket_data) = &docket_data {
+            docket = on_disk::read_docket(docket_data)?;
+            tree_metadata = docket.tree_metadata();
             parents = Some(docket.parents());
             data_size = docket.data_size();
             dirstate_data_mmap = repo
@@ -184,10 +187,12 @@
             dirstate_data = dirstate_data_mmap.as_deref().unwrap_or(b"");
         } else {
             parents = None;
+            tree_metadata = b"";
             data_size = 0;
             dirstate_data = b"";
         }
-        let dmap = DirstateMap::new_v2(dirstate_data, data_size)?;
+        let dmap =
+            DirstateMap::new_v2(dirstate_data, data_size, tree_metadata)?;
         (dmap, parents)
     } else {
         dirstate_data_mmap =
diff --git a/rust/hg-cpython/src/dirstate/owning.rs b/rust/hg-cpython/src/dirstate/owning.rs
--- a/rust/hg-cpython/src/dirstate/owning.rs
+++ b/rust/hg-cpython/src/dirstate/owning.rs
@@ -49,9 +49,11 @@
         py: Python,
         on_disk: PyBytes,
         data_size: usize,
+        tree_metadata: PyBytes,
     ) -> Result<Self, DirstateError> {
         let bytes: &'_ [u8] = on_disk.data(py);
-        let map = DirstateMap::new_v2(bytes, data_size)?;
+        let map =
+            DirstateMap::new_v2(bytes, data_size, tree_metadata.data(py))?;
 
         // Like in `bytes` above, this `'_` lifetime parameter borrows from
         // the bytes buffer owned by `on_disk`.
diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs
--- a/rust/hg-cpython/src/dirstate/dispatch.rs
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs
@@ -128,7 +128,7 @@
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         self.get_mut().pack_v2(now, can_append)
     }
 
diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs
@@ -84,12 +84,14 @@
     def new_v2(
         on_disk: PyBytes,
         data_size: usize,
+        tree_metadata: PyBytes,
     ) -> PyResult<PyObject> {
         let dirstate_error = |e: DirstateError| {
             PyErr::new::<exc::OSError, _>(py, format!("Dirstate error: {:?}", e))
         };
-        let inner = OwningDirstateMap::new_v2(py, on_disk, data_size)
-                .map_err(dirstate_error)?;
+        let inner = OwningDirstateMap::new_v2(
+            py, on_disk, data_size, tree_metadata,
+        ).map_err(dirstate_error)?;
         let map = Self::create_instance(py, Box::new(inner))?;
         Ok(map.into_object())
     }
@@ -353,9 +355,11 @@
         let mut inner = self.inner(py).borrow_mut();
         let result = inner.pack_v2(now, can_append);
         match result {
-            Ok((packed, append)) => {
+            Ok((packed, tree_metadata, append)) => {
                 let packed = PyBytes::new(py, &packed);
-                Ok((packed, append).to_py_object(py).into_object())
+                let tree_metadata = PyBytes::new(py, &tree_metadata);
+                let tuple = (packed, tree_metadata, append);
+                Ok(tuple.to_py_object(py).into_object())
             },
             Err(_) => Err(PyErr::new::<exc::OSError, _>(
                 py,
diff --git a/rust/hg-core/src/operations/list_tracked_files.rs b/rust/hg-core/src/operations/list_tracked_files.rs
--- a/rust/hg-core/src/operations/list_tracked_files.rs
+++ b/rust/hg-core/src/operations/list_tracked_files.rs
@@ -22,27 +22,33 @@
 pub struct Dirstate {
     /// The `dirstate` content.
     content: Vec<u8>,
-    dirstate_v2: bool,
+    v2_metadata: Option<Vec<u8>>,
 }
 
 impl Dirstate {
     pub fn new(repo: &Repo) -> Result<Self, HgError> {
         let mut content = repo.hg_vfs().read("dirstate")?;
-        if repo.has_dirstate_v2() {
+        let v2_metadata = if repo.has_dirstate_v2() {
             let docket = read_docket(&content)?;
+            let meta = docket.tree_metadata().to_vec();
             content = repo.hg_vfs().read(docket.data_filename())?;
-        }
+            Some(meta)
+        } else {
+            None
+        };
         Ok(Self {
             content,
-            dirstate_v2: repo.has_dirstate_v2(),
+            v2_metadata,
         })
     }
 
     pub fn tracked_files(&self) -> Result<Vec<&HgPath>, DirstateError> {
         let mut files = Vec::new();
         if !self.content.is_empty() {
-            if self.dirstate_v2 {
-                for_each_tracked_path(&self.content, |path| files.push(path))?
+            if let Some(meta) = &self.v2_metadata {
+                for_each_tracked_path(&self.content, meta, |path| {
+                    files.push(path)
+                })?
             } else {
                 let _parents = parse_dirstate_entries(
                     &self.content,
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -47,6 +47,18 @@
 pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
 pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
 
+/// Must match the constant of the same name in
+/// `mercurial/dirstateutils/docket.py`
+const TREE_METADATA_SIZE: usize = 40;
+
+/// Make sure that size-affecting changes are made knowingly
+#[allow(unused)]
+fn static_assert_size_of() {
+    let _ = std::mem::transmute::<DocketHeader, [u8; 121]>;
+    let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
+    let _ = std::mem::transmute::<Node, [u8; 43]>;
+}
+
 // Must match `HEADER` in `mercurial/dirstateutils/docket.py`
 #[derive(BytesCast)]
 #[repr(C)]
@@ -58,6 +70,8 @@
     /// Counted in bytes
     data_size: Size,
 
+    metadata: TreeMetadata,
+
     uuid_size: u8,
 }
 
@@ -68,7 +82,7 @@
 
 #[derive(BytesCast)]
 #[repr(C)]
-struct Root {
+struct TreeMetadata {
     root_nodes: ChildNodes,
     nodes_with_entry_count: Size,
     nodes_with_copy_source_count: Size,
@@ -134,7 +148,7 @@
     ///   - All direct children of this directory (as returned by
     ///     `std::fs::read_dir`) either have a corresponding dirstate node, or
     ///     are ignored by ignore patterns whose hash is in
-    ///     `Root::ignore_patterns_hash`.
+    ///     `TreeMetadata::ignore_patterns_hash`.
     ///
     ///   This means that if `std::fs::symlink_metadata` later reports the
     ///   same modification time and ignored patterns haven’t changed, a run
@@ -205,13 +219,6 @@
 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
 type OptPathSlice = PathSlice;
 
-/// Make sure that size-affecting changes are made knowingly
-fn _static_assert_size_of() {
-    let _ = std::mem::transmute::<DocketHeader, [u8; 81]>;
-    let _ = std::mem::transmute::<Root, [u8; 40]>;
-    let _ = std::mem::transmute::<Node, [u8; 43]>;
-}
-
 /// Unexpected file format found in `.hg/dirstate` with the "v2" format.
 ///
 /// This should only happen if Mercurial is buggy or a repository is corrupted.
@@ -242,6 +249,10 @@
         DirstateParents { p1, p2 }
     }
 
+    pub fn tree_metadata(&self) -> &[u8] {
+        self.header.metadata.as_bytes()
+    }
+
     pub fn data_size(&self) -> usize {
         // This `unwrap` could only panic on a 16-bit CPU
         self.header.data_size.get().try_into().unwrap()
@@ -265,40 +276,25 @@
     }
 }
 
-fn read_root<'on_disk>(
-    on_disk: &'on_disk [u8],
-) -> Result<&'on_disk Root, DirstateV2ParseError> {
-    // Find the `Root` at the end of the given slice
-    let root_offset = on_disk
-        .len()
-        .checked_sub(std::mem::size_of::<Root>())
-        // A non-empty slice too short is an error
-        .ok_or(DirstateV2ParseError)?;
-    let (root, _) = Root::from_bytes(&on_disk[root_offset..])
-        .map_err(|_| DirstateV2ParseError)?;
-    Ok(root)
-}
-
 pub(super) fn read<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
 ) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
     if on_disk.is_empty() {
         return Ok(DirstateMap::empty(on_disk));
     }
-    let root = read_root(on_disk)?;
-    let mut unreachable_bytes = root.unreachable_bytes.get();
-    // Each append writes a new `Root`, so it’s never reused
-    unreachable_bytes += std::mem::size_of::<Root>() as u32;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     let dirstate_map = DirstateMap {
         on_disk,
         root: dirstate_map::ChildNodes::OnDisk(read_nodes(
             on_disk,
-            root.root_nodes,
+            meta.root_nodes,
         )?),
-        nodes_with_entry_count: root.nodes_with_entry_count.get(),
-        nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(),
-        ignore_patterns_hash: root.ignore_patterns_hash,
-        unreachable_bytes,
+        nodes_with_entry_count: meta.nodes_with_entry_count.get(),
+        nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(),
+        ignore_patterns_hash: meta.ignore_patterns_hash,
+        unreachable_bytes: meta.unreachable_bytes.get(),
     };
     Ok(dirstate_map)
 }
@@ -530,9 +526,11 @@
 
 pub(crate) fn for_each_tracked_path<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
     mut f: impl FnMut(&'on_disk HgPath),
 ) -> Result<(), DirstateV2ParseError> {
-    let root = read_root(on_disk)?;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     fn recur<'on_disk>(
         on_disk: &'on_disk [u8],
         nodes: ChildNodes,
@@ -548,23 +546,23 @@
         }
         Ok(())
     }
-    recur(on_disk, root.root_nodes, &mut f)
+    recur(on_disk, meta.root_nodes, &mut f)
 }
 
-/// Returns new data together with whether that data should be appended to the
-/// existing data file whose content is at `dirstate_map.on_disk` (true),
-/// instead of written to a new data file (false).
+/// Returns new data and metadata, together with whether that data should be
+/// appended to the existing data file whose content is at
+/// `dirstate_map.on_disk` (true), instead of written to a new data file
+/// (false).
 pub(super) fn write(
     dirstate_map: &mut DirstateMap,
     can_append: bool,
-) -> Result<(Vec<u8>, bool), DirstateError> {
+) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
     let append = can_append && dirstate_map.write_should_append();
 
     // This ignores the space for paths, and for nodes without an entry.
     // TODO: better estimate? Skip the `Vec` and write to a file directly?
-    let size_guess = std::mem::size_of::<Root>()
-        + std::mem::size_of::<Node>()
-            * dirstate_map.nodes_with_entry_count as usize;
+    let size_guess = std::mem::size_of::<Node>()
+        * dirstate_map.nodes_with_entry_count as usize;
 
     let mut writer = Writer {
         dirstate_map,
@@ -574,7 +572,7 @@
 
     let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
 
-    let root = Root {
+    let meta = TreeMetadata {
         root_nodes,
         nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
         nodes_with_copy_source_count: dirstate_map
@@ -583,8 +581,7 @@
         unreachable_bytes: dirstate_map.unreachable_bytes.into(),
         ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
-    writer.out.extend(root.as_bytes());
-    Ok((writer.out, append))
+    Ok((writer.out, meta.as_bytes().to_vec(), append))
 }
 
 struct Writer<'dmap, 'on_disk> {
diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs
@@ -182,16 +182,17 @@
     /// serialize bytes to write a dirstate data file to disk in dirstate-v2
     /// format.
     ///
-    /// Returns new data together with whether that data should be appended to
-    /// the existing data file whose content is at `self.on_disk` (true),
-    /// instead of written to a new data file (false).
+    /// Returns new data and metadata together with whether that data should be
+    /// appended to the existing data file whose content is at
+    /// `self.on_disk` (true), instead of written to a new data file
+    /// (false).
     ///
     /// Note: this is only supported by the tree dirstate map.
     fn pack_v2(
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError>;
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError>;
 
     /// Run the status algorithm.
     ///
@@ -395,7 +396,7 @@
         &mut self,
         _now: Timestamp,
         _can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         panic!(
             "should have used dirstate_tree::DirstateMap to use the v2 format"
         )
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -420,9 +420,10 @@
     pub fn new_v2(
         on_disk: &'on_disk [u8],
         data_size: usize,
+        metadata: &[u8],
     ) -> Result<Self, DirstateError> {
         if let Some(data) = on_disk.get(..data_size) {
-            Ok(on_disk::read(data)?)
+            Ok(on_disk::read(data, metadata)?)
         } else {
             Err(DirstateV2ParseError.into())
         }
@@ -1090,15 +1091,16 @@
         Ok(packed)
     }
 
-    /// Returns new data together with whether that data should be appended to
-    /// the existing data file whose content is at `self.on_disk` (true),
-    /// instead of written to a new data file (false).
+    /// Returns new data and metadata together with whether that data should be
+    /// appended to the existing data file whose content is at
+    /// `self.on_disk` (true), instead of written to a new data file
+    /// (false).
     #[timed]
     fn pack_v2(
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         // TODO: how do we want to handle this in 2038?
         let now: i32 = now.0.try_into().expect("time overflow");
         let mut paths = Vec::new();
diff --git a/mercurial/dirstateutils/docket.py b/mercurial/dirstateutils/docket.py
--- a/mercurial/dirstateutils/docket.py
+++ b/mercurial/dirstateutils/docket.py
@@ -14,47 +14,60 @@
 
 V2_FORMAT_MARKER = b"dirstate-v2\n"
 
+# Must match the constant of the same name in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`
+TREE_METADATA_SIZE = 40
+
 # * 12 bytes: format marker
 # * 32 bytes: node ID of the working directory's first parent
 # * 32 bytes: node ID of the working directory's second parent
 # * 4 bytes: big-endian used size of the data file
+# * {TREE_METADATA_SIZE} bytes: tree metadata, parsed separately
 # * 1 byte: length of the data file's UUID
 # * variable: data file's UUID
 #
 # Node IDs are null-padded if shorter than 32 bytes.
 # A data file shorter than the specified used size is corrupted (truncated)
-HEADER = struct.Struct(">{}s32s32sLB".format(len(V2_FORMAT_MARKER)))
+HEADER = struct.Struct(
+    ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
+)
 
 
 class DirstateDocket(object):
     data_filename_pattern = b'dirstate.%s.d'
 
-    def __init__(self, parents, data_size, uuid):
+    def __init__(self, parents, data_size, tree_metadata, uuid):
         self.parents = parents
         self.data_size = data_size
+        self.tree_metadata = tree_metadata
         self.uuid = uuid
 
     @classmethod
-    def with_new_uuid(cls, parents, data):
-        return cls(parents, data, docket_mod.make_uid())
+    def with_new_uuid(cls, parents, data_size, tree_metadata):
+        return cls(parents, data_size, tree_metadata, docket_mod.make_uid())
 
     @classmethod
     def parse(cls, data, nodeconstants):
         if not data:
             parents = (nodeconstants.nullid, nodeconstants.nullid)
-            return cls(parents, 0, None)
-        marker, p1, p2, data_size, uuid_size = HEADER.unpack_from(data)
+            return cls(parents, 0, b'', None)
+        marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(data)
         if marker != V2_FORMAT_MARKER:
             raise ValueError("expected dirstate-v2 marker")
         uuid = data[HEADER.size : HEADER.size + uuid_size]
         p1 = p1[: nodeconstants.nodelen]
         p2 = p2[: nodeconstants.nodelen]
-        return cls((p1, p2), data_size, uuid)
+        return cls((p1, p2), data_size, meta, uuid)
 
     def serialize(self):
         p1, p2 = self.parents
         header = HEADER.pack(
-            V2_FORMAT_MARKER, p1, p2, self.data_size, len(self.uuid)
+            V2_FORMAT_MARKER,
+            p1,
+            p2,
+            self.data_size,
+            self.tree_metadata,
+            len(self.uuid),
         )
         return header + self.uuid
 
diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py
--- a/mercurial/dirstatemap.py
+++ b/mercurial/dirstatemap.py
@@ -640,7 +640,7 @@
                 else:
                     data = b''
                 self._rustmap = rustmod.DirstateMap.new_v2(
-                    data, self.docket.data_size
+                    data, self.docket.data_size, self.docket.tree_metadata
                 )
                 parents = self.docket.parents
             else:
@@ -666,7 +666,7 @@
 
             # We can only append to an existing data file if there is one
             can_append = self.docket.uuid is not None
-            packed, append = self._rustmap.write_v2(now, can_append)
+            packed, meta, append = self._rustmap.write_v2(now, can_append)
             if append:
                 docket = self.docket
                 with self._opener(docket.data_filename(), b'ab') as fp:
@@ -678,12 +678,13 @@
                     assert written == len(packed)
                 docket.data_size += len(packed)
                 docket.parents = self.parents()
+                docket.tree_metadata = meta
                 st.write(docket.serialize())
                 st.close()
             else:
                 old_docket = self.docket
                 new_docket = docketmod.DirstateDocket.with_new_uuid(
-                    self.parents(), len(packed)
+                    self.parents(), len(packed), meta
                 )
                 self._opener.write(new_docket.data_filename(), packed)
                 # Write the new docket after the new data file has been
diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
--- a/mercurial/debugcommands.py
+++ b/mercurial/debugcommands.py
@@ -999,11 +999,7 @@
     if repo.dirstate._use_dirstate_v2:
         docket = repo.dirstate._map.docket
         hash_len = 20  # 160 bits for SHA-1
-        hash_offset = docket.data_size - hash_len  # hash is at the end
-        data_filename = docket.data_filename()
-        with repo.vfs(data_filename) as f:
-            f.seek(hash_offset)
-            hash_bytes = f.read(hash_len)
+        hash_bytes = docket.tree_metadata[-hash_len:]
         print(binascii.hexlify(hash_bytes).decode())
 
 



To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel


More information about the Mercurial-devel mailing list