D11094: dirstate-v2: Support appending to the same data file
SimonSapin
phabricator at mercurial-scm.org
Thu Jul 15 15:29:13 UTC 2021
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.
REVISION SUMMARY
For now weâre still writing the entire data every time, so appending is not
useful yet. Later weâll have new nodes pointing to some existing data for
nodes and paths that havenât changed.
The decision whether to append is pseudo-random in order to make tests exercise
both code paths. This will be replaced by a heuristic based on the amount
of unused existing data.
REPOSITORY
rHG Mercurial
BRANCH
default
REVISION DETAIL
https://phab.mercurial-scm.org/D11094
AFFECTED FILES
mercurial/dirstatemap.py
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/dispatch.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-cpython/src/dirstate/dirstate_map.rs
rust/hg-cpython/src/dirstate/dispatch.rs
CHANGE DETAILS
diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs
--- a/rust/hg-cpython/src/dirstate/dispatch.rs
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs
@@ -124,8 +124,12 @@
self.get_mut().pack_v1(parents, now)
}
- fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
- self.get_mut().pack_v2(now)
+ fn pack_v2(
+ &mut self,
+ now: Timestamp,
+ can_append: bool,
+ ) -> Result<(Vec<u8>, bool), DirstateError> {
+ self.get_mut().pack_v2(now, can_append)
}
fn status<'a>(
diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs
@@ -340,16 +340,23 @@
}
}
+ /// Returns new data together with whether that data should be appended to
+ /// the existing data file whose content is at `self.on_disk` (True),
+ /// instead of written to a new data file (False).
def write_v2(
&self,
- now: PyObject
- ) -> PyResult<PyBytes> {
+ now: PyObject,
+ can_append: bool,
+ ) -> PyResult<PyObject> {
let now = Timestamp(now.extract(py)?);
let mut inner = self.inner(py).borrow_mut();
- let result = inner.pack_v2(now);
+ let result = inner.pack_v2(now, can_append);
match result {
- Ok(packed) => Ok(PyBytes::new(py, &packed)),
+ Ok((packed, append)) => {
+ let packed = PyBytes::new(py, &packed);
+ Ok((packed, append).to_py_object(py).into_object())
+ },
Err(_) => Err(PyErr::new::<exc::OSError, _>(
py,
"Dirstate error".to_string(),
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -544,20 +544,28 @@
recur(on_disk, root.root_nodes, &mut f)
}
+/// Returns new data together with whether that data should be appended to the
+/// existing data file whose content is at `dirstate_map.on_disk` (true),
+/// instead of written to a new data file (false).
pub(super) fn write(
dirstate_map: &mut DirstateMap,
-) -> Result<Vec<u8>, DirstateError> {
- let root_len = std::mem::size_of::<Root>();
+ can_append: bool,
+) -> Result<(Vec<u8>, bool), DirstateError> {
+ let append = can_append && dirstate_map.write_should_append();
// This ignores the space for paths, and for nodes without an entry.
// TODO: better estimate? Skip the `Vec` and write to a file directly?
- let size_guess = root_len
+ let size_guess = std::mem::size_of::<Root>()
+ std::mem::size_of::<Node>()
* dirstate_map.nodes_with_entry_count as usize;
- let mut out = Vec::with_capacity(size_guess);
- let root_nodes =
- write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?;
+ let mut writer = Writer {
+ dirstate_map,
+ append,
+ out: Vec::with_capacity(size_guess),
+ };
+
+ let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
let root = Root {
root_nodes,
@@ -567,112 +575,121 @@
.into(),
ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
};
- out.extend(root.as_bytes());
- Ok(out)
+ writer.out.extend(root.as_bytes());
+ Ok((writer.out, append))
+}
+
+struct Writer<'dmap, 'on_disk> {
+ dirstate_map: &'dmap DirstateMap<'on_disk>,
+ append: bool,
+ out: Vec<u8>,
}
-fn write_nodes(
- dirstate_map: &DirstateMap,
- nodes: dirstate_map::ChildNodesRef,
- out: &mut Vec<u8>,
-) -> Result<ChildNodes, DirstateError> {
- // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
- // order. Sort to enable binary search in the written file.
- let nodes = nodes.sorted();
- let nodes_len = nodes.len();
+impl Writer<'_, '_> {
+ fn write_nodes(
+ &mut self,
+ nodes: dirstate_map::ChildNodesRef,
+ ) -> Result<ChildNodes, DirstateError> {
+ // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
+ // order. Sort to enable binary search in the written file.
+ let nodes = nodes.sorted();
+ let nodes_len = nodes.len();
- // First accumulate serialized nodes in a `Vec`
- let mut on_disk_nodes = Vec::with_capacity(nodes_len);
- for node in nodes {
- let children = write_nodes(
- dirstate_map,
- node.children(dirstate_map.on_disk)?,
- out,
- )?;
- let full_path = node.full_path(dirstate_map.on_disk)?;
- let full_path = write_path(full_path.as_bytes(), out);
- let copy_source =
- if let Some(source) = node.copy_source(dirstate_map.on_disk)? {
- write_path(source.as_bytes(), out)
+ // First accumulate serialized nodes in a `Vec`
+ let mut on_disk_nodes = Vec::with_capacity(nodes_len);
+ for node in nodes {
+ let children =
+ self.write_nodes(node.children(self.dirstate_map.on_disk)?)?;
+ let full_path = node.full_path(self.dirstate_map.on_disk)?;
+ let full_path = self.write_path(full_path.as_bytes());
+ let copy_source = if let Some(source) =
+ node.copy_source(self.dirstate_map.on_disk)?
+ {
+ self.write_path(source.as_bytes())
} else {
PathSlice {
start: 0.into(),
len: 0.into(),
}
};
- on_disk_nodes.push(match node {
- NodeRef::InMemory(path, node) => {
- let (state, data) = match &node.data {
- dirstate_map::NodeData::Entry(entry) => (
- entry.state.into(),
- Entry {
- mode: entry.mode.into(),
- mtime: entry.mtime.into(),
- size: entry.size.into(),
- },
- ),
- dirstate_map::NodeData::CachedDirectory { mtime } => {
- (b'd', Entry::from_timestamp(*mtime))
+ on_disk_nodes.push(match node {
+ NodeRef::InMemory(path, node) => {
+ let (state, data) = match &node.data {
+ dirstate_map::NodeData::Entry(entry) => (
+ entry.state.into(),
+ Entry {
+ mode: entry.mode.into(),
+ mtime: entry.mtime.into(),
+ size: entry.size.into(),
+ },
+ ),
+ dirstate_map::NodeData::CachedDirectory { mtime } => {
+ (b'd', Entry::from_timestamp(*mtime))
+ }
+ dirstate_map::NodeData::None => (
+ b'\0',
+ Entry {
+ mode: 0.into(),
+ mtime: 0.into(),
+ size: 0.into(),
+ },
+ ),
+ };
+ Node {
+ children,
+ copy_source,
+ full_path,
+ base_name_start: u16::try_from(path.base_name_start())
+ // Could only panic for paths over 64 KiB
+ .expect("dirstate-v2 path length overflow")
+ .into(),
+ descendants_with_entry_count: node
+ .descendants_with_entry_count
+ .into(),
+ tracked_descendants_count: node
+ .tracked_descendants_count
+ .into(),
+ state,
+ data,
}
- dirstate_map::NodeData::None => (
- b'\0',
- Entry {
- mode: 0.into(),
- mtime: 0.into(),
- size: 0.into(),
- },
- ),
- };
- Node {
+ }
+ NodeRef::OnDisk(node) => Node {
children,
copy_source,
full_path,
- base_name_start: u16::try_from(path.base_name_start())
- // Could only panic for paths over 64 KiB
- .expect("dirstate-v2 path length overflow")
- .into(),
- descendants_with_entry_count: node
- .descendants_with_entry_count
- .into(),
- tracked_descendants_count: node
- .tracked_descendants_count
- .into(),
- state,
- data,
- }
- }
- NodeRef::OnDisk(node) => Node {
- children,
- copy_source,
- full_path,
- ..*node
- },
- })
+ ..*node
+ },
+ })
+ }
+ // ⦠so we can write them contiguously, after writing everything else
+ // they refer to.
+ let start = self.curent_offset();
+ let len = u32::try_from(nodes_len)
+ // Could only panic with over 4 billion nodes
+ .expect("dirstate-v2 path length overflow")
+ .into();
+ self.out.extend(on_disk_nodes.as_bytes());
+ Ok(ChildNodes { start, len })
}
- // ⦠so we can write them contiguously, after writing everything else they
- // refer to.
- let start = curent_offset(out);
- let len = u32::try_from(nodes_len)
- // Could only panic with over 4 billion nodes
- .expect("dirstate-v2 path length overflow")
- .into();
- out.extend(on_disk_nodes.as_bytes());
- Ok(ChildNodes { start, len })
-}
-fn curent_offset(out: &Vec<u8>) -> Offset {
- u32::try_from(out.len())
- // Could only panic for a dirstate file larger than 4 GiB
- .expect("dirstate-v2 offset overflow")
- .into()
-}
+ fn curent_offset(&mut self) -> Offset {
+ let mut offset = self.out.len();
+ if self.append {
+ offset += self.dirstate_map.on_disk.len()
+ }
+ u32::try_from(offset)
+ // Could only panic for a dirstate file larger than 4 GiB
+ .expect("dirstate-v2 offset overflow")
+ .into()
+ }
-fn write_path(slice: &[u8], out: &mut Vec<u8>) -> PathSlice {
- let start = curent_offset(out);
- let len = u16::try_from(slice.len())
- // Could only panic for paths over 64 KiB
- .expect("dirstate-v2 path length overflow")
- .into();
- out.extend(slice.as_bytes());
- PathSlice { start, len }
+ fn write_path(&mut self, slice: &[u8]) -> PathSlice {
+ let start = self.curent_offset();
+ let len = u16::try_from(slice.len())
+ // Could only panic for paths over 64 KiB
+ .expect("dirstate-v2 path length overflow")
+ .into();
+ self.out.extend(slice.as_bytes());
+ PathSlice { start, len }
+ }
}
diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs
@@ -179,11 +179,19 @@
/// Clear mtimes that are ambigous with `now` (similar to
/// `clear_ambiguous_times` but for all files in the dirstate map), and
- /// serialize bytes to write the `.hg/dirstate` file to disk in dirstate-v2
+ /// serialize bytes to write a dirstate data file to disk in dirstate-v2
/// format.
///
+ /// Returns new data together with whether that data should be appended to
+ /// the existing data file whose content is at `self.on_disk` (true),
+ /// instead of written to a new data file (false).
+ ///
/// Note: this is only supported by the tree dirstate map.
- fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError>;
+ fn pack_v2(
+ &mut self,
+ now: Timestamp,
+ can_append: bool,
+ ) -> Result<(Vec<u8>, bool), DirstateError>;
/// Run the status algorithm.
///
@@ -383,7 +391,11 @@
self.pack(parents, now)
}
- fn pack_v2(&mut self, _now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+ fn pack_v2(
+ &mut self,
+ _now: Timestamp,
+ _can_append: bool,
+ ) -> Result<(Vec<u8>, bool), DirstateError> {
panic!(
"should have used dirstate_tree::DirstateMap to use the v2 format"
)
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -468,6 +468,24 @@
Ok((map, parents))
}
+ /// Assuming dirstate-v2 format, returns whether the next write should
+ /// append to the existing data file that contains `self.on_disk` (true),
+ /// or create a new data file from scratch (false).
+ pub(super) fn write_should_append(&self) -> bool {
+ // Soon this will be a heuristic based on the amount of unreachable
+ // data. For now itâs pseudo-random in order to make tests exercise
+ // both code paths.
+
+ fn bad_rng() -> u32 {
+ std::time::SystemTime::now()
+ .duration_since(std::time::UNIX_EPOCH)
+ .unwrap()
+ .subsec_millis()
+ }
+
+ bad_rng() % 2 == 0
+ }
+
fn get_node<'tree>(
&'tree self,
path: &HgPath,
@@ -1043,8 +1061,15 @@
Ok(packed)
}
+ /// Returns new data together with whether that data should be appended to
+ /// the existing data file whose content is at `self.on_disk` (true),
+ /// instead of written to a new data file (false).
#[timed]
- fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+ fn pack_v2(
+ &mut self,
+ now: Timestamp,
+ can_append: bool,
+ ) -> Result<(Vec<u8>, bool), DirstateError> {
// TODO:Â how do we want to handle this in 2038?
let now: i32 = now.0.try_into().expect("time overflow");
let mut paths = Vec::new();
@@ -1063,7 +1088,7 @@
self.clear_known_ambiguous_mtimes(&paths)?;
- on_disk::write(self)
+ on_disk::write(self, can_append)
}
fn status<'a>(
diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py
--- a/mercurial/dirstatemap.py
+++ b/mercurial/dirstatemap.py
@@ -657,8 +657,30 @@
return self._rustmap
def write(self, tr, st, now):
- if self._use_dirstate_v2:
- packed = self._rustmap.write_v2(now)
+ if not self._use_dirstate_v2:
+ p1, p2 = self.parents()
+ packed = self._rustmap.write_v1(p1, p2, now)
+ st.write(packed)
+ st.close()
+ return
+
+ # We can only append to an existing data file if there is one
+ can_append = self.docket.uuid is not None
+ packed, append = self._rustmap.write_v2(now, can_append)
+ if append:
+ docket = self.docket
+ with self._opener(docket.data_filename(), b'ab') as fp:
+ assert fp.tell() == docket.data_size, (
+ fp.tell(),
+ docket.data_size,
+ )
+ written = fp.write(packed)
+ assert written == len(packed)
+ docket.data_size += len(packed)
+ docket.parents = self.parents()
+ st.write(docket.serialize())
+ st.close()
+ else:
old_docket = self.docket
new_docket = docketmod.DirstateDocket.with_new_uuid(
self.parents(), len(packed)
@@ -674,11 +696,8 @@
if old_docket.uuid:
self._opener.unlink(old_docket.data_filename())
self._docket = new_docket
- else:
- p1, p2 = self.parents()
- packed = self._rustmap.write_v1(p1, p2, now)
- st.write(packed)
- st.close()
+ # Reload from the newly-written file
+ util.clearcachedproperty(self, b"_rustmap")
self._dirtyparents = False
@propertycache
To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel
More information about the Mercurial-devel
mailing list