[Commented On] D10826: dirstate-v2: Skip readdir in status based on directory mtime
baymax (Baymax, Your Personal Patch-care Companion)
phabricator at mercurial-scm.org
Tue Jun 1 21:06:48 UTC 2021
baymax added a comment.
baymax updated this revision to Diff 28384.
✅ refresh by Heptapod after a successful CI run (🐙 💚)
REPOSITORY
rHG Mercurial
CHANGES SINCE LAST UPDATE
https://phab.mercurial-scm.org/D10826?vs=28359&id=28384
BRANCH
default
CHANGES SINCE LAST ACTION
https://phab.mercurial-scm.org/D10826/new/
REVISION DETAIL
https://phab.mercurial-scm.org/D10826
AFFECTED FILES
rust/hg-core/Cargo.toml
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/dirstate_tree/status.rs
CHANGE DETAILS
diff --git a/rust/hg-core/src/dirstate_tree/status.rs b/rust/hg-core/src/dirstate_tree/status.rs
--- a/rust/hg-core/src/dirstate_tree/status.rs
+++ b/rust/hg-core/src/dirstate_tree/status.rs
@@ -2,8 +2,11 @@
use crate::dirstate_tree::dirstate_map::BorrowedPath;
use crate::dirstate_tree::dirstate_map::ChildNodesRef;
use crate::dirstate_tree::dirstate_map::DirstateMap;
+use crate::dirstate_tree::dirstate_map::NodeData;
use crate::dirstate_tree::dirstate_map::NodeRef;
use crate::dirstate_tree::on_disk::DirstateV2ParseError;
+use crate::dirstate_tree::on_disk::Timestamp;
+use crate::dirstate_tree::path_with_basename::WithBasename;
use crate::matchers::get_ignore_function;
use crate::matchers::Matcher;
use crate::utils::files::get_bytes_from_os_string;
@@ -18,10 +21,12 @@
use crate::StatusOptions;
use micro_timer::timed;
use rayon::prelude::*;
+use std::borrow::Cow;
use std::io;
use std::path::Path;
use std::path::PathBuf;
use std::sync::Mutex;
+use std::time::SystemTime;
/// Returns the status of the working directory compared to its parent
/// changeset.
@@ -52,19 +57,45 @@
options,
matcher,
ignore_fn,
- outcome: Mutex::new(DirstateStatus::default()),
+ outcome: Default::default(),
+ cached_directory_mtimes_to_add: Default::default(),
+ filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
};
let is_at_repo_root = true;
let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
let has_ignored_ancestor = false;
+ let root_cached_mtime = None;
+ let root_dir_metadata = None;
+ // If the path we have for the repository root is a symlink, do follow it.
+ // (As opposed to symlinks within the working directory which are not
+ // followed, using `std::fs::symlink_metadata`.)
common.traverse_fs_directory_and_dirstate(
has_ignored_ancestor,
dmap.root.as_ref(),
hg_path,
&root_dir,
+ root_dir_metadata,
+ root_cached_mtime,
is_at_repo_root,
)?;
- Ok((common.outcome.into_inner().unwrap(), warnings))
+ let outcome = common.outcome.into_inner().unwrap();
+ let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
+ for (path, mtime) in &to_add {
+ let node = DirstateMap::get_or_insert_node(
+ dmap.on_disk,
+ &mut dmap.root,
+ path,
+ WithBasename::to_cow_owned,
+ |_| {},
+ )?;
+ match &node.data {
+ NodeData::Entry(_) => {} // Don’t overwrite an entry
+ NodeData::CachedDirectory { .. } | NodeData::None => {
+ node.data = NodeData::CachedDirectory { mtime: *mtime }
+ }
+ }
+ }
+ Ok((outcome, warnings))
}
/// Bag of random things needed by various parts of the algorithm. Reduces the
@@ -75,6 +106,12 @@
matcher: &'a (dyn Matcher + Sync),
ignore_fn: IgnoreFnType<'a>,
outcome: Mutex<DirstateStatus<'on_disk>>,
+ cached_directory_mtimes_to_add:
+ Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+
+ /// The current time at the start of the `status()` algorithm, as measured
+ /// and possibly truncated by the filesystem.
+ filesystem_time_at_status_start: Option<SystemTime>,
}
impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
@@ -97,18 +134,54 @@
.push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
}
+ /// If this returns true, we can get accurate results by only using
+ /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
+ /// need to call `read_dir`.
+ fn can_skip_fs_readdir(
+ &self,
+ directory_metadata: Option<&std::fs::Metadata>,
+ cached_directory_mtime: Option<&Timestamp>,
+ ) -> bool {
+ if !self.options.list_unknown && !self.options.list_ignored {
+ // All states that we care about listing have corresponding
+ // dirstate entries.
+ // This happens for example with `hg status -mard`.
+ return true;
+ }
+ if let Some(cached_mtime) = cached_directory_mtime {
+ // The dirstate contains a cached mtime for this directory, set by
+ // a previous run of the `status` algorithm which found this
+ // directory eligible for `read_dir` caching.
+ if let Some(meta) = directory_metadata {
+ if let Ok(current_mtime) = meta.modified() {
+ if current_mtime == cached_mtime.into() {
+ // The mtime of that directory has not changed since
+ // then, which means that the
+ // results of `read_dir` should also
+ // be unchanged.
+ return true;
+ }
+ }
+ }
+ }
+ false
+ }
+
+ /// Returns whether the filesystem directory was found to have any entry
+ /// that does not have a corresponding dirstate tree node.
fn traverse_fs_directory_and_dirstate(
&self,
has_ignored_ancestor: bool,
dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
directory_fs_path: &Path,
+ directory_metadata: Option<&std::fs::Metadata>,
+ cached_directory_mtime: Option<&Timestamp>,
is_at_repo_root: bool,
- ) -> Result<(), DirstateV2ParseError> {
- if !self.options.list_unknown && !self.options.list_ignored {
- // We only care about files in the dirstate, so we can skip listing
- // filesystem directories entirely.
- return dirstate_nodes
+ ) -> Result<bool, DirstateV2ParseError> {
+ if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
+ {
+ dirstate_nodes
.par_iter()
.map(|dirstate_node| {
let fs_path = directory_fs_path.join(get_path_from_bytes(
@@ -131,7 +204,13 @@
}
}
})
- .collect();
+ .collect::<Result<_, _>>()?;
+
+ // Conservatively don’t let the caller assume that there aren’t
+ // any, since we don’t know.
+ let directory_has_any_fs_only_entry = true;
+
+ return Ok(directory_has_any_fs_only_entry);
}
let mut fs_entries = if let Ok(entries) = self.read_dir(
@@ -174,6 +253,7 @@
.par_bridge()
.map(|pair| {
use itertools::EitherOrBoth::*;
+ let is_fs_only = pair.is_right();
match pair {
Both(dirstate_node, fs_entry) => self
.traverse_fs_and_dirstate(
@@ -181,18 +261,19 @@
&fs_entry.metadata,
dirstate_node,
has_ignored_ancestor,
- ),
+ )?,
Left(dirstate_node) => {
- self.traverse_dirstate_only(dirstate_node)
+ self.traverse_dirstate_only(dirstate_node)?
}
- Right(fs_entry) => Ok(self.traverse_fs_only(
+ Right(fs_entry) => self.traverse_fs_only(
has_ignored_ancestor,
directory_hg_path,
fs_entry,
- )),
+ ),
}
+ Ok(is_fs_only)
})
- .collect()
+ .try_reduce(|| false, |a, b| Ok(a || b))
}
fn traverse_fs_and_dirstate(
@@ -224,12 +305,20 @@
}
let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
let is_at_repo_root = false;
- self.traverse_fs_directory_and_dirstate(
- is_ignored,
- dirstate_node.children(self.dmap.on_disk)?,
- hg_path,
- fs_path,
- is_at_repo_root,
+ let directory_has_any_fs_only_entry = self
+ .traverse_fs_directory_and_dirstate(
+ is_ignored,
+ dirstate_node.children(self.dmap.on_disk)?,
+ hg_path,
+ fs_path,
+ Some(fs_metadata),
+ dirstate_node.cached_directory_mtime(),
+ is_at_repo_root,
+ )?;
+ self.maybe_save_directory_mtime(
+ directory_has_any_fs_only_entry,
+ fs_metadata,
+ dirstate_node,
)?
} else {
if file_or_symlink && self.matcher.matches(hg_path) {
@@ -274,6 +363,75 @@
Ok(())
}
+ fn maybe_save_directory_mtime(
+ &self,
+ directory_has_any_fs_only_entry: bool,
+ directory_metadata: &std::fs::Metadata,
+ dirstate_node: NodeRef<'tree, 'on_disk>,
+ ) -> Result<(), DirstateV2ParseError> {
+ if !directory_has_any_fs_only_entry {
+ // All filesystem directory entries from `read_dir` have a
+ // corresponding node in the dirstate, so we can reconstitute the
+ // names of those entries without calling `read_dir` again.
+ if let (Some(status_start), Ok(directory_mtime)) = (
+ &self.filesystem_time_at_status_start,
+ directory_metadata.modified(),
+ ) {
+ // Although the Rust standard library’s `SystemTime` type
+ // has nanosecond precision, the times reported for a
+ // directory’s (or file’s) modified time may have lower
+ // resolution based on the filesystem (for example ext3
+ // only stores integer seconds), kernel (see
+ // https://stackoverflow.com/a/14393315/1162888), etc.
+ if &directory_mtime >= status_start {
+ // The directory was modified too recently, don’t cache its
+ // `read_dir` results.
+ //
+ // A timeline like this is possible:
+ //
+ // 1. A change to this directory (direct child was
+ // added or removed) cause its mtime to be set
+ // (possibly truncated) to `directory_mtime`
+ // 2. This `status` algorithm calls `read_dir`
+ // 3. An other change is made to the same directory is
+ // made so that calling `read_dir` agin would give
+ // different results, but soon enough after 1. that
+ // the mtime stays the same
+ //
+ // On a system where the time resolution poor, this
+ // scenario is not unlikely if all three steps are caused
+ // by the same script.
+ } else {
+ // We’ve observed (through `status_start`) that time has
+ // “progressed” since `directory_mtime`, so any further
+ // change to this directory is extremely likely to cause a
+ // different mtime.
+ //
+ // Having the same mtime again is not entirely impossible
+ // since the system clock is not monotonous. It could jump
+ // backward to some point before `directory_mtime`, then a
+ // directory change could potentially happen during exactly
+ // the wrong tick.
+ //
+ // We deem this scenario (unlike the previous one) to be
+ // unlikely enough in practice.
+ let timestamp = directory_mtime.into();
+ let cached = dirstate_node.cached_directory_mtime();
+ if cached != Some(×tamp) {
+ let hg_path = dirstate_node
+ .full_path_borrowed(self.dmap.on_disk)?
+ .detach_from_tree();
+ self.cached_directory_mtimes_to_add
+ .lock()
+ .unwrap()
+ .push((hg_path, timestamp))
+ }
+ }
+ }
+ }
+ Ok(())
+ }
+
/// A file with `EntryState::Normal` in the dirstate was found in the
/// filesystem
fn handle_normal_file(
@@ -505,3 +663,22 @@
Ok(results)
}
}
+
+/// Return the `mtime` of a temporary file newly-created in the `.hg` directory
+/// of the give repository.
+///
+/// This is similar to `SystemTime::now()`, with the result truncated to the
+/// same time resolution as other files’ modification times. Using `.hg`
+/// instead of the system’s default temporary directory (such as `/tmp`) makes
+/// it more likely the temporary file is in the same disk partition as contents
+/// of the working directory, which can matter since different filesystems may
+/// store timestamps with different resolutions.
+///
+/// This may fail, typically if we lack write permissions. In that case we
+/// should continue the `status()` algoritm anyway and consider the current
+/// date/time to be unknown.
+fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
+ tempfile::tempfile_in(repo_root.join(".hg"))?
+ .metadata()?
+ .modified()
+}
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -56,13 +56,31 @@
/// Dependending on the value of `state`:
///
- /// * A null byte: `data` represents nothing
+ /// * A null byte: `data` is not used.
+ ///
/// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
- /// represents a dirstate entry like in the v1 format.
+ /// represent a dirstate entry like in the v1 format.
+ ///
/// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
/// as the `Timestamp` for the mtime of a cached directory.
///
- /// TODO: document directory caching
+ /// The presence of this state means that at some point, this path in
+ /// the working directory was observed:
+ ///
+ /// - To be a directory
+ /// - With the modification time as given by `Timestamp`
+ /// - That timestamp was already strictly in the past when observed,
+ /// meaning that later changes cannot happen in the same clock tick
+ /// and must cause a different modification time (unless the system
+ /// clock jumps back and we get unlucky, which is not impossible but
+ /// but deemed unlikely enough).
+ /// - The directory did not contain any child entry that did not have a
+ /// corresponding dirstate node.
+ ///
+ /// This means that if `std::fs::symlink_metadata` later reports the
+ /// same modification time, we don’t need to call `std::fs::read_dir`
+ /// again for this directory and can iterate child dirstate nodes
+ /// instead.
state: u8,
data: Entry,
}
@@ -76,7 +94,7 @@
}
/// Duration since the Unix epoch
-#[derive(BytesCast, Copy, Clone)]
+#[derive(BytesCast, Copy, Clone, PartialEq)]
#[repr(C)]
pub(super) struct Timestamp {
seconds: I64Be,
@@ -258,6 +276,14 @@
}
}
+ pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
+ if self.state == b'd' {
+ Some(self.data.as_timestamp())
+ } else {
+ None
+ }
+ }
+
pub(super) fn state(
&self,
) -> Result<Option<EntryState>, DirstateV2ParseError> {
@@ -326,8 +352,8 @@
}
}
-impl From<&'_ SystemTime> for Timestamp {
- fn from(system_time: &'_ SystemTime) -> Self {
+impl From<SystemTime> for Timestamp {
+ fn from(system_time: SystemTime) -> Self {
let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
Ok(duration) => {
(duration.as_secs() as i64, duration.subsec_nanos())
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -317,6 +317,18 @@
}
}
+ pub(super) fn cached_directory_mtime(
+ &self,
+ ) -> Option<&on_disk::Timestamp> {
+ match self {
+ NodeRef::InMemory(_path, node) => match &node.data {
+ NodeData::CachedDirectory { mtime } => Some(mtime),
+ _ => None,
+ },
+ NodeRef::OnDisk(node) => node.cached_directory_mtime(),
+ }
+ }
+
pub(super) fn tracked_descendants_count(&self) -> u32 {
match self {
NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
@@ -479,7 +491,7 @@
}
}
- fn get_or_insert_node<'tree, 'path>(
+ pub(super) fn get_or_insert_node<'tree, 'path>(
on_disk: &'on_disk [u8],
root: &'tree mut ChildNodes<'on_disk>,
path: &'path HgPath,
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -23,6 +23,7 @@
regex = "1.3.9"
twox-hash = "1.5.0"
same-file = "1.0.6"
+tempfile = "3.1.0"
crossbeam-channel = "0.4"
micro-timer = "0.3.0"
log = "0.4.8"
@@ -41,4 +42,3 @@
[dev-dependencies]
clap = "*"
pretty_assertions = "0.6.1"
-tempfile = "3.1.0"
To: SimonSapin, #hg-reviewers
Cc: mercurial-patches
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurial-scm.org/pipermail/mercurial-patches/attachments/20210601/0ac0b855/attachment-0002.html>
More information about the Mercurial-patches
mailing list