[Updated] D11932: rhg: Use binary search in manifest lookup
SimonSapin
phabricator at mercurial-scm.org
Fri Dec 17 11:16:06 UTC 2021
Closed by commit rHGe293ff808a05: rhg: Use binary search in manifest lookup (authored by SimonSapin).
This revision was automatically updated to reflect the committed changes.
REPOSITORY
rHG Mercurial
CHANGES SINCE LAST UPDATE
https://phab.mercurial-scm.org/D11932?vs=31498&id=31526
CHANGES SINCE LAST ACTION
https://phab.mercurial-scm.org/D11932/new/
REVISION DETAIL
https://phab.mercurial-scm.org/D11932
AFFECTED FILES
rust/hg-core/src/revlog/manifest.rs
rust/rhg/src/commands/status.rs
CHANGE DETAILS
diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs
+++ b/rust/rhg/src/commands/status.rs
@@ -473,7 +473,7 @@
};
let entry = manifest
- .find_file(hg_path)?
+ .find_by_path(hg_path)?
.expect("ambgious file not in p1");
if entry.flags != fs_flags {
return Ok(true);
diff --git a/rust/hg-core/src/revlog/manifest.rs b/rust/hg-core/src/revlog/manifest.rs
--- a/rust/hg-core/src/revlog/manifest.rs
+++ b/rust/hg-core/src/revlog/manifest.rs
@@ -52,6 +52,15 @@
/// `Manifestlog` entry which knows how to interpret the `manifest` data bytes.
#[derive(Debug)]
pub struct Manifest {
+ /// Format for a manifest: flat sequence of variable-size entries,
+ /// sorted by path, each as:
+ ///
+ /// ```text
+ /// <path> \0 <hex_node_id> <flags> \n
+ /// ```
+ ///
+ /// The last entry is also terminated by a newline character.
+ /// Flags is one of `b""` (the empty string), `b"x"`, `b"l"`, or `b"t"`.
bytes: Vec<u8>,
}
@@ -62,44 +71,84 @@
self.bytes
.split(|b| b == &b'\n')
.filter(|line| !line.is_empty())
- .map(|line| {
- let (path, rest) = line.split_2(b'\0').ok_or_else(|| {
- HgError::corrupted("manifest line should contain \\0")
- })?;
- let path = HgPath::new(path);
- let (hex_node_id, flags) = match rest.split_last() {
- Some((&b'x', rest)) => (rest, Some(b'x')),
- Some((&b'l', rest)) => (rest, Some(b'l')),
- Some((&b't', rest)) => (rest, Some(b't')),
- _ => (rest, None),
- };
- Ok(ManifestEntry {
- path,
- hex_node_id,
- flags,
- })
- })
+ .map(ManifestEntry::from_raw)
}
/// If the given path is in this manifest, return its filelog node ID
- pub fn find_file(
+ pub fn find_by_path(
&self,
path: &HgPath,
) -> Result<Option<ManifestEntry>, HgError> {
- // TODO: use binary search instead of linear scan. This may involve
- // building (and caching) an index of the byte indicex of each manifest
- // line.
+ use std::cmp::Ordering::*;
+ let path = path.as_bytes();
+ // Both boundaries of this `&[u8]` slice are always at the boundary of
+ // an entry
+ let mut bytes = &*self.bytes;
- // TODO: use try_find when available (if still using linear scan)
- // https://github.com/rust-lang/rust/issues/63178
- for entry in self.iter() {
- let entry = entry?;
- if entry.path == path {
- return Ok(Some(entry));
+ // Binary search algorithm derived from `[T]::binary_search_by`
+ // <https://github.com/rust-lang/rust/blob/1.57.0/library/core/src/slice/mod.rs#L2221>
+ // except we don’t have a slice of entries. Instead we jump to the
+ // middle of the byte slice and look around for entry delimiters
+ // (newlines).
+ while let Some(entry_range) = Self::find_entry_near_middle_of(bytes)? {
+ let (entry_path, rest) =
+ ManifestEntry::split_path(&bytes[entry_range.clone()])?;
+ let cmp = entry_path.cmp(path);
+ if cmp == Less {
+ let after_newline = entry_range.end + 1;
+ bytes = &bytes[after_newline..];
+ } else if cmp == Greater {
+ bytes = &bytes[..entry_range.start];
+ } else {
+ return Ok(Some(ManifestEntry::from_path_and_rest(
+ entry_path, rest,
+ )));
}
}
Ok(None)
}
+
+ /// If there is at least one, return the byte range of an entry *excluding*
+ /// the final newline.
+ fn find_entry_near_middle_of(
+ bytes: &[u8],
+ ) -> Result<Option<std::ops::Range<usize>>, HgError> {
+ let len = bytes.len();
+ if len > 0 {
+ let middle = bytes.len() / 2;
+ // Integer division rounds down, so `middle < len`.
+ let (before, after) = bytes.split_at(middle);
+ let is_newline = |&byte: &u8| byte == b'\n';
+ let entry_start = match before.iter().rposition(is_newline) {
+ Some(i) => i + 1,
+ None => 0, // We choose the first entry in `bytes`
+ };
+ let entry_end = match after.iter().position(is_newline) {
+ Some(i) => {
+ // No `+ 1` here to exclude this newline from the range
+ middle + i
+ }
+ None => {
+ // In a well-formed manifest:
+ //
+ // * Since `len > 0`, `bytes` contains at least one entry
+ // * Every entry ends with a newline
+ // * Since `middle < len`, `after` contains at least the
+ // newline at the end of the last entry of `bytes`.
+ //
+ // We didn’t find a newline, so this manifest is not
+ // well-formed.
+ return Err(HgError::corrupted(
+ "manifest entry without \\n delimiter",
+ ));
+ }
+ };
+ Ok(Some(entry_start..entry_end))
+ } else {
+ // len == 0
+ Ok(None)
+ }
+ }
}
/// `Manifestlog` entry which knows how to interpret the `manifest` data bytes.
@@ -112,7 +161,32 @@
pub flags: Option<u8>,
}
-impl ManifestEntry<'_> {
+impl<'a> ManifestEntry<'a> {
+ fn split_path(bytes: &[u8]) -> Result<(&[u8], &[u8]), HgError> {
+ bytes.split_2(b'\0').ok_or_else(|| {
+ HgError::corrupted("manifest entry without \\0 delimiter")
+ })
+ }
+
+ fn from_path_and_rest(path: &'a [u8], rest: &'a [u8]) -> Self {
+ let (hex_node_id, flags) = match rest.split_last() {
+ Some((&b'x', rest)) => (rest, Some(b'x')),
+ Some((&b'l', rest)) => (rest, Some(b'l')),
+ Some((&b't', rest)) => (rest, Some(b't')),
+ _ => (rest, None),
+ };
+ Self {
+ path: HgPath::new(path),
+ hex_node_id,
+ flags,
+ }
+ }
+
+ fn from_raw(bytes: &'a [u8]) -> Result<Self, HgError> {
+ let (path, rest) = Self::split_path(bytes)?;
+ Ok(Self::from_path_and_rest(path, rest))
+ }
+
pub fn node_id(&self) -> Result<Node, HgError> {
Node::from_hex_for_repo(self.hex_node_id)
}
To: SimonSapin, #hg-reviewers, Alphare
Cc: mercurial-patches
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurial-scm.org/pipermail/mercurial-patches/attachments/20211217/27c442d7/attachment-0002.html>
More information about the Mercurial-patches
mailing list