[Updated] D11965: rhg: desambiguate status without decompressing filelog if possible

SimonSapin phabricator at mercurial-scm.org
Fri Jan 7 09:54:36 UTC 2022


Closed by commit rHG340a484b65a7: rhg: desambiguate status without decompressing filelog if possible (authored by SimonSapin).
This revision was automatically updated to reflect the committed changes.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D11965?vs=31605&id=31616

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D11965/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D11965

AFFECTED FILES
  mercurial/revlogutils/flagutil.py
  rust/hg-core/src/revlog/filelog.rs
  rust/hg-core/src/revlog/index.rs
  rust/hg-core/src/revlog/revlog.rs
  rust/rhg/src/commands/status.rs

CHANGE DETAILS

diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs
+++ b/rust/rhg/src/commands/status.rs
@@ -516,16 +516,16 @@
         filelog.entry_for_node(entry.node_id()?).map_err(|_| {
             HgError::corrupted("filelog missing node from manifest")
         })?;
-    // TODO: check `fs_len` here like below, but based on
-    // `RevlogEntry::uncompressed_len` without decompressing the full filelog
-    // contents where possible. This is only valid if the revlog data does not
-    // contain metadata. See how Python’s `revlog.rawsize` calls
-    // `storageutil.filerevisioncopied`.
-    // (Maybe also check for content-modifying flags? See `revlog.size`.)
-    let filelog_data = filelog_entry.data()?;
-    let contents_in_p1 = filelog_data.file_data()?;
-    if contents_in_p1.len() as u64 != fs_len {
-        // No need to read the file contents:
+    if filelog_entry.file_data_len_not_equal_to(fs_len) {
+        // No need to read file contents:
+        // it cannot be equal if it has a different length.
+        return Ok(true);
+    }
+
+    let p1_filelog_data = filelog_entry.data()?;
+    let p1_contents = p1_filelog_data.file_data()?;
+    if p1_contents.len() as u64 != fs_len {
+        // No need to read file contents:
         // it cannot be equal if it has a different length.
         return Ok(true);
     }
@@ -535,5 +535,5 @@
     } else {
         vfs.read(fs_path)?
     };
-    Ok(contents_in_p1 != &*fs_contents)
+    Ok(p1_contents != &*fs_contents)
 }
diff --git a/rust/hg-core/src/revlog/revlog.rs b/rust/hg-core/src/revlog/revlog.rs
--- a/rust/hg-core/src/revlog/revlog.rs
+++ b/rust/hg-core/src/revlog/revlog.rs
@@ -20,6 +20,18 @@
 use crate::revlog::Revision;
 use crate::{Node, NULL_REVISION};
 
+const REVISION_FLAG_CENSORED: u16 = 1 << 15;
+const REVISION_FLAG_ELLIPSIS: u16 = 1 << 14;
+const REVISION_FLAG_EXTSTORED: u16 = 1 << 13;
+const REVISION_FLAG_HASCOPIESINFO: u16 = 1 << 12;
+
+// Keep this in sync with REVIDX_KNOWN_FLAGS in
+// mercurial/revlogutils/flagutil.py
+const REVIDX_KNOWN_FLAGS: u16 = REVISION_FLAG_CENSORED
+    | REVISION_FLAG_ELLIPSIS
+    | REVISION_FLAG_EXTSTORED
+    | REVISION_FLAG_HASCOPIESINFO;
+
 #[derive(derive_more::From)]
 pub enum RevlogError {
     InvalidRevision,
@@ -282,6 +294,7 @@
             },
             p1: index_entry.p1(),
             p2: index_entry.p2(),
+            flags: index_entry.flags(),
             hash: *index_entry.hash(),
         };
         Ok(entry)
@@ -309,6 +322,7 @@
     base_rev_or_base_of_delta_chain: Option<Revision>,
     p1: Revision,
     p2: Revision,
+    flags: u16,
     hash: Node,
 }
 
@@ -321,6 +335,20 @@
         u32::try_from(self.uncompressed_len).ok()
     }
 
+    pub fn has_p1(&self) -> bool {
+        self.p1 != NULL_REVISION
+    }
+
+    pub fn is_cencored(&self) -> bool {
+        (self.flags & REVISION_FLAG_CENSORED) != 0
+    }
+
+    pub fn has_length_affecting_flag_processor(&self) -> bool {
+        // Relevant Python code: revlog.size()
+        // note: ELLIPSIS is known to not change the content
+        (self.flags & (REVIDX_KNOWN_FLAGS ^ REVISION_FLAG_ELLIPSIS)) != 0
+    }
+
     /// The data for this entry, after resolving deltas if any.
     pub fn data(&self) -> Result<Cow<'a, [u8]>, HgError> {
         let mut entry = self.clone();
diff --git a/rust/hg-core/src/revlog/index.rs b/rust/hg-core/src/revlog/index.rs
--- a/rust/hg-core/src/revlog/index.rs
+++ b/rust/hg-core/src/revlog/index.rs
@@ -260,6 +260,10 @@
         }
     }
 
+    pub fn flags(&self) -> u16 {
+        BigEndian::read_u16(&self.bytes[6..=7])
+    }
+
     /// Return the compressed length of the data.
     pub fn compressed_len(&self) -> u32 {
         BigEndian::read_u32(&self.bytes[8..=11])
diff --git a/rust/hg-core/src/revlog/filelog.rs b/rust/hg-core/src/revlog/filelog.rs
--- a/rust/hg-core/src/revlog/filelog.rs
+++ b/rust/hg-core/src/revlog/filelog.rs
@@ -73,6 +73,89 @@
 pub struct FilelogEntry<'a>(RevlogEntry<'a>);
 
 impl FilelogEntry<'_> {
+    /// `self.data()` can be expensive, with decompression and delta
+    /// resolution.
+    ///
+    /// *Without* paying this cost, based on revlog index information
+    /// including `RevlogEntry::uncompressed_len`:
+    ///
+    /// * Returns `true` if the length that `self.data().file_data().len()`
+    ///   would return is definitely **not equal** to `other_len`.
+    /// * Returns `false` if available information is inconclusive.
+    pub fn file_data_len_not_equal_to(&self, other_len: u64) -> bool {
+        // Relevant code that implement this behavior in Python code:
+        // basefilectx.cmp, filelog.size, storageutil.filerevisioncopied,
+        // revlog.size, revlog.rawsize
+
+        // Let’s call `file_data_len` what would be returned by
+        // `self.data().file_data().len()`.
+
+        if self.0.is_cencored() {
+            let file_data_len = 0;
+            return other_len != file_data_len;
+        }
+
+        if self.0.has_length_affecting_flag_processor() {
+            // We can’t conclude anything about `file_data_len`.
+            return false;
+        }
+
+        // Revlog revisions (usually) have metadata for the size of
+        // their data after decompression and delta resolution
+        // as would be returned by `Revlog::get_rev_data`.
+        //
+        // For filelogs this is the file’s contents preceded by an optional
+        // metadata block.
+        let uncompressed_len = if let Some(l) = self.0.uncompressed_len() {
+            l as u64
+        } else {
+            // The field was set to -1, the actual uncompressed len is unknown.
+            // We need to decompress to say more.
+            return false;
+        };
+        // `uncompressed_len = file_data_len + optional_metadata_len`,
+        // so `file_data_len <= uncompressed_len`.
+        if uncompressed_len < other_len {
+            // Transitively, `file_data_len < other_len`.
+            // So `other_len != file_data_len` definitely.
+            return true;
+        }
+
+        if uncompressed_len == other_len + 4 {
+            // It’s possible that `file_data_len == other_len` with an empty
+            // metadata block (2 start marker bytes + 2 end marker bytes).
+            // This happens when there wouldn’t otherwise be metadata, but
+            // the first 2 bytes of file data happen to match a start marker
+            // and would be ambiguous.
+            return false;
+        }
+
+        if !self.0.has_p1() {
+            // There may or may not be copy metadata, so we can’t deduce more
+            // about `file_data_len` without computing file data.
+            return false;
+        }
+
+        // Filelog ancestry is not meaningful in the way changelog ancestry is.
+        // It only provides hints to delta generation.
+        // p1 and p2 are set to null when making a copy or rename since
+        // contents are likely unrelatedto what might have previously existed
+        // at the destination path.
+        //
+        // Conversely, since here p1 is non-null, there is no copy metadata.
+        // Note that this reasoning may be invalidated in the presence of
+        // merges made by some previous versions of Mercurial that
+        // swapped p1 and p2. See <https://bz.mercurial-scm.org/show_bug.cgi?id=6528>
+        // and `tests/test-issue6528.t`.
+        //
+        // Since copy metadata is currently the only kind of metadata
+        // kept in revlog data of filelogs,
+        // this `FilelogEntry` does not have such metadata:
+        let file_data_len = uncompressed_len;
+
+        return file_data_len != other_len;
+    }
+
     pub fn data(&self) -> Result<FilelogRevisionData, HgError> {
         Ok(FilelogRevisionData(self.0.data()?.into_owned()))
     }
diff --git a/mercurial/revlogutils/flagutil.py b/mercurial/revlogutils/flagutil.py
--- a/mercurial/revlogutils/flagutil.py
+++ b/mercurial/revlogutils/flagutil.py
@@ -32,6 +32,7 @@
 REVIDX_FLAGS_ORDER
 REVIDX_RAWTEXT_CHANGING_FLAGS
 
+# Keep this in sync with REVIDX_KNOWN_FLAGS in rust/hg-core/src/revlog/revlog.rs
 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
 
 # Store flag processors (cf. 'addflagprocessor()' to register)



To: SimonSapin, #hg-reviewers, Alphare
Cc: mercurial-patches
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurial-scm.org/pipermail/mercurial-patches/attachments/20220107/60051730/attachment-0002.html>


More information about the Mercurial-patches mailing list