[Updated] [++++ ] D11389: rhg: Port Python’s `ui.configlist` as `Config::get_list`

SimonSapin phabricator at mercurial-scm.org
Sat Sep 4 07:25:57 UTC 2021


SimonSapin updated this revision to Diff 30183.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D11389?vs=30181&id=30183

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D11389/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D11389

AFFECTED FILES
  rust/hg-core/src/config/config.rs
  rust/hg-core/src/config/values.rs
  tests/test-config-parselist.py

CHANGE DETAILS

diff --git a/tests/test-config-parselist.py b/tests/test-config-parselist.py
new file mode 100644
--- /dev/null
+++ b/tests/test-config-parselist.py
@@ -0,0 +1,52 @@
+"""
+List-valued configuration keys have an ad-hoc microsyntax. From `hg help config`:
+
+> List values are separated by whitespace or comma, except when values are
+> placed in double quotation marks:
+>
+>     allow_read = "John Doe, PhD", brian, betty
+>
+> Quotation marks can be escaped by prefixing them with a backslash. Only
+> quotation marks at the beginning of a word is counted as a quotation
+> (e.g., ``foo"bar baz`` is the list of ``foo"bar`` and ``baz``).
+
+That help documentation is fairly light on details, the actual parser has many
+other edge cases. This test tries to cover them.
+"""
+
+from mercurial.utils import stringutil
+
+
+def assert_parselist(input, expected):
+    result = stringutil.parselist(input)
+    if result != expected:
+        raise AssertionError(
+            "parse_input(%r)\n     got %r\nexpected %r"
+            % (input, result, expected)
+        )
+
+
+# Keep these Python tests in sync with the Rust ones in `rust/hg-core/src/config/values.rs`
+
+assert_parselist(b'', [])
+assert_parselist(b',', [])
+assert_parselist(b'A', [b'A'])
+assert_parselist(b'B,B', [b'B', b'B'])
+assert_parselist(b', C, ,C,', [b'C', b'C'])
+assert_parselist(b'"', [b'"'])
+assert_parselist(b'""', [b'', b''])
+assert_parselist(b'D,"', [b'D', b'"'])
+assert_parselist(b'E,""', [b'E', b'', b''])
+assert_parselist(b'"F,F"', [b'F,F'])
+assert_parselist(b'"G,G', [b'"G', b'G'])
+assert_parselist(b'"H \\",\\"H', [b'"H', b',', b'H'])
+assert_parselist(b'I,I"', [b'I', b'I"'])
+assert_parselist(b'J,"J', [b'J', b'"J'])
+assert_parselist(b'K K', [b'K', b'K'])
+assert_parselist(b'"K" K', [b'K', b'K'])
+assert_parselist(b'L\tL', [b'L', b'L'])
+assert_parselist(b'"L"\tL', [b'L', b'', b'L'])
+assert_parselist(b'M\x0bM', [b'M', b'M'])
+assert_parselist(b'"M"\x0bM', [b'M', b'', b'M'])
+assert_parselist(b'"N"  , ,"', [b'N"'])
+assert_parselist(b'" ,O,  ', [b'"', b'O'])
diff --git a/rust/hg-core/src/config/values.rs b/rust/hg-core/src/config/values.rs
--- a/rust/hg-core/src/config/values.rs
+++ b/rust/hg-core/src/config/values.rs
@@ -8,6 +8,8 @@
 //! details about where the value came from (but omits details of what’s
 //! invalid inside the value).
 
+use crate::utils::SliceExt;
+
 pub(super) fn parse_bool(v: &[u8]) -> Option<bool> {
     match v.to_ascii_lowercase().as_slice() {
         b"1" | b"yes" | b"true" | b"on" | b"always" => Some(true),
@@ -42,6 +44,216 @@
     value.parse().ok()
 }
 
+/// Parse a config value as a list of sub-values.
+///
+/// Ported from `parselist` in `mercurial/utils/stringutil.py`
+
+// Note: keep behavior in sync with the Python one.
+
+// Note: this could return `Vec<Cow<[u8]>>` instead and borrow `input` when
+// possible (when there’s no backslash-escapes) but this is probably not worth
+// the complexity as config is presumably not accessed inside
+// preformance-sensitive loops.
+pub(super) fn parse_list(input: &[u8]) -> Vec<Vec<u8>> {
+    // Port of Python’s `value.lstrip(b' ,\n')`
+    // TODO: is this really what we want?
+    let input =
+        input.trim_start_matches(|b| b == b' ' || b == b',' || b == b'\n');
+    parse_list_without_trim_start(input)
+}
+
+fn parse_list_without_trim_start(input: &[u8]) -> Vec<Vec<u8>> {
+    // Start of port of Python’s `_configlist`
+    let input = input.trim_end_matches(|b| b == b' ' || b == b',');
+    if input.is_empty() {
+        return Vec::new();
+    }
+
+    // Just to make “a string” less confusable with “a list of strings”.
+    type ByteString = Vec<u8>;
+
+    // These correspond to Python’s…
+    let mut mode = ParserMode::Plain; // `parser`
+    let mut values = Vec::new(); // `parts[:-1]`
+    let mut next_value = ByteString::new(); // `parts[-1]`
+    let mut offset = 0; // `offset`
+
+    // Setting `parser` to `None` is instead handled by returning immediately
+    enum ParserMode {
+        Plain,
+        Quoted,
+    }
+
+    loop {
+        match mode {
+            ParserMode::Plain => {
+                // Start of port of Python’s `_parse_plain`
+                let mut whitespace = false;
+                while let Some(&byte) = input.get(offset) {
+                    if is_space(byte) || byte == b',' {
+                        whitespace = true;
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+                if let Some(&byte) = input.get(offset) {
+                    if whitespace {
+                        values.push(std::mem::take(&mut next_value))
+                    }
+                    if byte == b'"' && next_value.is_empty() {
+                        mode = ParserMode::Quoted;
+                    } else {
+                        if byte == b'"' && next_value.ends_with(b"\\") {
+                            next_value.pop();
+                        }
+                        next_value.push(byte);
+                    }
+                    offset += 1;
+                } else {
+                    values.push(next_value);
+                    return values;
+                }
+            }
+            ParserMode::Quoted => {
+                // Start of port of Python’s `_parse_quote`
+                if let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        // The input contains a quoted zero-length value `""`
+                        debug_assert_eq!(next_value, b"");
+                        values.push(std::mem::take(&mut next_value));
+                        offset += 1;
+                        while let Some(&byte) = input.get(offset) {
+                            if is_space(byte) || byte == b',' {
+                                offset += 1;
+                            } else {
+                                break;
+                            }
+                        }
+                        mode = ParserMode::Plain;
+                        continue;
+                    }
+                }
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        break;
+                    }
+                    if byte == b'\\' && input.get(offset + 1) == Some(&b'"') {
+                        next_value.push(b'"');
+                        offset += 2;
+                    } else {
+                        next_value.push(byte);
+                        offset += 1;
+                    }
+                }
+
+                if offset >= input.len() {
+                    // We didn’t find a closing double-quote,
+                    // so treat the opening one as part of an unquoted value
+                    // instead of delimiting the start of a quoted value.
+
+                    // `next_value` may have had some backslash-escapes
+                    // unescaped. TODO: shouldn’t we use a slice of `input`
+                    // instead?
+                    let mut real_values =
+                        parse_list_without_trim_start(&next_value);
+
+                    if let Some(first) = real_values.first_mut() {
+                        first.insert(0, b'"');
+                        // Drop `next_value`
+                        values.extend(real_values)
+                    } else {
+                        next_value.push(b'"');
+                        values.push(next_value);
+                    }
+                    return values;
+                }
+
+                // We’re not at the end of the input, which means the `while`
+                // loop above ended at at double quote. Skip
+                // over that.
+                offset += 1;
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b' ' || byte == b',' {
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                if offset >= input.len() {
+                    values.push(next_value);
+                    return values;
+                }
+
+                if offset + 1 == input.len() && input[offset] == b'"' {
+                    next_value.push(b'"');
+                    offset += 1;
+                } else {
+                    values.push(std::mem::take(&mut next_value));
+                }
+
+                mode = ParserMode::Plain;
+            }
+        }
+    }
+
+    // https://docs.python.org/3/library/stdtypes.html?#bytes.isspace
+    fn is_space(byte: u8) -> bool {
+        if let b' ' | b'\t' | b'\n' | b'\r' | b'\x0b' | b'\x0c' = byte {
+            true
+        } else {
+            false
+        }
+    }
+}
+
+#[test]
+fn test_parse_list() {
+    // Make `assert_eq` error messages nicer
+    fn as_strings(values: &[Vec<u8>]) -> Vec<String> {
+        values
+            .iter()
+            .map(|v| std::str::from_utf8(v.as_ref()).unwrap().to_owned())
+            .collect()
+    }
+    macro_rules! assert_parse_list {
+        ( $input: expr => [ $( $output: expr ),* ] ) => {
+            assert_eq!(
+                as_strings(&parse_list($input)),
+                as_strings(&[ $( Vec::from(&$output[..]) ),* ]),
+            );
+        }
+    }
+
+    // Keep these Rust tests in sync with the Python ones in
+    // `tests/test-config-parselist.py`
+    assert_parse_list!(b"" => []);
+    assert_parse_list!(b"," => []);
+    assert_parse_list!(b"A" => [b"A"]);
+    assert_parse_list!(b"B,B" => [b"B", b"B"]);
+    assert_parse_list!(b", C, ,C," => [b"C", b"C"]);
+    assert_parse_list!(b"\"" => [b"\""]);
+    assert_parse_list!(b"\"\"" => [b"", b""]);
+    assert_parse_list!(b"D,\"" => [b"D", b"\""]);
+    assert_parse_list!(b"E,\"\"" => [b"E", b"", b""]);
+    assert_parse_list!(b"\"F,F\"" => [b"F,F"]);
+    assert_parse_list!(b"\"G,G" => [b"\"G", b"G"]);
+    assert_parse_list!(b"\"H \\\",\\\"H" => [b"\"H", b",", b"H"]);
+    assert_parse_list!(b"I,I\"" => [b"I", b"I\""]);
+    assert_parse_list!(b"J,\"J" => [b"J", b"\"J"]);
+    assert_parse_list!(b"K K" => [b"K", b"K"]);
+    assert_parse_list!(b"\"K\" K" => [b"K", b"K"]);
+    assert_parse_list!(b"L\tL" => [b"L", b"L"]);
+    assert_parse_list!(b"\"L\"\tL" => [b"L", b"", b"L"]);
+    assert_parse_list!(b"M\x0bM" => [b"M", b"M"]);
+    assert_parse_list!(b"\"M\"\x0bM" => [b"M", b"", b"M"]);
+    assert_parse_list!(b"\"N\"  , ,\"" => [b"N\""]);
+    assert_parse_list!(b"\" ,O,  " => [b"\"", b"O"]);
+}
+
 #[test]
 fn test_parse_byte_size() {
     assert_eq!(parse_byte_size(b""), None);
diff --git a/rust/hg-core/src/config/config.rs b/rust/hg-core/src/config/config.rs
--- a/rust/hg-core/src/config/config.rs
+++ b/rust/hg-core/src/config/config.rs
@@ -388,6 +388,16 @@
         })
     }
 
+    /// If there is an `item` value in `section`, parse and return a list of
+    /// byte strings.
+    pub fn get_list(
+        &self,
+        section: &[u8],
+        item: &[u8],
+    ) -> Option<Vec<Vec<u8>>> {
+        self.get(section, item).map(values::parse_list)
+    }
+
     /// Returns the raw value bytes of the first one found, or `None`.
     pub fn get(&self, section: &[u8], item: &[u8]) -> Option<&[u8]> {
         self.get_inner(section, item)



To: SimonSapin, #hg-reviewers
Cc: mercurial-patches
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurial-scm.org/pipermail/mercurial-patches/attachments/20210904/0536af5a/attachment-0002.html>


More information about the Mercurial-patches mailing list