D7908: rust-filepatterns: improve API and robustness for pattern files parsing
Alphare (Raphaël Gomès)
phabricator at mercurial-scm.org
Mon Feb 10 23:33:02 UTC 2020
Closed by commit rHG725d79b48e07: rust-filepatterns: improve API and robustness for pattern files parsing (authored by Alphare).
This revision was automatically updated to reflect the committed changes.
This revision was not accepted when it landed; it landed in state "Needs Review".
REPOSITORY
rHG Mercurial
CHANGES SINCE LAST UPDATE
https://phab.mercurial-scm.org/D7908?vs=20038&id=20078
CHANGES SINCE LAST ACTION
https://phab.mercurial-scm.org/D7908/new/
REVISION DETAIL
https://phab.mercurial-scm.org/D7908
AFFECTED FILES
rust/hg-core/src/filepatterns.rs
rust/hg-core/src/lib.rs
CHANGE DETAILS
diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -25,7 +25,8 @@
use crate::utils::hg_path::{HgPathBuf, HgPathError};
pub use filepatterns::{
- build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
+ parse_pattern_syntax, read_pattern_file, IgnorePattern,
+ PatternFileWarning, PatternSyntax,
};
use std::collections::HashMap;
use twox_hash::RandomXxHashBuilder64;
@@ -115,18 +116,31 @@
#[derive(Debug)]
pub enum PatternError {
+ Path(HgPathError),
UnsupportedSyntax(String),
+ UnsupportedSyntaxInFile(String, String, usize),
+ TooLong(usize),
+ IO(std::io::Error),
}
-#[derive(Debug)]
-pub enum PatternFileError {
- IO(std::io::Error),
- Pattern(PatternError, LineNumber),
-}
-
-impl From<std::io::Error> for PatternFileError {
- fn from(e: std::io::Error) -> Self {
- PatternFileError::IO(e)
+impl ToString for PatternError {
+ fn to_string(&self) -> String {
+ match self {
+ PatternError::UnsupportedSyntax(syntax) => {
+ format!("Unsupported syntax {}", syntax)
+ }
+ PatternError::UnsupportedSyntaxInFile(syntax, file_path, line) => {
+ format!(
+ "{}:{}: unsupported syntax {}",
+ file_path, line, syntax
+ )
+ }
+ PatternError::TooLong(size) => {
+ format!("matcher pattern is too long ({} bytes)", size)
+ }
+ PatternError::IO(e) => e.to_string(),
+ PatternError::Path(e) => e.to_string(),
+ }
}
}
@@ -141,3 +155,15 @@
DirstateError::IO(e)
}
}
+
+impl From<std::io::Error> for PatternError {
+ fn from(e: std::io::Error) -> Self {
+ PatternError::IO(e)
+ }
+}
+
+impl From<HgPathError> for PatternError {
+ fn from(e: HgPathError) -> Self {
+ PatternError::Path(e)
+ }
+}
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -7,9 +7,7 @@
//! Handling of Mercurial-specific patterns.
-use crate::{
- utils::SliceExt, FastHashMap, LineNumber, PatternError, PatternFileError,
-};
+use crate::{utils::SliceExt, FastHashMap, PatternError};
use lazy_static::lazy_static;
use regex::bytes::{NoExpand, Regex};
use std::fs::File;
@@ -32,18 +30,28 @@
const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
&[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
+/// Appended to the regexp of globs
+const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
+
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum PatternSyntax {
+ /// A regular expression
Regexp,
/// Glob that matches at the front of the path
RootGlob,
/// Glob that matches at any suffix of the path (still anchored at
/// slashes)
Glob,
+ /// a path relative to repository root, which is matched recursively
Path,
+ /// A path relative to cwd
RelPath,
+ /// an unrooted glob (*.rs matches Rust files in all dirs)
RelGlob,
+ /// A regexp that needn't match the start of a name
RelRegexp,
+ /// A path relative to repository root, which is matched non-recursively
+ /// (will not match subdirectories)
RootFiles,
}
@@ -125,16 +133,18 @@
.collect()
}
-fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+pub fn parse_pattern_syntax(
+ kind: &[u8],
+) -> Result<PatternSyntax, PatternError> {
match kind {
- b"re" => Ok(PatternSyntax::Regexp),
- b"path" => Ok(PatternSyntax::Path),
- b"relpath" => Ok(PatternSyntax::RelPath),
- b"rootfilesin" => Ok(PatternSyntax::RootFiles),
- b"relglob" => Ok(PatternSyntax::RelGlob),
- b"relre" => Ok(PatternSyntax::RelRegexp),
- b"glob" => Ok(PatternSyntax::Glob),
- b"rootglob" => Ok(PatternSyntax::RootGlob),
+ b"re:" => Ok(PatternSyntax::Regexp),
+ b"path:" => Ok(PatternSyntax::Path),
+ b"relpath:" => Ok(PatternSyntax::RelPath),
+ b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
+ b"relglob:" => Ok(PatternSyntax::RelGlob),
+ b"relre:" => Ok(PatternSyntax::RelRegexp),
+ b"glob:" => Ok(PatternSyntax::Glob),
+ b"rootglob:" => Ok(PatternSyntax::RootGlob),
_ => Err(PatternError::UnsupportedSyntax(
String::from_utf8_lossy(kind).to_string(),
)),
@@ -144,11 +154,10 @@
/// Builds the regex that corresponds to the given pattern.
/// If within a `syntax: regexp` context, returns the pattern,
/// otherwise, returns the corresponding regex.
-fn _build_single_regex(
- syntax: PatternSyntax,
- pattern: &[u8],
- globsuffix: &[u8],
-) -> Vec<u8> {
+fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
+ let IgnorePattern {
+ syntax, pattern, ..
+ } = entry;
if pattern.is_empty() {
return vec![];
}
@@ -158,7 +167,7 @@
if pattern[0] == b'^' {
return pattern.to_owned();
}
- [b".*", pattern].concat()
+ [&b".*"[..], pattern].concat()
}
PatternSyntax::Path | PatternSyntax::RelPath => {
if pattern == b"." {
@@ -181,13 +190,13 @@
PatternSyntax::RelGlob => {
let glob_re = glob_to_re(pattern);
if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
- [b".*", rest, globsuffix].concat()
+ [b".*", rest, GLOB_SUFFIX].concat()
} else {
- [b"(?:|.*/)", glob_re.as_slice(), globsuffix].concat()
+ [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
}
}
PatternSyntax::Glob | PatternSyntax::RootGlob => {
- [glob_to_re(pattern).as_slice(), globsuffix].concat()
+ [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
}
}
}
@@ -195,22 +204,73 @@
const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
[b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
+/// TODO support other platforms
+#[cfg(unix)]
+pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
+ if bytes.is_empty() {
+ return b".".to_vec();
+ }
+ let sep = b'/';
+
+ let mut initial_slashes = bytes.iter().take_while(|b| **b == sep).count();
+ if initial_slashes > 2 {
+ // POSIX allows one or two initial slashes, but treats three or more
+ // as single slash.
+ initial_slashes = 1;
+ }
+ let components = bytes
+ .split(|b| *b == sep)
+ .filter(|c| !(c.is_empty() || c == b"."))
+ .fold(vec![], |mut acc, component| {
+ if component != b".."
+ || (initial_slashes == 0 && acc.is_empty())
+ || (!acc.is_empty() && acc[acc.len() - 1] == b"..")
+ {
+ acc.push(component)
+ } else if !acc.is_empty() {
+ acc.pop();
+ }
+ acc
+ });
+ let mut new_bytes = components.join(&sep);
+
+ if initial_slashes > 0 {
+ let mut buf: Vec<_> = (0..initial_slashes).map(|_| sep).collect();
+ buf.extend(new_bytes);
+ new_bytes = buf;
+ }
+ if new_bytes.is_empty() {
+ b".".to_vec()
+ } else {
+ new_bytes
+ }
+}
+
/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
/// that don't need to be transformed into a regex.
pub fn build_single_regex(
- kind: &[u8],
- pat: &[u8],
- globsuffix: &[u8],
+ entry: &IgnorePattern,
) -> Result<Vec<u8>, PatternError> {
- let enum_kind = parse_pattern_syntax(kind)?;
- if enum_kind == PatternSyntax::RootGlob
- && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+ let IgnorePattern {
+ pattern, syntax, ..
+ } = entry;
+ let pattern = match syntax {
+ PatternSyntax::RootGlob
+ | PatternSyntax::Path
+ | PatternSyntax::RelGlob
+ | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
+ _ => pattern.to_owned(),
+ };
+ if *syntax == PatternSyntax::RootGlob
+ && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
{
- let mut escaped = escape_pattern(pat);
- escaped.extend(b"(?:/|$)");
+ let mut escaped = escape_pattern(&pattern);
+ escaped.extend(GLOB_SUFFIX);
Ok(escaped)
} else {
- Ok(_build_single_regex(enum_kind, pat, globsuffix))
+ let mut entry = entry.clone();
+ entry.pattern = pattern;
+ Ok(_build_single_regex(&entry))
}
}
@@ -222,24 +282,29 @@
m.insert(b"regexp".as_ref(), b"relre:".as_ref());
m.insert(b"glob".as_ref(), b"relglob:".as_ref());
m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
- m.insert(b"include".as_ref(), b"include".as_ref());
- m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
+ m.insert(b"include".as_ref(), b"include:".as_ref());
+ m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
m
};
}
-pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
-type WarningTuple = (PathBuf, Vec<u8>);
+#[derive(Debug)]
+pub enum PatternFileWarning {
+ /// (file path, syntax bytes)
+ InvalidSyntax(PathBuf, Vec<u8>),
+ /// File path
+ NoSuchFile(PathBuf),
+}
pub fn parse_pattern_file_contents<P: AsRef<Path>>(
lines: &[u8],
file_path: P,
warn: bool,
-) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
let comment_escape_regex = Regex::new(r"\\#").unwrap();
- let mut inputs: Vec<PatternTuple> = vec![];
- let mut warnings: Vec<WarningTuple> = vec![];
+ let mut inputs: Vec<IgnorePattern> = vec![];
+ let mut warnings: Vec<PatternFileWarning> = vec![];
let mut current_syntax = b"relre:".as_ref();
@@ -267,8 +332,10 @@
if let Some(rel_syntax) = SYNTAXES.get(syntax) {
current_syntax = rel_syntax;
} else if warn {
- warnings
- .push((file_path.as_ref().to_owned(), syntax.to_owned()));
+ warnings.push(PatternFileWarning::InvalidSyntax(
+ file_path.as_ref().to_owned(),
+ syntax.to_owned(),
+ ));
}
continue;
}
@@ -288,34 +355,82 @@
}
}
- inputs.push((
- [line_syntax, line].concat(),
- line_number,
- line.to_owned(),
+ inputs.push(IgnorePattern::new(
+ parse_pattern_syntax(&line_syntax).map_err(|e| match e {
+ PatternError::UnsupportedSyntax(syntax) => {
+ PatternError::UnsupportedSyntaxInFile(
+ syntax,
+ file_path.as_ref().to_string_lossy().into(),
+ line_number,
+ )
+ }
+ _ => e,
+ })?,
+ &line,
+ &file_path,
));
}
- (inputs, warnings)
+ Ok((inputs, warnings))
}
pub fn read_pattern_file<P: AsRef<Path>>(
file_path: P,
warn: bool,
-) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
- let mut f = File::open(file_path.as_ref())?;
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
+ let mut f = match File::open(file_path.as_ref()) {
+ Ok(f) => Ok(f),
+ Err(e) => match e.kind() {
+ std::io::ErrorKind::NotFound => {
+ return Ok((
+ vec![],
+ vec![PatternFileWarning::NoSuchFile(
+ file_path.as_ref().to_owned(),
+ )],
+ ))
+ }
+ _ => Err(e),
+ },
+ }?;
let mut contents = Vec::new();
f.read_to_end(&mut contents)?;
- Ok(parse_pattern_file_contents(&contents, file_path, warn))
+ Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
+}
+
+/// Represents an entry in an "ignore" file.
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct IgnorePattern {
+ pub syntax: PatternSyntax,
+ pub pattern: Vec<u8>,
+ pub source: PathBuf,
}
+impl IgnorePattern {
+ pub fn new(
+ syntax: PatternSyntax,
+ pattern: &[u8],
+ source: impl AsRef<Path>,
+ ) -> Self {
+ Self {
+ syntax,
+ pattern: pattern.to_owned(),
+ source: source.as_ref().to_owned(),
+ }
+ }
+}
+
+pub type PatternResult<T> = Result<T, PatternError>;
+
#[cfg(test)]
mod tests {
use super::*;
+ use pretty_assertions::assert_eq;
#[test]
fn escape_pattern_test() {
- let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+ let untouched =
+ br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
assert_eq!(escape_pattern(untouched), untouched.to_vec());
// All escape codes
assert_eq!(
@@ -342,39 +457,78 @@
let lines = b"syntax: glob\n*.elc";
assert_eq!(
- vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
parse_pattern_file_contents(lines, Path::new("file_path"), false)
+ .unwrap()
.0,
+ vec![IgnorePattern::new(
+ PatternSyntax::RelGlob,
+ b"*.elc",
+ Path::new("file_path")
+ )],
);
let lines = b"syntax: include\nsyntax: glob";
assert_eq!(
parse_pattern_file_contents(lines, Path::new("file_path"), false)
+ .unwrap()
.0,
vec![]
);
let lines = b"glob:**.o";
assert_eq!(
parse_pattern_file_contents(lines, Path::new("file_path"), false)
+ .unwrap()
.0,
- vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
+ vec![IgnorePattern::new(
+ PatternSyntax::RelGlob,
+ b"**.o",
+ Path::new("file_path")
+ )]
+ );
+ }
+
+ #[test]
+ fn test_build_single_regex() {
+ assert_eq!(
+ build_single_regex(&IgnorePattern::new(
+ PatternSyntax::RelGlob,
+ b"rust/target/",
+ Path::new("")
+ ))
+ .unwrap(),
+ br"(?:|.*/)rust/target(?:/|$)".to_vec(),
);
}
#[test]
fn test_build_single_regex_shortcut() {
assert_eq!(
- br"(?:/|$)".to_vec(),
- build_single_regex(b"rootglob", b"", b"").unwrap()
+ build_single_regex(&IgnorePattern::new(
+ PatternSyntax::RootGlob,
+ b"",
+ Path::new("")
+ ))
+ .unwrap(),
+ br"\.(?:/|$)".to_vec(),
);
assert_eq!(
+ build_single_regex(&IgnorePattern::new(
+ PatternSyntax::RootGlob,
+ b"whatever",
+ Path::new("")
+ ))
+ .unwrap(),
br"whatever(?:/|$)".to_vec(),
- build_single_regex(b"rootglob", b"whatever", b"").unwrap()
);
assert_eq!(
- br"[^/]*\.o".to_vec(),
- build_single_regex(b"rootglob", b"*.o", b"").unwrap()
+ build_single_regex(&IgnorePattern::new(
+ PatternSyntax::RootGlob,
+ b"*.o",
+ Path::new("")
+ ))
+ .unwrap(),
+ br"[^/]*\.o(?:/|$)".to_vec(),
);
}
}
To: Alphare, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel
More information about the Mercurial-devel
mailing list