D8323: rust-matchers: use the `regex` crate

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
2 messages Options
Reply | Threaded
Open this post in threaded view
|

D8323: rust-matchers: use the `regex` crate

martinvonz (Martin von Zweigbergk)
Alphare created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  Instead of falling back to Python when a code path with "ignore" functionality
  is reached and `Re2` is not installed, the default compilation (i.e. without
  the `with-re2` feature) will use the `regex` crate for all regular expressions
  business.
 
  As with the introduction of `Re2` in a previous series, this yields a big
  performance boost compared to the Python + C code in `status`, `diff`, `commit`,
  `update`, and maybe others.
 
  For now `Re2` looks to be faster at compiling the DFA (1.5ms vs 5ms for
  Netbeans' `.hgignore`) and a bit faster in actual use: (123ms vs 137ms for
  the parallel traversal of Netbeans' clean repo). I am in talks with the author
  of `regex` to see whether that performance difference is a bug, a "won't fix",
  or a tuning issue.
 
  The `regex` crate is already one of our dependencies and using this code does
  not require any additional work from the end-user than to use the Rust
  extensions.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D8323

AFFECTED FILES
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/matchers.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/matchers.rs b/rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/matchers.rs
+++ b/rust/hg-core/src/matchers.rs
@@ -331,8 +331,37 @@
 }
 
 #[cfg(not(feature = "with-re2"))]
-fn re_matcher(_: &[u8]) -> PatternResult<Box<dyn Fn(&HgPath) -> bool + Sync>> {
-    Err(PatternError::Re2NotInstalled)
+/// Returns a function that matches an `HgPath` against the given regex
+/// pattern.
+///
+/// This can fail when the pattern is invalid or not supported by the
+/// underlying engine (the `regex` crate), for instance anything with
+/// back-references.
+fn re_matcher(
+    pattern: &[u8],
+) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
+    use std::io::Write;
+
+    let mut escaped_bytes = vec![];
+    for byte in pattern {
+        if *byte > 127 {
+            write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
+        } else {
+            escaped_bytes.push(*byte);
+        }
+    }
+
+    // Avoid the cost of UTF8 checking
+    //
+    // # Safety
+    // This is safe because we escaped all non-ASCII bytes.
+    let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
+    let re = regex::bytes::RegexBuilder::new(&pattern_string)
+        .unicode(false)
+        .build()
+        .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
+
+    Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
 }
 
 /// Returns the regex pattern and a function that matches an `HgPath` against
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -176,9 +176,14 @@
         return vec![];
     }
     match syntax {
-        PatternSyntax::Regexp => pattern.to_owned(),
+        // The `regex` crate adds `.*` to the start and end of expressions
+        // if there are no anchors, so add them.
+        PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
         PatternSyntax::RelRegexp => {
-            if pattern[0] == b'^' {
+            // The `regex` crate accepts `**` while `re2` and Python's `re`
+            // do not. Checking for `*` correctly triggers the same error all
+            // engines.
+            if pattern[0] == b'^' || pattern[0] == b'*' {
                 return pattern.to_owned();
             }
             [&b".*"[..], pattern].concat()
@@ -191,14 +196,15 @@
         }
         PatternSyntax::RootFiles => {
             let mut res = if pattern == b"." {
-                vec![]
+                vec![b'^']
             } else {
                 // Pattern is a directory name.
-                [escape_pattern(pattern).as_slice(), b"/"].concat()
+                [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
             };
 
             // Anything after the pattern must be a non-directory.
             res.extend(b"[^/]+$");
+            res.push(b'$');
             res
         }
         PatternSyntax::RelGlob => {
@@ -206,11 +212,11 @@
             if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
                 [b".*", rest, GLOB_SUFFIX].concat()
             } else {
-                [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
+                [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
             }
         }
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
-            [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
+            [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
         PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
     }
@@ -282,7 +288,10 @@
     if *syntax == PatternSyntax::RootGlob
         && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        let mut escaped = escape_pattern(&pattern);
+        // The `regex` crate adds `.*` to the start and end of expressions
+        // if there are no anchors, so add the start anchor.
+        let mut escaped = vec![b'^'];
+        escaped.extend(escape_pattern(&pattern));
         escaped.extend(GLOB_SUFFIX);
         Ok(escaped)
     } else {
@@ -619,7 +628,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"(?:|.*/)rust/target(?:/|$)".to_vec(),
+            br"(?:.*/)?rust/target(?:/|$)".to_vec(),
         );
     }
 
@@ -632,7 +641,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"\.(?:/|$)".to_vec(),
+            br"^\.(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -641,7 +650,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"whatever(?:/|$)".to_vec(),
+            br"^whatever(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -650,7 +659,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"[^/]*\.o(?:/|$)".to_vec(),
+            br"^[^/]*\.o(?:/|$)".to_vec(),
         );
     }
 }



To: Alphare, #hg-reviewers
Cc: mercurial-devel
_______________________________________________
Mercurial-devel mailing list
[hidden email]
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Reply | Threaded
Open this post in threaded view
|

D8323: rust-matchers: use the `regex` crate

martinvonz (Martin von Zweigbergk)
Closed by commit rHG496868f1030c: rust-matchers: use the `regex` crate (authored by Alphare).
This revision was automatically updated to reflect the committed changes.
This revision was not accepted when it landed; it landed in state "Needs Review".

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D8323?vs=20867&id=20868

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D8323/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D8323

AFFECTED FILES
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/matchers.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/matchers.rs b/rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/matchers.rs
+++ b/rust/hg-core/src/matchers.rs
@@ -331,8 +331,37 @@
 }
 
 #[cfg(not(feature = "with-re2"))]
-fn re_matcher(_: &[u8]) -> PatternResult<Box<dyn Fn(&HgPath) -> bool + Sync>> {
-    Err(PatternError::Re2NotInstalled)
+/// Returns a function that matches an `HgPath` against the given regex
+/// pattern.
+///
+/// This can fail when the pattern is invalid or not supported by the
+/// underlying engine (the `regex` crate), for instance anything with
+/// back-references.
+fn re_matcher(
+    pattern: &[u8],
+) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
+    use std::io::Write;
+
+    let mut escaped_bytes = vec![];
+    for byte in pattern {
+        if *byte > 127 {
+            write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
+        } else {
+            escaped_bytes.push(*byte);
+        }
+    }
+
+    // Avoid the cost of UTF8 checking
+    //
+    // # Safety
+    // This is safe because we escaped all non-ASCII bytes.
+    let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
+    let re = regex::bytes::RegexBuilder::new(&pattern_string)
+        .unicode(false)
+        .build()
+        .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
+
+    Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
 }
 
 /// Returns the regex pattern and a function that matches an `HgPath` against
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -176,9 +176,14 @@
         return vec![];
     }
     match syntax {
-        PatternSyntax::Regexp => pattern.to_owned(),
+        // The `regex` crate adds `.*` to the start and end of expressions
+        // if there are no anchors, so add them.
+        PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
         PatternSyntax::RelRegexp => {
-            if pattern[0] == b'^' {
+            // The `regex` crate accepts `**` while `re2` and Python's `re`
+            // do not. Checking for `*` correctly triggers the same error all
+            // engines.
+            if pattern[0] == b'^' || pattern[0] == b'*' {
                 return pattern.to_owned();
             }
             [&b".*"[..], pattern].concat()
@@ -191,14 +196,15 @@
         }
         PatternSyntax::RootFiles => {
             let mut res = if pattern == b"." {
-                vec![]
+                vec![b'^']
             } else {
                 // Pattern is a directory name.
-                [escape_pattern(pattern).as_slice(), b"/"].concat()
+                [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
             };
 
             // Anything after the pattern must be a non-directory.
             res.extend(b"[^/]+$");
+            res.push(b'$');
             res
         }
         PatternSyntax::RelGlob => {
@@ -206,11 +212,11 @@
             if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
                 [b".*", rest, GLOB_SUFFIX].concat()
             } else {
-                [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
+                [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
             }
         }
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
-            [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
+            [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
         PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
     }
@@ -282,7 +288,10 @@
     if *syntax == PatternSyntax::RootGlob
         && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        let mut escaped = escape_pattern(&pattern);
+        // The `regex` crate adds `.*` to the start and end of expressions
+        // if there are no anchors, so add the start anchor.
+        let mut escaped = vec![b'^'];
+        escaped.extend(escape_pattern(&pattern));
         escaped.extend(GLOB_SUFFIX);
         Ok(escaped)
     } else {
@@ -619,7 +628,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"(?:|.*/)rust/target(?:/|$)".to_vec(),
+            br"(?:.*/)?rust/target(?:/|$)".to_vec(),
         );
     }
 
@@ -632,7 +641,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"\.(?:/|$)".to_vec(),
+            br"^\.(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -641,7 +650,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"whatever(?:/|$)".to_vec(),
+            br"^whatever(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -650,7 +659,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"[^/]*\.o(?:/|$)".to_vec(),
+            br"^[^/]*\.o(?:/|$)".to_vec(),
         );
     }
 }



To: Alphare, #hg-reviewers
Cc: mercurial-devel
_______________________________________________
Mercurial-devel mailing list
[hidden email]
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel