422 lines
18 KiB
Rust
422 lines
18 KiB
Rust
/// The set of user configurable options for compiling zero or more regexes.
|
|
#[derive(Clone, Debug)]
|
|
#[allow(missing_docs)]
|
|
pub struct RegexOptions {
|
|
pub pats: Vec<String>,
|
|
pub size_limit: usize,
|
|
pub dfa_size_limit: usize,
|
|
pub nest_limit: u32,
|
|
pub case_insensitive: bool,
|
|
pub multi_line: bool,
|
|
pub dot_matches_new_line: bool,
|
|
pub swap_greed: bool,
|
|
pub ignore_whitespace: bool,
|
|
pub unicode: bool,
|
|
pub octal: bool,
|
|
}
|
|
|
|
impl Default for RegexOptions {
|
|
fn default() -> Self {
|
|
RegexOptions {
|
|
pats: vec![],
|
|
size_limit: 10 * (1 << 20),
|
|
dfa_size_limit: 2 * (1 << 20),
|
|
nest_limit: 250,
|
|
case_insensitive: false,
|
|
multi_line: false,
|
|
dot_matches_new_line: false,
|
|
swap_greed: false,
|
|
ignore_whitespace: false,
|
|
unicode: true,
|
|
octal: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
macro_rules! define_builder {
|
|
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
|
pub mod $name {
|
|
use super::RegexOptions;
|
|
use crate::error::Error;
|
|
use crate::exec::ExecBuilder;
|
|
|
|
use crate::$regex_mod::Regex;
|
|
|
|
/// A configurable builder for a regular expression.
|
|
///
|
|
/// A builder can be used to configure how the regex is built, for example, by
|
|
/// setting the default flags (which can be overridden in the expression
|
|
/// itself) or setting various limits.
|
|
#[derive(Debug)]
|
|
pub struct RegexBuilder(RegexOptions);
|
|
|
|
impl RegexBuilder {
|
|
/// Create a new regular expression builder with the given pattern.
|
|
///
|
|
/// If the pattern is invalid, then an error will be returned when
|
|
/// `build` is called.
|
|
pub fn new(pattern: &str) -> RegexBuilder {
|
|
let mut builder = RegexBuilder(RegexOptions::default());
|
|
builder.0.pats.push(pattern.to_owned());
|
|
builder
|
|
}
|
|
|
|
/// Consume the builder and compile the regular expression.
|
|
///
|
|
/// Note that calling `as_str` on the resulting `Regex` will produce the
|
|
/// pattern given to `new` verbatim. Notably, it will not incorporate any
|
|
/// of the flags set on this builder.
|
|
pub fn build(&self) -> Result<Regex, Error> {
|
|
ExecBuilder::new_options(self.0.clone())
|
|
.only_utf8($only_utf8)
|
|
.build()
|
|
.map(Regex::from)
|
|
}
|
|
|
|
/// Set the value for the case insensitive (`i`) flag.
|
|
///
|
|
/// When enabled, letters in the pattern will match both upper case and
|
|
/// lower case variants.
|
|
pub fn case_insensitive(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexBuilder {
|
|
self.0.case_insensitive = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the multi-line matching (`m`) flag.
|
|
///
|
|
/// When enabled, `^` matches the beginning of lines and `$` matches the
|
|
/// end of lines.
|
|
///
|
|
/// By default, they match beginning/end of the input.
|
|
pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
|
self.0.multi_line = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the any character (`s`) flag, where in `.` matches
|
|
/// anything when `s` is set and matches anything except for new line when
|
|
/// it is not set (the default).
|
|
///
|
|
/// N.B. "matches anything" means "any byte" when Unicode is disabled and
|
|
/// means "any valid UTF-8 encoding of any Unicode scalar value" when
|
|
/// Unicode is enabled.
|
|
pub fn dot_matches_new_line(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexBuilder {
|
|
self.0.dot_matches_new_line = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the greedy swap (`U`) flag.
|
|
///
|
|
/// When enabled, a pattern like `a*` is lazy (tries to find shortest
|
|
/// match) and `a*?` is greedy (tries to find longest match).
|
|
///
|
|
/// By default, `a*` is greedy and `a*?` is lazy.
|
|
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
|
|
self.0.swap_greed = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the ignore whitespace (`x`) flag.
|
|
///
|
|
/// When enabled, whitespace such as new lines and spaces will be ignored
|
|
/// between expressions of the pattern, and `#` can be used to start a
|
|
/// comment until the next new line.
|
|
pub fn ignore_whitespace(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexBuilder {
|
|
self.0.ignore_whitespace = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the Unicode (`u`) flag.
|
|
///
|
|
/// Enabled by default. When disabled, character classes such as `\w` only
|
|
/// match ASCII word characters instead of all Unicode word characters.
|
|
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
|
|
self.0.unicode = yes;
|
|
self
|
|
}
|
|
|
|
/// Whether to support octal syntax or not.
|
|
///
|
|
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
|
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
|
/// `\141` are all equivalent regular expressions, where the last example
|
|
/// shows octal syntax.
|
|
///
|
|
/// While supporting octal syntax isn't in and of itself a problem, it does
|
|
/// make good error messages harder. That is, in PCRE based regex engines,
|
|
/// syntax like `\0` invokes a backreference, which is explicitly
|
|
/// unsupported in Rust's regex engine. However, many users expect it to
|
|
/// be supported. Therefore, when octal support is disabled, the error
|
|
/// message will explicitly mention that backreferences aren't supported.
|
|
///
|
|
/// Octal syntax is disabled by default.
|
|
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
|
|
self.0.octal = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the approximate size limit of the compiled regular expression.
|
|
///
|
|
/// This roughly corresponds to the number of bytes occupied by a single
|
|
/// compiled program. If the program exceeds this number, then a
|
|
/// compilation error is returned.
|
|
pub fn size_limit(
|
|
&mut self,
|
|
limit: usize,
|
|
) -> &mut RegexBuilder {
|
|
self.0.size_limit = limit;
|
|
self
|
|
}
|
|
|
|
/// Set the approximate size of the cache used by the DFA.
|
|
///
|
|
/// This roughly corresponds to the number of bytes that the DFA will
|
|
/// use while searching.
|
|
///
|
|
/// Note that this is a *per thread* limit. There is no way to set a global
|
|
/// limit. In particular, if a regex is used from multiple threads
|
|
/// simultaneously, then each thread may use up to the number of bytes
|
|
/// specified here.
|
|
pub fn dfa_size_limit(
|
|
&mut self,
|
|
limit: usize,
|
|
) -> &mut RegexBuilder {
|
|
self.0.dfa_size_limit = limit;
|
|
self
|
|
}
|
|
|
|
/// Set the nesting limit for this parser.
|
|
///
|
|
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
|
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
|
/// groups), then an error is returned by the parser.
|
|
///
|
|
/// The purpose of this limit is to act as a heuristic to prevent stack
|
|
/// overflow for consumers that do structural induction on an `Ast` using
|
|
/// explicit recursion. While this crate never does this (instead using
|
|
/// constant stack space and moving the call stack to the heap), other
|
|
/// crates may.
|
|
///
|
|
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
|
/// if callers want to put a limit on the amount of heap space used, then
|
|
/// they should impose a limit on the length, in bytes, of the concrete
|
|
/// pattern string. In particular, this is viable since this parser
|
|
/// implementation will limit itself to heap space proportional to the
|
|
/// length of the pattern string.
|
|
///
|
|
/// Note that a nest limit of `0` will return a nest limit error for most
|
|
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
|
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
|
/// depth of `1`. In general, a nest limit is not something that manifests
|
|
/// in an obvious way in the concrete syntax, therefore, it should not be
|
|
/// used in a granular way.
|
|
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
|
|
self.0.nest_limit = limit;
|
|
self
|
|
}
|
|
}
|
|
}
|
|
};
|
|
}
|
|
|
|
define_builder!(bytes, re_bytes, false);
|
|
define_builder!(unicode, re_unicode, true);
|
|
|
|
macro_rules! define_set_builder {
|
|
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
|
pub mod $name {
|
|
use super::RegexOptions;
|
|
use crate::error::Error;
|
|
use crate::exec::ExecBuilder;
|
|
|
|
use crate::re_set::$regex_mod::RegexSet;
|
|
|
|
/// A configurable builder for a set of regular expressions.
|
|
///
|
|
/// A builder can be used to configure how the regexes are built, for example,
|
|
/// by setting the default flags (which can be overridden in the expression
|
|
/// itself) or setting various limits.
|
|
#[derive(Debug)]
|
|
pub struct RegexSetBuilder(RegexOptions);
|
|
|
|
impl RegexSetBuilder {
|
|
/// Create a new regular expression builder with the given pattern.
|
|
///
|
|
/// If the pattern is invalid, then an error will be returned when
|
|
/// `build` is called.
|
|
pub fn new<I, S>(patterns: I) -> RegexSetBuilder
|
|
where
|
|
S: AsRef<str>,
|
|
I: IntoIterator<Item = S>,
|
|
{
|
|
let mut builder = RegexSetBuilder(RegexOptions::default());
|
|
for pat in patterns {
|
|
builder.0.pats.push(pat.as_ref().to_owned());
|
|
}
|
|
builder
|
|
}
|
|
|
|
/// Consume the builder and compile the regular expressions into a set.
|
|
pub fn build(&self) -> Result<RegexSet, Error> {
|
|
ExecBuilder::new_options(self.0.clone())
|
|
.only_utf8($only_utf8)
|
|
.build()
|
|
.map(RegexSet::from)
|
|
}
|
|
|
|
/// Set the value for the case insensitive (`i`) flag.
|
|
pub fn case_insensitive(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.case_insensitive = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the multi-line matching (`m`) flag.
|
|
pub fn multi_line(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.multi_line = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the any character (`s`) flag, where in `.` matches
|
|
/// anything when `s` is set and matches anything except for new line when
|
|
/// it is not set (the default).
|
|
///
|
|
/// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
|
|
/// expressions and means "any Unicode scalar value" for `regex::RegexSet`
|
|
/// expressions.
|
|
pub fn dot_matches_new_line(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.dot_matches_new_line = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the greedy swap (`U`) flag.
|
|
pub fn swap_greed(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.swap_greed = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the ignore whitespace (`x`) flag.
|
|
pub fn ignore_whitespace(
|
|
&mut self,
|
|
yes: bool,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.ignore_whitespace = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the value for the Unicode (`u`) flag.
|
|
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
|
self.0.unicode = yes;
|
|
self
|
|
}
|
|
|
|
/// Whether to support octal syntax or not.
|
|
///
|
|
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
|
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
|
/// `\141` are all equivalent regular expressions, where the last example
|
|
/// shows octal syntax.
|
|
///
|
|
/// While supporting octal syntax isn't in and of itself a problem, it does
|
|
/// make good error messages harder. That is, in PCRE based regex engines,
|
|
/// syntax like `\0` invokes a backreference, which is explicitly
|
|
/// unsupported in Rust's regex engine. However, many users expect it to
|
|
/// be supported. Therefore, when octal support is disabled, the error
|
|
/// message will explicitly mention that backreferences aren't supported.
|
|
///
|
|
/// Octal syntax is disabled by default.
|
|
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
|
|
self.0.octal = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the approximate size limit of the compiled regular expression.
|
|
///
|
|
/// This roughly corresponds to the number of bytes occupied by a single
|
|
/// compiled program. If the program exceeds this number, then a
|
|
/// compilation error is returned.
|
|
pub fn size_limit(
|
|
&mut self,
|
|
limit: usize,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.size_limit = limit;
|
|
self
|
|
}
|
|
|
|
/// Set the approximate size of the cache used by the DFA.
|
|
///
|
|
/// This roughly corresponds to the number of bytes that the DFA will
|
|
/// use while searching.
|
|
///
|
|
/// Note that this is a *per thread* limit. There is no way to set a global
|
|
/// limit. In particular, if a regex is used from multiple threads
|
|
/// simultaneously, then each thread may use up to the number of bytes
|
|
/// specified here.
|
|
pub fn dfa_size_limit(
|
|
&mut self,
|
|
limit: usize,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.dfa_size_limit = limit;
|
|
self
|
|
}
|
|
|
|
/// Set the nesting limit for this parser.
|
|
///
|
|
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
|
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
|
/// groups), then an error is returned by the parser.
|
|
///
|
|
/// The purpose of this limit is to act as a heuristic to prevent stack
|
|
/// overflow for consumers that do structural induction on an `Ast` using
|
|
/// explicit recursion. While this crate never does this (instead using
|
|
/// constant stack space and moving the call stack to the heap), other
|
|
/// crates may.
|
|
///
|
|
/// This limit is not checked until the entire Ast is parsed. Therefore,
|
|
/// if callers want to put a limit on the amount of heap space used, then
|
|
/// they should impose a limit on the length, in bytes, of the concrete
|
|
/// pattern string. In particular, this is viable since this parser
|
|
/// implementation will limit itself to heap space proportional to the
|
|
/// length of the pattern string.
|
|
///
|
|
/// Note that a nest limit of `0` will return a nest limit error for most
|
|
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
|
/// not `ab`, since `ab` requires a concatenation, which results in a nest
|
|
/// depth of `1`. In general, a nest limit is not something that manifests
|
|
/// in an obvious way in the concrete syntax, therefore, it should not be
|
|
/// used in a granular way.
|
|
pub fn nest_limit(
|
|
&mut self,
|
|
limit: u32,
|
|
) -> &mut RegexSetBuilder {
|
|
self.0.nest_limit = limit;
|
|
self
|
|
}
|
|
}
|
|
}
|
|
};
|
|
}
|
|
|
|
define_set_builder!(set_bytes, bytes, false);
|
|
define_set_builder!(set_unicode, unicode, true);
|