1257 lines
46 KiB
Rust
1257 lines
46 KiB
Rust
#[cfg(feature = "std")]
|
|
use core::fmt;
|
|
#[cfg(feature = "std")]
|
|
use core::iter;
|
|
use core::marker::PhantomData;
|
|
use core::mem::size_of;
|
|
#[cfg(feature = "std")]
|
|
use std::collections::HashMap;
|
|
|
|
#[cfg(feature = "std")]
|
|
use byteorder::{BigEndian, LittleEndian};
|
|
use byteorder::{ByteOrder, NativeEndian};
|
|
|
|
use classes::ByteClasses;
|
|
use dense;
|
|
use dfa::DFA;
|
|
#[cfg(feature = "std")]
|
|
use error::{Error, Result};
|
|
#[cfg(feature = "std")]
|
|
use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
|
|
#[cfg(not(feature = "std"))]
|
|
use state_id::{dead_id, StateID};
|
|
|
|
/// A sparse table-based deterministic finite automaton (DFA).
|
|
///
|
|
/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
|
|
/// more space efficient representation for its transition table. Consequently,
|
|
/// sparse DFAs can use much less memory than dense DFAs, but this comes at a
|
|
/// price. In particular, reading the more space efficient transitions takes
|
|
/// more work, and consequently, searching using a sparse DFA is typically
|
|
/// slower than a dense DFA.
|
|
///
|
|
/// A sparse DFA can be built using the default configuration via the
|
|
/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
|
|
/// one can configure various aspects of a dense DFA via
|
|
/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
|
|
/// DFA to a sparse DFA using
|
|
/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
|
|
///
|
|
/// In general, a sparse DFA supports all the same operations as a dense DFA.
|
|
///
|
|
/// Making the choice between a dense and sparse DFA depends on your specific
|
|
/// work load. If you can sacrifice a bit of search time performance, then a
|
|
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
|
|
/// probably always slower than dense DFAs, you may find that they are easily
|
|
/// fast enough for your purposes!
|
|
///
|
|
/// # State size
|
|
///
|
|
/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
|
|
/// the type of the DFA's transition table while `S` corresponds to the
|
|
/// representation used for the DFA's state identifiers as described by the
|
|
/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
|
|
/// `usize`, but other valid choices provided by this crate include `u8`,
|
|
/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
|
|
/// identifier representation than the default is to reduce the amount of
|
|
/// memory used by a DFA. Note though, that if the chosen representation cannot
|
|
/// accommodate the size of your DFA, then building the DFA will fail and
|
|
/// return an error.
|
|
///
|
|
/// While the reduction in heap memory used by a DFA is one reason for choosing
|
|
/// a smaller state identifier representation, another possible reason is for
|
|
/// decreasing the serialization size of a DFA, as returned by
|
|
/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
|
|
/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
|
|
/// or
|
|
/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
|
|
///
|
|
/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
|
|
/// depending on where the transition table is stored. Note that this is
|
|
/// different than a dense DFA, whose transition table is typically
|
|
/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
|
|
/// its transition table from raw bytes because the table is compactly packed.
|
|
///
|
|
/// # Variants
|
|
///
|
|
/// This DFA is defined as a non-exhaustive enumeration of different types of
|
|
/// dense DFAs. All of the variants use the same internal representation
|
|
/// for the transition table, but they vary in how the transition table is
|
|
/// read. A DFA's specific variant depends on the configuration options set via
|
|
/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
|
|
/// `ByteClass`.
|
|
///
|
|
/// # The `DFA` trait
|
|
///
|
|
/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
|
|
/// can be used for searching. For example:
|
|
///
|
|
/// ```
|
|
/// use regex_automata::{DFA, SparseDFA};
|
|
///
|
|
/// # fn example() -> Result<(), regex_automata::Error> {
|
|
/// let dfa = SparseDFA::new("foo[0-9]+")?;
|
|
/// assert_eq!(Some(8), dfa.find(b"foo12345"));
|
|
/// # Ok(()) }; example().unwrap()
|
|
/// ```
|
|
///
|
|
/// The `DFA` trait also provides an assortment of other lower level methods
|
|
/// for DFAs, such as `start_state` and `next_state`. While these are correctly
|
|
/// implemented, it is an anti-pattern to use them in performance sensitive
|
|
/// code on the `SparseDFA` type directly. Namely, each implementation requires
|
|
/// a branch to determine which type of sparse DFA is being used. Instead,
|
|
/// this branch should be pushed up a layer in the code since walking the
|
|
/// transitions of a DFA is usually a hot path. If you do need to use these
|
|
/// lower level methods in performance critical code, then you should match on
|
|
/// the variants of this DFA and use each variant's implementation of the `DFA`
|
|
/// trait directly.
|
|
#[derive(Clone, Debug)]
|
|
pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
|
|
/// A standard DFA that does not use byte classes.
|
|
Standard(Standard<T, S>),
|
|
/// A DFA that shrinks its alphabet to a set of equivalence classes instead
|
|
/// of using all possible byte values. Any two bytes belong to the same
|
|
/// equivalence class if and only if they can be used interchangeably
|
|
/// anywhere in the DFA while never discriminating between a match and a
|
|
/// non-match.
|
|
///
|
|
/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
|
|
/// from using byte classes. In some cases, using byte classes can even
|
|
/// marginally increase the size of a sparse DFA's transition table. The
|
|
/// reason for this is that a sparse DFA already compacts each state's
|
|
/// transitions separate from whether byte classes are used.
|
|
ByteClass(ByteClass<T, S>),
|
|
/// Hints that destructuring should not be exhaustive.
|
|
///
|
|
/// This enum may grow additional variants, so this makes sure clients
|
|
/// don't count on exhaustive matching. (Otherwise, adding a new variant
|
|
/// could break existing code.)
|
|
#[doc(hidden)]
|
|
__Nonexhaustive,
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl SparseDFA<Vec<u8>, usize> {
|
|
/// Parse the given regular expression using a default configuration and
|
|
/// return the corresponding sparse DFA.
|
|
///
|
|
/// The default configuration uses `usize` for state IDs and reduces the
|
|
/// alphabet size by splitting bytes into equivalence classes. The
|
|
/// resulting DFA is *not* minimized.
|
|
///
|
|
/// If you want a non-default configuration, then use the
|
|
/// [`dense::Builder`](dense/struct.Builder.html)
|
|
/// to set your own configuration, and then call
|
|
/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
|
|
/// to create a sparse DFA.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```
|
|
/// use regex_automata::{DFA, SparseDFA};
|
|
///
|
|
/// # fn example() -> Result<(), regex_automata::Error> {
|
|
/// let dfa = SparseDFA::new("foo[0-9]+bar")?;
|
|
/// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
|
|
/// # Ok(()) }; example().unwrap()
|
|
/// ```
|
|
pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
|
|
dense::Builder::new()
|
|
.build(pattern)
|
|
.and_then(|dense| dense.to_sparse())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<S: StateID> SparseDFA<Vec<u8>, S> {
|
|
/// Create a new empty sparse DFA that never matches any input.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// In order to build an empty DFA, callers must provide a type hint
|
|
/// indicating their choice of state identifier representation.
|
|
///
|
|
/// ```
|
|
/// use regex_automata::{DFA, SparseDFA};
|
|
///
|
|
/// # fn example() -> Result<(), regex_automata::Error> {
|
|
/// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
|
|
/// assert_eq!(None, dfa.find(b""));
|
|
/// assert_eq!(None, dfa.find(b"foo"));
|
|
/// # Ok(()) }; example().unwrap()
|
|
/// ```
|
|
pub fn empty() -> SparseDFA<Vec<u8>, S> {
|
|
dense::DenseDFA::empty().to_sparse().unwrap()
|
|
}
|
|
|
|
pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
|
|
dfa: &dense::Repr<T, S>,
|
|
) -> Result<SparseDFA<Vec<u8>, A>> {
|
|
Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
|
|
}
|
|
}
|
|
|
|
impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
|
|
/// Cheaply return a borrowed version of this sparse DFA. Specifically, the
|
|
/// DFA returned always uses `&[u8]` for its transition table while keeping
|
|
/// the same state identifier representation.
|
|
pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
|
|
match *self {
|
|
SparseDFA::Standard(Standard(ref r)) => {
|
|
SparseDFA::Standard(Standard(r.as_ref()))
|
|
}
|
|
SparseDFA::ByteClass(ByteClass(ref r)) => {
|
|
SparseDFA::ByteClass(ByteClass(r.as_ref()))
|
|
}
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
/// Return an owned version of this sparse DFA. Specifically, the DFA
|
|
/// returned always uses `Vec<u8>` for its transition table while keeping
|
|
/// the same state identifier representation.
|
|
///
|
|
/// Effectively, this returns a sparse DFA whose transition table lives
|
|
/// on the heap.
|
|
#[cfg(feature = "std")]
|
|
pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
|
|
match *self {
|
|
SparseDFA::Standard(Standard(ref r)) => {
|
|
SparseDFA::Standard(Standard(r.to_owned()))
|
|
}
|
|
SparseDFA::ByteClass(ByteClass(ref r)) => {
|
|
SparseDFA::ByteClass(ByteClass(r.to_owned()))
|
|
}
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
/// Returns the memory usage, in bytes, of this DFA.
|
|
///
|
|
/// The memory usage is computed based on the number of bytes used to
|
|
/// represent this DFA's transition table. This typically corresponds to
|
|
/// heap memory usage.
|
|
///
|
|
/// This does **not** include the stack size used up by this DFA. To
|
|
/// compute that, used `std::mem::size_of::<SparseDFA>()`.
|
|
pub fn memory_usage(&self) -> usize {
|
|
self.repr().memory_usage()
|
|
}
|
|
|
|
fn repr(&self) -> &Repr<T, S> {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => &r.0,
|
|
SparseDFA::ByteClass(ref r) => &r.0,
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Routines for converting a sparse DFA to other representations, such as
|
|
/// smaller state identifiers or raw bytes suitable for persistent storage.
|
|
#[cfg(feature = "std")]
|
|
impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
|
|
/// Create a new sparse DFA whose match semantics are equivalent to
|
|
/// this DFA, but attempt to use `u8` for the representation of state
|
|
/// identifiers. If `u8` is insufficient to represent all state identifiers
|
|
/// in this DFA, then this returns an error.
|
|
///
|
|
/// This is a convenience routine for `to_sized::<u8>()`.
|
|
pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
|
|
self.to_sized()
|
|
}
|
|
|
|
/// Create a new sparse DFA whose match semantics are equivalent to
|
|
/// this DFA, but attempt to use `u16` for the representation of state
|
|
/// identifiers. If `u16` is insufficient to represent all state
|
|
/// identifiers in this DFA, then this returns an error.
|
|
///
|
|
/// This is a convenience routine for `to_sized::<u16>()`.
|
|
pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
|
|
self.to_sized()
|
|
}
|
|
|
|
/// Create a new sparse DFA whose match semantics are equivalent to
|
|
/// this DFA, but attempt to use `u32` for the representation of state
|
|
/// identifiers. If `u32` is insufficient to represent all state
|
|
/// identifiers in this DFA, then this returns an error.
|
|
///
|
|
/// This is a convenience routine for `to_sized::<u32>()`.
|
|
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
|
|
pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
|
|
self.to_sized()
|
|
}
|
|
|
|
/// Create a new sparse DFA whose match semantics are equivalent to
|
|
/// this DFA, but attempt to use `u64` for the representation of state
|
|
/// identifiers. If `u64` is insufficient to represent all state
|
|
/// identifiers in this DFA, then this returns an error.
|
|
///
|
|
/// This is a convenience routine for `to_sized::<u64>()`.
|
|
#[cfg(target_pointer_width = "64")]
|
|
pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
|
|
self.to_sized()
|
|
}
|
|
|
|
/// Create a new sparse DFA whose match semantics are equivalent to
|
|
/// this DFA, but attempt to use `A` for the representation of state
|
|
/// identifiers. If `A` is insufficient to represent all state identifiers
|
|
/// in this DFA, then this returns an error.
|
|
///
|
|
/// An alternative way to construct such a DFA is to use
|
|
/// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
|
|
/// In general, picking the appropriate size upon initial construction of
|
|
/// a sparse DFA is preferred, since it will do the conversion in one
|
|
/// step instead of two.
|
|
pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
|
|
self.repr().to_sized().map(|r| r.into_sparse_dfa())
|
|
}
|
|
|
|
/// Serialize a sparse DFA to raw bytes in little endian format.
|
|
///
|
|
/// If the state identifier representation of this DFA has a size different
|
|
/// than 1, 2, 4 or 8 bytes, then this returns an error. All
|
|
/// implementations of `StateID` provided by this crate satisfy this
|
|
/// requirement.
|
|
pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
|
|
self.repr().to_bytes::<LittleEndian>()
|
|
}
|
|
|
|
/// Serialize a sparse DFA to raw bytes in big endian format.
|
|
///
|
|
/// If the state identifier representation of this DFA has a size different
|
|
/// than 1, 2, 4 or 8 bytes, then this returns an error. All
|
|
/// implementations of `StateID` provided by this crate satisfy this
|
|
/// requirement.
|
|
pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
|
|
self.repr().to_bytes::<BigEndian>()
|
|
}
|
|
|
|
/// Serialize a sparse DFA to raw bytes in native endian format.
|
|
/// Generally, it is better to pick an explicit endianness using either
|
|
/// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is
|
|
/// useful in tests where the DFA is serialized and deserialized on the
|
|
/// same platform.
|
|
///
|
|
/// If the state identifier representation of this DFA has a size different
|
|
/// than 1, 2, 4 or 8 bytes, then this returns an error. All
|
|
/// implementations of `StateID` provided by this crate satisfy this
|
|
/// requirement.
|
|
pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
|
|
self.repr().to_bytes::<NativeEndian>()
|
|
}
|
|
}
|
|
|
|
impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
|
|
/// Deserialize a sparse DFA with a specific state identifier
|
|
/// representation.
|
|
///
|
|
/// Deserializing a DFA using this routine will never allocate heap memory.
|
|
/// This is also guaranteed to be a constant time operation that does not
|
|
/// vary with the size of the DFA.
|
|
///
|
|
/// The bytes given should be generated by the serialization of a DFA with
|
|
/// either the
|
|
/// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
|
|
/// method or the
|
|
/// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
|
|
/// endian, depending on the endianness of the machine you are
|
|
/// deserializing this DFA from.
|
|
///
|
|
/// If the state identifier representation is `usize`, then deserialization
|
|
/// is dependent on the pointer size. For this reason, it is best to
|
|
/// serialize DFAs using a fixed size representation for your state
|
|
/// identifiers, such as `u8`, `u16`, `u32` or `u64`.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// The bytes given should be *trusted*. In particular, if the bytes
|
|
/// are not a valid serialization of a DFA, or if the endianness of the
|
|
/// serialized bytes is different than the endianness of the machine that
|
|
/// is deserializing the DFA, then this routine will panic. Moreover, it
|
|
/// is possible for this deserialization routine to succeed even if the
|
|
/// given bytes do not represent a valid serialized sparse DFA.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// This routine is unsafe because it permits callers to provide an
|
|
/// arbitrary transition table with possibly incorrect transitions. While
|
|
/// the various serialization routines will never return an incorrect
|
|
/// transition table, there is no guarantee that the bytes provided here
|
|
/// are correct. While deserialization does many checks (as documented
|
|
/// above in the panic conditions), this routine does not check that the
|
|
/// transition table is correct. Given an incorrect transition table, it is
|
|
/// possible for the search routines to access out-of-bounds memory because
|
|
/// of explicit bounds check elision.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// This example shows how to serialize a DFA to raw bytes, deserialize it
|
|
/// and then use it for searching. Note that we first convert the DFA to
|
|
/// using `u16` for its state identifier representation before serializing
|
|
/// it. While this isn't strictly necessary, it's good practice in order to
|
|
/// decrease the size of the DFA and to avoid platform specific pitfalls
|
|
/// such as differing pointer sizes.
|
|
///
|
|
/// ```
|
|
/// use regex_automata::{DFA, DenseDFA, SparseDFA};
|
|
///
|
|
/// # fn example() -> Result<(), regex_automata::Error> {
|
|
/// let sparse = SparseDFA::new("foo[0-9]+")?;
|
|
/// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
|
|
///
|
|
/// let dfa: SparseDFA<&[u8], u16> = unsafe {
|
|
/// SparseDFA::from_bytes(&bytes)
|
|
/// };
|
|
///
|
|
/// assert_eq!(Some(8), dfa.find(b"foo12345"));
|
|
/// # Ok(()) }; example().unwrap()
|
|
/// ```
|
|
pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
|
|
Repr::from_bytes(buf).into_sparse_dfa()
|
|
}
|
|
}
|
|
|
|
impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
|
|
type ID = S;
|
|
|
|
#[inline]
|
|
fn start_state(&self) -> S {
|
|
self.repr().start_state()
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_state(&self, id: S) -> bool {
|
|
self.repr().is_match_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_dead_state(&self, id: S) -> bool {
|
|
self.repr().is_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_or_dead_state(&self, id: S) -> bool {
|
|
self.repr().is_match_or_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_anchored(&self) -> bool {
|
|
self.repr().is_anchored()
|
|
}
|
|
|
|
#[inline]
|
|
fn next_state(&self, current: S, input: u8) -> S {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => r.next_state(current, input),
|
|
SparseDFA::ByteClass(ref r) => r.next_state(current, input),
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
|
self.next_state(current, input)
|
|
}
|
|
|
|
// We specialize the following methods because it lets us lift the
|
|
// case analysis between the different types of sparse DFAs. Instead of
|
|
// doing the case analysis for every transition, we do it once before
|
|
// searching. For sparse DFAs, this doesn't seem to benefit performance as
|
|
// much as it does for the dense DFAs, but it's easy to do so we might as
|
|
// well do it.
|
|
|
|
#[inline]
|
|
fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
|
|
SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
|
|
SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => r.find_at(bytes, start),
|
|
SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
|
match *self {
|
|
SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
|
|
SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
|
|
SparseDFA::__Nonexhaustive => unreachable!(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// A standard sparse DFA that does not use premultiplication or byte classes.
|
|
///
|
|
/// Generally, it isn't necessary to use this type directly, since a
|
|
/// `SparseDFA` can be used for searching directly. One possible reason why
|
|
/// one might want to use this type directly is if you are implementing your
|
|
/// own search routines by walking a DFA's transitions directly. In that case,
|
|
/// you'll want to use this type (or any of the other DFA variant types)
|
|
/// directly, since they implement `next_state` more efficiently.
|
|
#[derive(Clone, Debug)]
|
|
pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
|
|
|
|
impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
|
|
type ID = S;
|
|
|
|
#[inline]
|
|
fn start_state(&self) -> S {
|
|
self.0.start_state()
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_state(&self, id: S) -> bool {
|
|
self.0.is_match_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_dead_state(&self, id: S) -> bool {
|
|
self.0.is_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_or_dead_state(&self, id: S) -> bool {
|
|
self.0.is_match_or_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_anchored(&self) -> bool {
|
|
self.0.is_anchored()
|
|
}
|
|
|
|
#[inline]
|
|
fn next_state(&self, current: S, input: u8) -> S {
|
|
self.0.state(current).next(input)
|
|
}
|
|
|
|
#[inline]
|
|
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
|
self.next_state(current, input)
|
|
}
|
|
}
|
|
|
|
/// A sparse DFA that shrinks its alphabet.
|
|
///
|
|
/// Alphabet shrinking is achieved by using a set of equivalence classes
|
|
/// instead of using all possible byte values. Any two bytes belong to the same
|
|
/// equivalence class if and only if they can be used interchangeably anywhere
|
|
/// in the DFA while never discriminating between a match and a non-match.
|
|
///
|
|
/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
|
|
/// using byte classes. In some cases, using byte classes can even marginally
|
|
/// increase the size of a sparse DFA's transition table. The reason for this
|
|
/// is that a sparse DFA already compacts each state's transitions separate
|
|
/// from whether byte classes are used.
|
|
///
|
|
/// Generally, it isn't necessary to use this type directly, since a
|
|
/// `SparseDFA` can be used for searching directly. One possible reason why
|
|
/// one might want to use this type directly is if you are implementing your
|
|
/// own search routines by walking a DFA's transitions directly. In that case,
|
|
/// you'll want to use this type (or any of the other DFA variant types)
|
|
/// directly, since they implement `next_state` more efficiently.
|
|
#[derive(Clone, Debug)]
|
|
pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
|
|
|
|
impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
|
|
type ID = S;
|
|
|
|
#[inline]
|
|
fn start_state(&self) -> S {
|
|
self.0.start_state()
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_state(&self, id: S) -> bool {
|
|
self.0.is_match_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_dead_state(&self, id: S) -> bool {
|
|
self.0.is_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_match_or_dead_state(&self, id: S) -> bool {
|
|
self.0.is_match_or_dead_state(id)
|
|
}
|
|
|
|
#[inline]
|
|
fn is_anchored(&self) -> bool {
|
|
self.0.is_anchored()
|
|
}
|
|
|
|
#[inline]
|
|
fn next_state(&self, current: S, input: u8) -> S {
|
|
let input = self.0.byte_classes.get(input);
|
|
self.0.state(current).next(input)
|
|
}
|
|
|
|
#[inline]
|
|
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
|
self.next_state(current, input)
|
|
}
|
|
}
|
|
|
|
/// The underlying representation of a sparse DFA. This is shared by all of
|
|
/// the different variants of a sparse DFA.
|
|
#[derive(Clone)]
|
|
#[cfg_attr(not(feature = "std"), derive(Debug))]
|
|
struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
|
|
anchored: bool,
|
|
start: S,
|
|
state_count: usize,
|
|
max_match: S,
|
|
byte_classes: ByteClasses,
|
|
trans: T,
|
|
}
|
|
|
|
impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
|
|
fn into_sparse_dfa(self) -> SparseDFA<T, S> {
|
|
if self.byte_classes.is_singleton() {
|
|
SparseDFA::Standard(Standard(self))
|
|
} else {
|
|
SparseDFA::ByteClass(ByteClass(self))
|
|
}
|
|
}
|
|
|
|
fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
|
|
Repr {
|
|
anchored: self.anchored,
|
|
start: self.start,
|
|
state_count: self.state_count,
|
|
max_match: self.max_match,
|
|
byte_classes: self.byte_classes.clone(),
|
|
trans: self.trans(),
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
fn to_owned(&self) -> Repr<Vec<u8>, S> {
|
|
Repr {
|
|
anchored: self.anchored,
|
|
start: self.start,
|
|
state_count: self.state_count,
|
|
max_match: self.max_match,
|
|
byte_classes: self.byte_classes.clone(),
|
|
trans: self.trans().to_vec(),
|
|
}
|
|
}
|
|
|
|
/// Return a convenient representation of the given state.
|
|
///
|
|
/// This is marked as inline because it doesn't seem to get inlined
|
|
/// otherwise, which leads to a fairly significant performance loss (~25%).
|
|
#[inline]
|
|
fn state<'a>(&'a self, id: S) -> State<'a, S> {
|
|
let mut pos = id.to_usize();
|
|
let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
|
|
pos += 2;
|
|
let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
|
|
pos += 2 * ntrans;
|
|
let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
|
|
State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
|
|
}
|
|
|
|
/// Return an iterator over all of the states in this DFA.
|
|
///
|
|
/// The iterator returned yields tuples, where the first element is the
|
|
/// state ID and the second element is the state itself.
|
|
#[cfg(feature = "std")]
|
|
fn states<'a>(&'a self) -> StateIter<'a, T, S> {
|
|
StateIter { dfa: self, id: dead_id() }
|
|
}
|
|
|
|
fn memory_usage(&self) -> usize {
|
|
self.trans().len()
|
|
}
|
|
|
|
fn start_state(&self) -> S {
|
|
self.start
|
|
}
|
|
|
|
fn is_match_state(&self, id: S) -> bool {
|
|
self.is_match_or_dead_state(id) && !self.is_dead_state(id)
|
|
}
|
|
|
|
fn is_dead_state(&self, id: S) -> bool {
|
|
id == dead_id()
|
|
}
|
|
|
|
fn is_match_or_dead_state(&self, id: S) -> bool {
|
|
id <= self.max_match
|
|
}
|
|
|
|
fn is_anchored(&self) -> bool {
|
|
self.anchored
|
|
}
|
|
|
|
fn trans(&self) -> &[u8] {
|
|
self.trans.as_ref()
|
|
}
|
|
|
|
/// Create a new sparse DFA whose match semantics are equivalent to this
|
|
/// DFA, but attempt to use `A` for the representation of state
|
|
/// identifiers. If `A` is insufficient to represent all state identifiers
|
|
/// in this DFA, then this returns an error.
|
|
#[cfg(feature = "std")]
|
|
fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
|
|
// To build the new DFA, we proceed much like the initial construction
|
|
// of the sparse DFA. Namely, since the state ID size is changing,
|
|
// we don't actually know all of our state IDs until we've allocated
|
|
// all necessary space. So we do one pass that allocates all of the
|
|
// storage we need, and then another pass to fill in the transitions.
|
|
|
|
let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
|
|
let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
|
|
for (old_id, state) in self.states() {
|
|
let pos = trans.len();
|
|
map.insert(old_id, usize_to_state_id(pos)?);
|
|
|
|
let n = state.ntrans;
|
|
let zeros = 2 + (n * 2) + (n * size_of::<A>());
|
|
trans.extend(iter::repeat(0).take(zeros));
|
|
|
|
NativeEndian::write_u16(&mut trans[pos..], n as u16);
|
|
let (s, e) = (pos + 2, pos + 2 + (n * 2));
|
|
trans[s..e].copy_from_slice(state.input_ranges);
|
|
}
|
|
|
|
let mut new = Repr {
|
|
anchored: self.anchored,
|
|
start: map[&self.start],
|
|
state_count: self.state_count,
|
|
max_match: map[&self.max_match],
|
|
byte_classes: self.byte_classes.clone(),
|
|
trans,
|
|
};
|
|
for (&old_id, &new_id) in map.iter() {
|
|
let old_state = self.state(old_id);
|
|
let mut new_state = new.state_mut(new_id);
|
|
for i in 0..new_state.ntrans {
|
|
let next = map[&old_state.next_at(i)];
|
|
new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
|
|
}
|
|
}
|
|
new.start = map[&self.start];
|
|
new.max_match = map[&self.max_match];
|
|
Ok(new)
|
|
}
|
|
|
|
/// Serialize a sparse DFA to raw bytes using the provided endianness.
|
|
///
|
|
/// If the state identifier representation of this DFA has a size different
|
|
/// than 1, 2, 4 or 8 bytes, then this returns an error. All
|
|
/// implementations of `StateID` provided by this crate satisfy this
|
|
/// requirement.
|
|
///
|
|
/// Unlike dense DFAs, the result is not necessarily aligned since a
|
|
/// sparse DFA's transition table is always read as a sequence of bytes.
|
|
#[cfg(feature = "std")]
|
|
fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
|
|
let label = b"rust-regex-automata-sparse-dfa\x00";
|
|
let size =
|
|
// For human readable label.
|
|
label.len()
|
|
// endiannes check, must be equal to 0xFEFF for native endian
|
|
+ 2
|
|
// For version number.
|
|
+ 2
|
|
// Size of state ID representation, in bytes.
|
|
// Must be 1, 2, 4 or 8.
|
|
+ 2
|
|
// For DFA misc options. (Currently unused.)
|
|
+ 2
|
|
// For start state.
|
|
+ 8
|
|
// For state count.
|
|
+ 8
|
|
// For max match state.
|
|
+ 8
|
|
// For byte class map.
|
|
+ 256
|
|
// For transition table.
|
|
+ self.trans().len();
|
|
|
|
let mut i = 0;
|
|
let mut buf = vec![0; size];
|
|
|
|
// write label
|
|
for &b in label {
|
|
buf[i] = b;
|
|
i += 1;
|
|
}
|
|
// endianness check
|
|
A::write_u16(&mut buf[i..], 0xFEFF);
|
|
i += 2;
|
|
// version number
|
|
A::write_u16(&mut buf[i..], 1);
|
|
i += 2;
|
|
// size of state ID
|
|
let state_size = size_of::<S>();
|
|
if ![1, 2, 4, 8].contains(&state_size) {
|
|
return Err(Error::serialize(&format!(
|
|
"state size of {} not supported, must be 1, 2, 4 or 8",
|
|
state_size
|
|
)));
|
|
}
|
|
A::write_u16(&mut buf[i..], state_size as u16);
|
|
i += 2;
|
|
// DFA misc options
|
|
let mut options = 0u16;
|
|
if self.anchored {
|
|
options |= dense::MASK_ANCHORED;
|
|
}
|
|
A::write_u16(&mut buf[i..], options);
|
|
i += 2;
|
|
// start state
|
|
A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
|
|
i += 8;
|
|
// state count
|
|
A::write_u64(&mut buf[i..], self.state_count as u64);
|
|
i += 8;
|
|
// max match state
|
|
A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
|
|
i += 8;
|
|
// byte class map
|
|
for b in (0..256).map(|b| b as u8) {
|
|
buf[i] = self.byte_classes.get(b);
|
|
i += 1;
|
|
}
|
|
// transition table
|
|
for (_, state) in self.states() {
|
|
A::write_u16(&mut buf[i..], state.ntrans as u16);
|
|
i += 2;
|
|
buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
|
|
i += state.ntrans * 2;
|
|
for j in 0..state.ntrans {
|
|
write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
|
|
i += size_of::<S>();
|
|
}
|
|
}
|
|
|
|
assert_eq!(size, i, "expected to consume entire buffer");
|
|
|
|
Ok(buf)
|
|
}
|
|
}
|
|
|
|
impl<'a, S: StateID> Repr<&'a [u8], S> {
|
|
/// The implementation for deserializing a sparse DFA from raw bytes.
|
|
unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
|
|
// skip over label
|
|
match buf.iter().position(|&b| b == b'\x00') {
|
|
None => panic!("could not find label"),
|
|
Some(i) => buf = &buf[i + 1..],
|
|
}
|
|
|
|
// check that current endianness is same as endianness of DFA
|
|
let endian_check = NativeEndian::read_u16(buf);
|
|
buf = &buf[2..];
|
|
if endian_check != 0xFEFF {
|
|
panic!(
|
|
"endianness mismatch, expected 0xFEFF but got 0x{:X}. \
|
|
are you trying to load a SparseDFA serialized with a \
|
|
different endianness?",
|
|
endian_check,
|
|
);
|
|
}
|
|
|
|
// check that the version number is supported
|
|
let version = NativeEndian::read_u16(buf);
|
|
buf = &buf[2..];
|
|
if version != 1 {
|
|
panic!(
|
|
"expected version 1, but found unsupported version {}",
|
|
version,
|
|
);
|
|
}
|
|
|
|
// read size of state
|
|
let state_size = NativeEndian::read_u16(buf) as usize;
|
|
if state_size != size_of::<S>() {
|
|
panic!(
|
|
"state size of SparseDFA ({}) does not match \
|
|
requested state size ({})",
|
|
state_size,
|
|
size_of::<S>(),
|
|
);
|
|
}
|
|
buf = &buf[2..];
|
|
|
|
// read miscellaneous options
|
|
let opts = NativeEndian::read_u16(buf);
|
|
buf = &buf[2..];
|
|
|
|
// read start state
|
|
let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
|
|
buf = &buf[8..];
|
|
|
|
// read state count
|
|
let state_count = NativeEndian::read_u64(buf) as usize;
|
|
buf = &buf[8..];
|
|
|
|
// read max match state
|
|
let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
|
|
buf = &buf[8..];
|
|
|
|
// read byte classes
|
|
let byte_classes = ByteClasses::from_slice(&buf[..256]);
|
|
buf = &buf[256..];
|
|
|
|
Repr {
|
|
anchored: opts & dense::MASK_ANCHORED > 0,
|
|
start,
|
|
state_count,
|
|
max_match,
|
|
byte_classes,
|
|
trans: buf,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<S: StateID> Repr<Vec<u8>, S> {
|
|
/// The implementation for constructing a sparse DFA from a dense DFA.
|
|
fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
|
|
dfa: &dense::Repr<T, S>,
|
|
) -> Result<Repr<Vec<u8>, A>> {
|
|
// In order to build the transition table, we need to be able to write
|
|
// state identifiers for each of the "next" transitions in each state.
|
|
// Our state identifiers correspond to the byte offset in the
|
|
// transition table at which the state is encoded. Therefore, we do not
|
|
// actually know what the state identifiers are until we've allocated
|
|
// exactly as much space as we need for each state. Thus, construction
|
|
// of the transition table happens in two passes.
|
|
//
|
|
// In the first pass, we fill out the shell of each state, which
|
|
// includes the transition count, the input byte ranges and zero-filled
|
|
// space for the transitions. In this first pass, we also build up a
|
|
// map from the state identifier index of the dense DFA to the state
|
|
// identifier in this sparse DFA.
|
|
//
|
|
// In the second pass, we fill in the transitions based on the map
|
|
// built in the first pass.
|
|
|
|
let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
|
|
let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
|
|
for (old_id, state) in dfa.states() {
|
|
let pos = trans.len();
|
|
|
|
remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
|
|
// zero-filled space for the transition count
|
|
trans.push(0);
|
|
trans.push(0);
|
|
|
|
let mut trans_count = 0;
|
|
for (b1, b2, _) in state.sparse_transitions() {
|
|
trans_count += 1;
|
|
trans.push(b1);
|
|
trans.push(b2);
|
|
}
|
|
// fill in the transition count
|
|
NativeEndian::write_u16(&mut trans[pos..], trans_count);
|
|
|
|
// zero-fill the actual transitions
|
|
let zeros = trans_count as usize * size_of::<A>();
|
|
trans.extend(iter::repeat(0).take(zeros));
|
|
}
|
|
|
|
let mut new = Repr {
|
|
anchored: dfa.is_anchored(),
|
|
start: remap[dfa.state_id_to_index(dfa.start_state())],
|
|
state_count: dfa.state_count(),
|
|
max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
|
|
byte_classes: dfa.byte_classes().clone(),
|
|
trans,
|
|
};
|
|
for (old_id, old_state) in dfa.states() {
|
|
let new_id = remap[dfa.state_id_to_index(old_id)];
|
|
let mut new_state = new.state_mut(new_id);
|
|
let sparse = old_state.sparse_transitions();
|
|
for (i, (_, _, next)) in sparse.enumerate() {
|
|
let next = remap[dfa.state_id_to_index(next)];
|
|
new_state.set_next_at(i, next);
|
|
}
|
|
}
|
|
Ok(new)
|
|
}
|
|
|
|
/// Return a convenient mutable representation of the given state.
|
|
fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
|
|
let mut pos = id.to_usize();
|
|
let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
|
|
pos += 2;
|
|
|
|
let size = (ntrans * 2) + (ntrans * size_of::<S>());
|
|
let ranges_and_next = &mut self.trans[pos..pos + size];
|
|
let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
|
|
StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
fn state_status<T: AsRef<[u8]>, S: StateID>(
|
|
dfa: &Repr<T, S>,
|
|
id: S,
|
|
) -> &'static str {
|
|
if id == dead_id() {
|
|
if dfa.is_match_state(id) {
|
|
"D*"
|
|
} else {
|
|
"D "
|
|
}
|
|
} else if id == dfa.start_state() {
|
|
if dfa.is_match_state(id) {
|
|
">*"
|
|
} else {
|
|
"> "
|
|
}
|
|
} else {
|
|
if dfa.is_match_state(id) {
|
|
" *"
|
|
} else {
|
|
" "
|
|
}
|
|
}
|
|
}
|
|
|
|
writeln!(f, "SparseDFA(")?;
|
|
for (id, state) in self.states() {
|
|
let status = state_status(self, id);
|
|
writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
|
|
}
|
|
writeln!(f, ")")?;
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// An iterator over all states in a sparse DFA.
|
|
///
|
|
/// This iterator yields tuples, where the first element is the state ID and
|
|
/// the second element is the state itself.
|
|
#[cfg(feature = "std")]
|
|
#[derive(Debug)]
|
|
struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
|
|
dfa: &'a Repr<T, S>,
|
|
id: S,
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
|
|
type Item = (S, State<'a, S>);
|
|
|
|
fn next(&mut self) -> Option<(S, State<'a, S>)> {
|
|
if self.id.to_usize() >= self.dfa.trans().len() {
|
|
return None;
|
|
}
|
|
let id = self.id;
|
|
let state = self.dfa.state(id);
|
|
self.id = S::from_usize(self.id.to_usize() + state.bytes());
|
|
Some((id, state))
|
|
}
|
|
}
|
|
|
|
/// A representation of a sparse DFA state that can be cheaply materialized
|
|
/// from a state identifier.
|
|
#[derive(Clone)]
|
|
struct State<'a, S: StateID = usize> {
|
|
/// The state identifier representation used by the DFA from which this
|
|
/// state was extracted. Since our transition table is compacted in a
|
|
/// &[u8], we don't actually use the state ID type parameter explicitly
|
|
/// anywhere, so we fake it. This prevents callers from using an incorrect
|
|
/// state ID representation to read from this state.
|
|
_state_id_repr: PhantomData<S>,
|
|
/// The number of transitions in this state.
|
|
ntrans: usize,
|
|
/// Pairs of input ranges, where there is one pair for each transition.
|
|
/// Each pair specifies an inclusive start and end byte range for the
|
|
/// corresponding transition.
|
|
input_ranges: &'a [u8],
|
|
/// Transitions to the next state. This slice contains native endian
|
|
/// encoded state identifiers, with `S` as the representation. Thus, there
|
|
/// are `ntrans * size_of::<S>()` bytes in this slice.
|
|
next: &'a [u8],
|
|
}
|
|
|
|
impl<'a, S: StateID> State<'a, S> {
|
|
/// Searches for the next transition given an input byte. If no such
|
|
/// transition could be found, then a dead state is returned.
|
|
fn next(&self, input: u8) -> S {
|
|
// This straight linear search was observed to be much better than
|
|
// binary search on ASCII haystacks, likely because a binary search
|
|
// visits the ASCII case last but a linear search sees it first. A
|
|
// binary search does do a little better on non-ASCII haystacks, but
|
|
// not by much. There might be a better trade off lurking here.
|
|
for i in 0..self.ntrans {
|
|
let (start, end) = self.range(i);
|
|
if start <= input && input <= end {
|
|
return self.next_at(i);
|
|
}
|
|
// We could bail early with an extra branch: if input < b1, then
|
|
// we know we'll never find a matching transition. Interestingly,
|
|
// this extra branch seems to not help performance, or will even
|
|
// hurt it. It's likely very dependent on the DFA itself and what
|
|
// is being searched.
|
|
}
|
|
dead_id()
|
|
}
|
|
|
|
/// Returns the inclusive input byte range for the ith transition in this
|
|
/// state.
|
|
fn range(&self, i: usize) -> (u8, u8) {
|
|
(self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
|
|
}
|
|
|
|
/// Returns the next state for the ith transition in this state.
|
|
fn next_at(&self, i: usize) -> S {
|
|
S::read_bytes(&self.next[i * size_of::<S>()..])
|
|
}
|
|
|
|
/// Return the total number of bytes that this state consumes in its
|
|
/// encoded form.
|
|
#[cfg(feature = "std")]
|
|
fn bytes(&self) -> usize {
|
|
2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<'a, S: StateID> fmt::Debug for State<'a, S> {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
let mut transitions = vec![];
|
|
for i in 0..self.ntrans {
|
|
let next = self.next_at(i);
|
|
if next == dead_id() {
|
|
continue;
|
|
}
|
|
|
|
let (start, end) = self.range(i);
|
|
if start == end {
|
|
transitions.push(format!(
|
|
"{} => {}",
|
|
escape(start),
|
|
next.to_usize()
|
|
));
|
|
} else {
|
|
transitions.push(format!(
|
|
"{}-{} => {}",
|
|
escape(start),
|
|
escape(end),
|
|
next.to_usize(),
|
|
));
|
|
}
|
|
}
|
|
write!(f, "{}", transitions.join(", "))
|
|
}
|
|
}
|
|
|
|
/// A representation of a mutable sparse DFA state that can be cheaply
|
|
/// materialized from a state identifier.
|
|
#[cfg(feature = "std")]
|
|
struct StateMut<'a, S: StateID = usize> {
|
|
/// The state identifier representation used by the DFA from which this
|
|
/// state was extracted. Since our transition table is compacted in a
|
|
/// &[u8], we don't actually use the state ID type parameter explicitly
|
|
/// anywhere, so we fake it. This prevents callers from using an incorrect
|
|
/// state ID representation to read from this state.
|
|
_state_id_repr: PhantomData<S>,
|
|
/// The number of transitions in this state.
|
|
ntrans: usize,
|
|
/// Pairs of input ranges, where there is one pair for each transition.
|
|
/// Each pair specifies an inclusive start and end byte range for the
|
|
/// corresponding transition.
|
|
input_ranges: &'a mut [u8],
|
|
/// Transitions to the next state. This slice contains native endian
|
|
/// encoded state identifiers, with `S` as the representation. Thus, there
|
|
/// are `ntrans * size_of::<S>()` bytes in this slice.
|
|
next: &'a mut [u8],
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<'a, S: StateID> StateMut<'a, S> {
|
|
/// Sets the ith transition to the given state.
|
|
fn set_next_at(&mut self, i: usize, next: S) {
|
|
next.write_bytes(&mut self.next[i * size_of::<S>()..]);
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "std")]
|
|
impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
let state = State {
|
|
_state_id_repr: self._state_id_repr,
|
|
ntrans: self.ntrans,
|
|
input_ranges: self.input_ranges,
|
|
next: self.next,
|
|
};
|
|
fmt::Debug::fmt(&state, f)
|
|
}
|
|
}
|
|
|
|
/// Return the given byte as its escaped string form.
|
|
#[cfg(feature = "std")]
|
|
fn escape(b: u8) -> String {
|
|
use std::ascii;
|
|
|
|
String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
|
|
}
|
|
|
|
/// A binary search routine specialized specifically to a sparse DFA state's
|
|
/// transitions. Specifically, the transitions are defined as a set of pairs
|
|
/// of input bytes that delineate an inclusive range of bytes. If the input
|
|
/// byte is in the range, then the corresponding transition is a match.
|
|
///
|
|
/// This binary search accepts a slice of these pairs and returns the position
|
|
/// of the matching pair (the ith transition), or None if no matching pair
|
|
/// could be found.
|
|
///
|
|
/// Note that this routine is not currently used since it was observed to
|
|
/// either decrease performance when searching ASCII, or did not provide enough
|
|
/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
|
|
/// for posterity in case we can find a way to use it.
|
|
///
|
|
/// In theory, we could use the standard library's search routine if we could
|
|
/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
|
|
/// guaranteed to be safe and is thus UB (since I don't think the in-memory
|
|
/// representation of `(u8, u8)` has been nailed down).
|
|
#[inline(always)]
|
|
#[allow(dead_code)]
|
|
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
|
|
debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
|
|
debug_assert!(ranges.len() <= 512, "ranges should be short");
|
|
|
|
let (mut left, mut right) = (0, ranges.len() / 2);
|
|
while left < right {
|
|
let mid = (left + right) / 2;
|
|
let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
|
|
if needle < b1 {
|
|
right = mid;
|
|
} else if needle > b2 {
|
|
left = mid + 1;
|
|
} else {
|
|
return Some(mid);
|
|
}
|
|
}
|
|
None
|
|
}
|