From 495751c215acc7f72b913ff2fe720679a4bf104e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 1 Jul 2025 21:28:34 +0200 Subject: [PATCH 01/62] Add generic trait for method parameters --- .gitignore | 5 +- src/fast_automaton/operation/alternation.rs | 23 +++++-- src/fast_automaton/operation/concatenate.rs | 21 ++++++- src/fast_automaton/operation/intersection.rs | 34 +++++++++-- src/lib.rs | 34 ++++++----- src/regex/mod.rs | 8 +-- src/regex/operation/union.rs | 64 +++++++++++++++----- src/traits.rs | 54 +++++++++++++++++ 8 files changed, 193 insertions(+), 50 deletions(-) create mode 100644 src/traits.rs diff --git a/.gitignore b/.gitignore index d01bd1a..bf7ff1c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ \ No newline at end of file +#.idea/ + +# cargo mutants output +mutants.out*/ \ No newline at end of file diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 06c386e..90437d1 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -2,18 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::error::EngineError; +use crate::{error::EngineError, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn union(&self, that: &FastAutomaton) -> Result { - let mut union = self.clone(); - union.alternate(that)?; - Ok(union) + pub fn union<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = self.clone(); + + for other in others.parameters() { + result.alternate(other)?; + + if result.is_total() { + break; + } + } + + Ok(result) } - pub fn alternation(automatons: Vec) -> Result { + pub fn alternation(automatons: &Vec) -> Result { if automatons.len() == 1 { return Ok(automatons[0].clone()); } diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 3741e01..e39642f 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -2,12 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::error::EngineError; +use crate::{error::EngineError, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn concatenate(automatons: Vec) -> Result { + pub fn concatenation<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = self.clone(); + + for other in others.parameters() { + result.concat(other)?; + + if result.is_total() { + break; + } + } + + Ok(result) + } + + pub fn concatenate(automatons: &Vec) -> Result { if automatons.len() == 1 { return 
Ok(automatons[0].clone()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 96007e6..4483199 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,17 +1,39 @@ +use std::borrow::Cow; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, execution_profile::ThreadLocalParams, traits::MethodParameters}; use super::*; impl FastAutomaton { - pub fn intersection(&self, other: &FastAutomaton) -> Result { + pub fn intersection<'o, S>(&self, others: S) -> Result + where + S: MethodParameters<'o, FastAutomaton>, + { + let mut result = Cow::Borrowed(self); + + for other in others.parameters() { + result = result.intersection_(other)?; + + if result.is_empty() { + break; + } + } + + Ok(result.into_owned()) + } + + fn intersection_<'a>( + &self, + other: &'a FastAutomaton, + ) -> Result, EngineError> { if self.is_empty() || other.is_empty() { - return Ok(Self::new_empty()); + return Ok(Cow::Owned(Self::new_empty())); } else if self.is_total() { - return Ok(other.clone()); + return Ok(Cow::Borrowed(other)); } else if other.is_total() { - return Ok(self.clone()); + return Ok(Cow::Owned(self.clone())); } let execution_profile = ThreadLocalParams::get_execution_profile(); @@ -70,7 +92,7 @@ impl FastAutomaton { } new_automaton.spanning_set = new_spanning_set; new_automaton.remove_dead_transitions(); - Ok(new_automaton) + Ok(Cow::Owned(new_automaton)) } pub fn has_intersection(&self, other: &FastAutomaton) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 91493c7..f1681ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ pub mod execution_profile; pub mod fast_automaton; pub mod regex; pub mod tokenizer; +pub(crate) mod traits; type IntMap = HashMap>>; type IntSet = HashSet>>; @@ -73,33 +74,36 @@ impl Term { pub fn union(&self, terms: &[Term]) -> 
Result { Self::check_number_of_terms(terms)?; - let mut return_regex = RegularExpression::new_empty(); - let mut return_automaton = FastAutomaton::new_empty(); - match self { - Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone(); - } - Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.clone(); - } - } + let mut regex_list = Vec::with_capacity(terms.len()); + let mut automaton_list = Vec::with_capacity(terms.len()); for operand in terms { match operand { Term::RegularExpression(regex) => { - return_regex = return_regex.union(regex); - if return_regex.is_total() { + if regex.is_total() { return Ok(Term::RegularExpression(RegularExpression::new_total())); } + regex_list.push(regex); } Term::Automaton(automaton) => { - return_automaton = return_automaton.union(automaton)?; - if return_automaton.is_total() { + if automaton.is_total() { return Ok(Term::RegularExpression(RegularExpression::new_total())); } + automaton_list.push(automaton); } } } + let mut return_regex = RegularExpression::new_empty(); + let mut return_automaton = FastAutomaton::new_empty(); + match self { + Term::RegularExpression(regular_expression) => { + return_regex = regular_expression.union(®ex_list); + } + Term::Automaton(fast_automaton) => { + return_automaton = fast_automaton.union(&automaton_list)?; + } + } + if return_automaton.is_empty() { Ok(Term::RegularExpression(return_regex)) } else { @@ -138,7 +142,7 @@ impl Term { let mut return_automaton = self.get_automaton()?; for term in terms { let automaton = term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(&automaton)?); + return_automaton = Cow::Owned(return_automaton.intersection(automaton.as_ref())?); if return_automaton.is_empty() { return Ok(Term::RegularExpression(RegularExpression::new_empty())); } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 176612f..59be90b 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -138,14 +138,14 
@@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(concats) + FastAutomaton::concatenate(&concats) } RegularExpression::Alternation(alternation) => { - let mut concats = Vec::with_capacity(alternation.len()); + let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { - concats.push(c.to_automaton()?); + alternates.push(c.to_automaton()?); } - FastAutomaton::alternation(concats) + FastAutomaton::alternation(&alternates) } } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8f5c1ae..8508e4e 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -1,21 +1,40 @@ use std::collections::BTreeSet; +use crate::traits::MethodParameters; + use super::*; impl RegularExpression { - pub fn union(&self, other: &RegularExpression) -> RegularExpression { + pub fn union<'o, S>(&self, others: S) -> RegularExpression + where + S: MethodParameters<'o, RegularExpression>, + { + let mut result = Cow::Borrowed(self); + + for other in others.parameters() { + result = result.union_(other); + + if result.is_total() { + break; + } + } + + result.into_owned() + } + + fn union_<'a>(&self, other: &'a RegularExpression) -> Cow<'a, RegularExpression> { if self.is_total() || other.is_total() { - return RegularExpression::new_total(); + return Cow::Owned(RegularExpression::new_total()); } else if self.is_empty() { - return other.clone(); + return Cow::Borrowed(other); } else if other.is_empty() || self == other { - return self.clone(); + return Cow::Owned(self.clone()); } else if other.is_empty_string() { - return self.clone().repeat(0, Some(1)); + return Cow::Owned(self.repeat(0, Some(1))); } else if self.is_empty_string() { - return other.clone().repeat(0, Some(1)); + return Cow::Owned(other.repeat(0, Some(1))); } - match (self, other) { + Cow::Owned(match (self, other) { ( RegularExpression::Character(self_range), 
RegularExpression::Character(other_range), @@ -63,14 +82,14 @@ impl RegularExpression { Self::opunion_concat_and_alternation(other, self) } (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(_)) => { - let mut new_alternation = other.clone(); + let mut new_alternation = Cow::Borrowed(other); for self_element in self_elements { - new_alternation = new_alternation.union(self_element); + new_alternation = new_alternation.union_(self_element); } - new_alternation + new_alternation.into_owned() } - } + }) } fn opunion_character_and_repetition( @@ -116,17 +135,25 @@ impl RegularExpression { if prefix.is_none() && suffix.is_none() { let mut alternate_elements = vec![self_regex, other_regex]; alternate_elements.sort_unstable(); - RegularExpression::Alternation(alternate_elements) + Cow::Owned(RegularExpression::Alternation(alternate_elements)) } else { - self_regex.union(&other_regex) + self_regex.union_(&other_regex) } } else { - RegularExpression::Repetition(Box::new(self_regex), 0, Some(1)) + Cow::Owned(RegularExpression::Repetition( + Box::new(self_regex), + 0, + Some(1), + )) } } else if !other_regex.is_empty_string() { - RegularExpression::Repetition(Box::new(other_regex), 0, Some(1)) + Cow::Owned(RegularExpression::Repetition( + Box::new(other_regex), + 0, + Some(1), + )) } else { - RegularExpression::new_empty_string() + Cow::Owned(RegularExpression::new_empty_string()) }; regex = regex.concat(®ex_from_alternate, true); @@ -354,6 +381,11 @@ mod tests { #[test] fn test_union() -> Result<(), String> { assert_union("(a+|a+b)", "a+b?"); + assert_union("(a+|a*)", "a*"); + assert_union("(a?|a{0,2})", "a{0,2}"); + assert_union("(a{2,4}|a{1,3})", "a{1,4}"); + assert_union("(a{1,2}|a{3,4})", "a{1,4}"); + assert_union("(a{3,4}|a{1,2})", "a{1,4}"); Ok(()) } diff --git a/src/traits.rs b/src/traits.rs new file mode 100644 index 0000000..6254022 --- /dev/null +++ b/src/traits.rs @@ -0,0 +1,54 @@ +pub trait MethodParameters<'a, T: 'a> { + /// the 
iterator that yields `&'a T` + type Iter: Iterator; + fn parameters(self) -> Self::Iter; +} + +impl<'a, T> MethodParameters<'a, T> for &'a T { + type Iter = std::iter::Once<&'a T>; + fn parameters(self) -> Self::Iter { + std::iter::once(self) + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a [&'a T] { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a [T] { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a Vec { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T> MethodParameters<'a, T> for &'a Vec<&'a T> { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} + +impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [T; N] { + type Iter = std::slice::Iter<'a, T>; + fn parameters(self) -> Self::Iter { + self.iter() + } +} + +impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [&'a T; N] { + type Iter = std::iter::Copied>; + fn parameters(self) -> Self::Iter { + self.iter().copied() + } +} From 8abe57347434ad8f3343754a021bc1e312a7355d Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 6 Jul 2025 17:58:30 +0200 Subject: [PATCH 02/62] WIP --- src/fast_automaton/operation/alternation.rs | 33 +++++------- src/fast_automaton/operation/concatenate.rs | 43 ++++++---------- src/fast_automaton/operation/intersection.rs | 12 +++-- src/lib.rs | 36 ++++++++----- src/regex/mod.rs | 5 +- src/regex/operation/mod.rs | 10 ++++ src/regex/operation/union.rs | 12 +++-- src/traits.rs | 54 -------------------- 8 files changed, 81 insertions(+), 124 deletions(-) diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 90437d1..3daa9ca 100644 --- 
a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -2,36 +2,27 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::{error::EngineError, traits::MethodParameters}; +use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn union<'o, S>(&self, others: S) -> Result + pub fn union(&self, other: &FastAutomaton) -> Result { + Self::build_union([self, other]) + } + + pub fn union_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { - let mut result = self.clone(); - - for other in others.parameters() { - result.alternate(other)?; - - if result.is_total() { - break; - } - } - - Ok(result) + Self::build_union(std::iter::once(self).chain(others.into_iter())) } - pub fn alternation(automatons: &Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } + pub(crate) fn build_union<'a, I>(automatons: I) -> Result + where + I: IntoIterator, + { let mut new_automaton = FastAutomaton::new_empty(); - if automatons.is_empty() { - return Ok(new_automaton); - } for automaton in automatons { new_automaton.alternate(&automaton)?; } diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index e39642f..6de4339 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -2,38 +2,29 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; -use crate::{error::EngineError, traits::MethodParameters}; +use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn concatenation<'o, S>(&self, others: S) -> Result + pub fn concat(&self, other: &FastAutomaton) -> Result { + Self::build_concat([self, other]) + } + + pub fn concat_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { - let mut result = self.clone(); - 
- for other in others.parameters() { - result.concat(other)?; - - if result.is_total() { - break; - } - } - - Ok(result) + Self::build_concat(std::iter::once(self).chain(others.into_iter())) } - pub fn concatenate(automatons: &Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } + pub(crate) fn build_concat<'a, I>(automatons: I) -> Result + where + I: IntoIterator, + { let mut new_automaton = FastAutomaton::new_empty_string(); - if automatons.is_empty() { - return Ok(new_automaton); - } for automaton in automatons { - new_automaton.concat(&automaton)?; + new_automaton.concat_(&automaton)?; } Ok(new_automaton) @@ -80,7 +71,7 @@ impl FastAutomaton { let iter = if min == 0 { 0..0 } else { 0..min - 1 }; for _ in iter { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; } if max_opt.is_none() { @@ -116,7 +107,7 @@ impl FastAutomaton { if min == 0 { self.apply_model(&automaton_to_repeat); } else { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; } return Ok(()); @@ -124,7 +115,7 @@ impl FastAutomaton { let mut end_states = self.accept_states.iter().cloned().collect::>(); for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat(&automaton_to_repeat)?; + self.concat_(&automaton_to_repeat)?; end_states.extend(self.accept_states.iter()); } self.accept_states.extend(end_states); @@ -134,7 +125,7 @@ impl FastAutomaton { Ok(()) } - fn concat(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + fn concat_(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() { return Ok(()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 4483199..3bf07fd 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -2,18 +2,22 @@ use std::borrow::Cow; use condition::converter::ConditionConverter; -use crate::{error::EngineError, 
execution_profile::ThreadLocalParams, traits::MethodParameters}; +use crate::{error::EngineError, execution_profile::ThreadLocalParams}; use super::*; impl FastAutomaton { - pub fn intersection<'o, S>(&self, others: S) -> Result + pub fn intersection(&self, other: &FastAutomaton) -> Result { + self.intersection_all([other]) + } + + pub fn intersection_all<'a, I>(&'a self, others: I) -> Result where - S: MethodParameters<'o, FastAutomaton>, + I: IntoIterator, { let mut result = Cow::Borrowed(self); - for other in others.parameters() { + for other in others { result = result.intersection_(other)?; if result.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index f1681ee..359fb65 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,6 @@ pub mod execution_profile; pub mod fast_automaton; pub mod regex; pub mod tokenizer; -pub(crate) mod traits; type IntMap = HashMap>>; type IntSet = HashSet>>; @@ -80,13 +79,13 @@ impl Term { match operand { Term::RegularExpression(regex) => { if regex.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + return Ok(Term::new_total()); } regex_list.push(regex); } Term::Automaton(automaton) => { if automaton.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + return Ok(Term::new_total()); } automaton_list.push(automaton); } @@ -97,10 +96,10 @@ impl Term { let mut return_automaton = FastAutomaton::new_empty(); match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.union(®ex_list); + return_regex = regular_expression.union_all(regex_list); } Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.union(&automaton_list)?; + return_automaton = fast_automaton.union_all(automaton_list)?; } } @@ -139,19 +138,24 @@ impl Term { /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; - let mut return_automaton = self.get_automaton()?; - for term in terms { - let automaton = 
term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(automaton.as_ref())?); - if return_automaton.is_empty() { - return Ok(Term::RegularExpression(RegularExpression::new_empty())); + + let mut automaton_list = Vec::with_capacity(terms.len()); + for operand in terms { + let automaton = operand.get_automaton()?; + if automaton.is_empty() { + return Ok(Term::new_empty()); } + automaton_list.push(automaton); } + let return_automaton = self + .get_automaton()? + .intersection_all(automaton_list.iter().map(Cow::as_ref))?; + if let Some(regex) = return_automaton.to_regex() { Ok(Term::RegularExpression(regex)) } else { - Ok(Term::Automaton(return_automaton.into_owned())) + Ok(Term::Automaton(return_automaton)) } } @@ -322,6 +326,14 @@ impl Term { Term::Automaton(automaton) => Cow::Borrowed(automaton), }) } + + fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } } /// Represents details about a [Term]. 
diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 59be90b..c131d2b 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -138,14 +138,15 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(&concats) + println!("{:?}", concats); + FastAutomaton::build_concat(&concats) } RegularExpression::Alternation(alternation) => { let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { alternates.push(c.to_automaton()?); } - FastAutomaton::alternation(&alternates) + FastAutomaton::build_union(&alternates) } } } diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 2baa587..b01ac78 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -103,6 +103,8 @@ mod tests { assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); Ok(()) } @@ -201,6 +203,14 @@ mod tests { None, ); + assert_repeat_simplify( + &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + Ok(()) } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8508e4e..9589b4a 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -1,17 +1,19 @@ use std::collections::BTreeSet; -use crate::traits::MethodParameters; - use super::*; impl RegularExpression { - pub fn union<'o, S>(&self, others: S) -> RegularExpression + pub fn union(&self, other: &RegularExpression) -> RegularExpression { + self.union_all([other]) + } + + pub fn union_all<'a, I>(&'a self, others: I) -> RegularExpression where - S: MethodParameters<'o, RegularExpression>, + I: IntoIterator, { 
let mut result = Cow::Borrowed(self); - for other in others.parameters() { + for other in others { result = result.union_(other); if result.is_total() { diff --git a/src/traits.rs b/src/traits.rs index 6254022..e69de29 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1,54 +0,0 @@ -pub trait MethodParameters<'a, T: 'a> { - /// the iterator that yields `&'a T` - type Iter: Iterator; - fn parameters(self) -> Self::Iter; -} - -impl<'a, T> MethodParameters<'a, T> for &'a T { - type Iter = std::iter::Once<&'a T>; - fn parameters(self) -> Self::Iter { - std::iter::once(self) - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a [&'a T] { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a [T] { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a Vec { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T> MethodParameters<'a, T> for &'a Vec<&'a T> { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} - -impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [T; N] { - type Iter = std::slice::Iter<'a, T>; - fn parameters(self) -> Self::Iter { - self.iter() - } -} - -impl<'a, T, const N: usize> MethodParameters<'a, T> for &'a [&'a T; N] { - type Iter = std::iter::Copied>; - fn parameters(self) -> Self::Iter { - self.iter().copied() - } -} From 1e7ec951e8ea0d7bc17ac37f1bd8ea26cf7ad74e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:59:09 +0200 Subject: [PATCH 03/62] WIP --- Cargo.toml | 4 +- README.md | 114 ++++---- src/fast_automaton/builder.rs | 6 +- .../condition/fast_bit_vec/mod.rs | 2 +- src/fast_automaton/condition/mod.rs | 11 +- .../convert/to_regex/builder/mod.rs | 5 +- 
src/fast_automaton/convert/to_regex/mod.rs | 45 +-- src/fast_automaton/mod.rs | 17 +- src/fast_automaton/operation/alternation.rs | 6 +- src/fast_automaton/operation/concatenate.rs | 101 +------ src/fast_automaton/operation/intersection.rs | 4 +- src/fast_automaton/operation/mod.rs | 1 + src/fast_automaton/operation/repeat.rs | 107 +++++++ src/lib.rs | 264 ++++++++++++++---- src/regex/mod.rs | 19 +- src/regex/operation/mod.rs | 67 +---- src/regex/operation/repeat.rs | 67 +++++ src/regex/operation/union.rs | 3 +- src/regex/serializer.rs | 5 +- src/traits.rs | 0 20 files changed, 507 insertions(+), 341 deletions(-) create mode 100644 src/fast_automaton/operation/repeat.rs create mode 100644 src/regex/operation/repeat.rs delete mode 100644 src/traits.rs diff --git a/Cargo.toml b/Cargo.toml index cd03087..7147509 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "regexsolver" -version = "0.3.1" -edition = "2021" +version = "1.0.0" +edition = "2024" authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" diff --git a/README.md b/README.md index dcb0b47..2d2bff0 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,67 @@ -# RegexSolver +# RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) - -This repository contains the code of [RegexSolver](https://regexsolver.com/) engine. - -For more information, you can check the library's [documentation](https://docs.rs/regexsolver/latest/regexsolver/). 
- -If you want to use this library with other programming languages, we provide a wide range of wrappers: - -- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) -- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) -- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) - -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). + A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. + +Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. + +## Key Features +- **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. +- **Set Operations**: Concatenate, union, intersect, subtract, and repeat regex/automaton terms. +- **Analysis & Properties**: + - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. + - Check **equivalence** and **subset** relations between terms. +- **String Generation**: Generate example strings matching a term, for testing or sampling. +- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound cost and resource usage. 
## Installation - Add the following line in your `Cargo.toml`: - ```toml [dependencies] -regexsolver = "0.3" +regexsolver = "1" ``` - ## Examples -### Union - ```rust -use regexsolver::Term; - -let term1 = Term::from_regex("abc").unwrap(); -let term2 = Term::from_regex("de").unwrap(); -let term3 = Term::from_regex("fghi").unwrap(); - -let union = term1.union(&[term2, term3]).unwrap(); - -if let Term::RegularExpression(regex) = union { - println!("{}", regex.to_string()); // (abc|de|fghi) -} +// Create terms from regex +let t1 = Term::from_regex("abc.*")?; +let t2 = Term::from_regex(".*xyz")?; + +// Concatenate +let concat = t1.concat(&[t2])?; +assert_eq!(concat.to_string(), "abc.*xyz"); + +// Union +let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) +assert_eq!(union.to_string(), "(abc.*|fgh)"); + +// Intersection +let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy +assert_eq!(inter.to_string(), "(ab|xy)xy"); + +// Subtraction +let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; +assert_eq!(diff.to_string(), "a+"); + +// Repetition +let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} +assert_eq!(rep.to_string(), "(abc){2,4}"); + +// Analyze +let details = rep.get_details()?; +assert_eq!(details.get_length(), &(Some(6), Some(12))); +assert!(!details.is_empty()); + +// Generate examples +let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; +println!("Some matches: {:?}", samples); + +// Equivalence & subset +let a = Term::from_regex("a+")?; +let b = Term::from_regex("a*")?; +assert!(!a.are_equivalent(&b)?); +assert!(a.is_subset_of(&b)?); ``` -### Intersection - -```rust -use regexsolver::Term; - -let term1 = Term::from_regex("(abc|de){2}").unwrap(); -let term2 = Term::from_regex("de.*").unwrap(); -let term3 = Term::from_regex(".*abc").unwrap(); - -let intersection = term1.intersection(&[term2, term3]).unwrap(); - -if let 
Term::RegularExpression(regex) = intersection { - println!("{}", regex.to_string()); // deabc -} -``` - -### Difference/Subtraction - -```rust -use regexsolver::Term; - -let term1 = Term::from_regex("(abc|de)").unwrap(); -let term2 = Term::from_regex("de").unwrap(); - -let subtraction = term1.subtraction(&term2).unwrap(); - -if let Term::RegularExpression(regex) = subtraction { - println!("{}", regex.to_string()); // abc -} -``` +## Execution Profiles +By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap time, memory or term count: diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index b6cf50b..c597747 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -192,8 +192,7 @@ impl FastAutomaton { self.assert_state_exists(state); if self.start_state == state { panic!( - "Can not remove the state {}, it is still used as start state.", - state + "Can not remove the state {state}, it is still used as start state." ); } self.accept_states.remove(&state); @@ -228,8 +227,7 @@ impl FastAutomaton { for &state in states { if self.start_state == state { panic!( - "Can not remove the state {}, it is still used as start state.", - state + "Can not remove the state {state}, it is still used as start state." 
); } if self.transitions.len() - 1 == state { diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index bbf4376..82b0ead 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -8,7 +8,7 @@ impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { let bit = if self.get(i).unwrap() { 1 } else { 0 }; - write!(f, "{}", bit)?; + write!(f, "{bit}")?; } Ok(()) } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index da9c2b8..40415e3 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -151,7 +151,8 @@ impl Condition { Ok(self.to_range(spanning_set)?.get_cardinality()) } - pub fn get_bits(&self) -> Vec { + #[inline] + pub fn get_binary_representation(&self) -> Vec { self.0.get_bits() } } @@ -193,11 +194,11 @@ mod tests { let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_bits()); + assert_eq!(vec![false, false, false, false], empty.get_binary_representation()); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_bits()); + assert_eq!(vec![true, true, true, true], total.get_binary_representation()); assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); @@ -225,13 +226,13 @@ mod tests { empty, Condition::from_range(&Range::empty(), &spanning_set).unwrap() ); - assert_eq!(vec![false], empty.get_bits()); + assert_eq!(vec![false], empty.get_binary_representation()); assert_eq!( total, Condition::from_range(&Range::total(), &spanning_set).unwrap() ); - assert_eq!(vec![true], total.get_bits()); + assert_eq!(vec![true], 
total.get_binary_representation()); assert_eq!(empty, total.complement()); assert_eq!(total, empty.complement()); diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/builder/mod.rs index b6c8dd5..648f733 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/builder/mod.rs @@ -85,7 +85,7 @@ impl StateEliminationAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } @@ -124,8 +124,7 @@ impl StateEliminationAutomaton { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( - "Can not remove the state {}, it is still used as start state or accept state.", - state + "Can not remove the state {state}, it is still used as start state or accept state." ); } self.transitions_in.remove(&state); diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index d9a1dd0..2d84ff8 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,5 +1,5 @@ use std::{ - collections::{hash_map::Entry, VecDeque}, + collections::{VecDeque, hash_map::Entry}, fmt::Display, }; @@ -56,7 +56,7 @@ impl StateEliminationAutomaton { #[allow(dead_code)] #[inline] pub fn to_dot(&self) { - println!("{}", self); + println!("{self}"); } #[inline] @@ -68,8 +68,8 @@ impl StateEliminationAutomaton { let is_subgraph; let indent; let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{} {{", prefix)?; - writeln!(sb, "\t\tlabel = \"{} - cyclic={}\";", prefix, self.cyclic)?; + writeln!(sb, "\tsubgraph cluster_{prefix} {{")?; + writeln!(sb, "\t\tlabel = \"{prefix} - cyclic={}\";", self.cyclic)?; indent = "\t"; is_subgraph = true; prefix @@ -89,16 +89,16 @@ impl StateEliminationAutomaton { format!("S{from_state}") }; 
- write!(sb, "{indent}\t{}", from_state_with_prefix)?; + write!(sb, "{indent}\t{from_state_with_prefix}")?; if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "{indent}\t[shape=circle,label=\"{from_state}\"];")?; } if !is_subgraph && self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state_with_prefix)?; + writeln!(sb, "\tinitial -> {from_state_with_prefix}")?; } for (to_state, weight) in self.transitions_from_state_enumerate_iter(&from_state) { let to_state_with_prefix = if is_subgraph { @@ -117,23 +117,21 @@ impl StateEliminationAutomaton { state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; writeln!(sb)?; let subgraph_start_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.start_state + "S{subgraph_prefix}_{}", + state_elimination_automaton.start_state ); writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, subgraph_start_state + "{indent}\t{from_state_with_prefix} -> {subgraph_start_state} [label=\"ε\"]" )?; let subgraph_accept_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.accept_state + "S{subgraph_prefix}_{}", + state_elimination_automaton.accept_state ); writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - subgraph_accept_state, to_state_with_prefix + "{indent}\t{subgraph_accept_state} -> {to_state_with_prefix} [label=\"ε\"]" ) } GraphTransition::Weight(range) => { @@ -150,8 +148,7 @@ impl StateEliminationAutomaton { } GraphTransition::Epsilon => writeln!( sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, to_state_with_prefix + "{indent}\t{from_state_with_prefix} -> {to_state_with_prefix} [label=\"ε\"]" ), }?; } @@ -259,20 +256,26 @@ 
impl FastAutomaton { Ok(automaton) => match self.is_equivalent_of(&automaton) { Ok(result) => { if !result { - warn!("The automaton is not equivalent to the generated regex; automaton={}, regex={}", self, regex); + warn!( + "The automaton is not equivalent to the generated regex; automaton={self}, regex={regex}" + ); None } else { Some(regex) } } Err(err) => { - warn!("Engine error while checking for equivalence ({}); automaton={}, regex={}", err, self, regex); + warn!( + "Engine error while checking for equivalence ({err}); automaton={self}, regex={regex}" + ); None } }, Err(err) => { if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!("The generated regex cannot be converted to automaton to be checked for equivalence ({}); automaton={}, regex={}", err, self, regex); + warn!( + "The generated regex cannot be converted to automaton to be checked for equivalence ({err}); automaton={self}, regex={regex}" + ); } None } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 6d6fcbc..224b150 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -6,6 +6,7 @@ use spanning_set::SpanningSet; use std::collections::hash_map::Entry; use std::collections::VecDeque; use std::fmt::Display; +use crate::error::EngineError; use crate::{IntMap, IntSet}; @@ -40,23 +41,21 @@ impl Display for FastAutomaton { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; for from_state in self.transitions_iter() { - write!(sb, "\t{}", from_state)?; + write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; } if self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", 
from_state)?; + writeln!(sb, "\tinitial -> {from_state}")?; } for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { writeln!( sb, - "\t{} -> {} [label=\"{}\"]", - from_state, - to_state, + "\t{from_state} -> {to_state} [label=\"{}\"]", cond.to_range(&self.spanning_set) .expect("Cannot convert condition to range.") .to_regex() @@ -73,7 +72,7 @@ impl FastAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } @@ -292,7 +291,7 @@ impl FastAutomaton { #[inline] pub fn to_dot(&self) { - println!("{}", self); + println!("{self}"); } } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs index 3daa9ca..84c8749 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -15,7 +15,7 @@ impl FastAutomaton { where I: IntoIterator, { - Self::build_union(std::iter::once(self).chain(others.into_iter())) + Self::build_union(std::iter::once(self).chain(others)) } pub(crate) fn build_union<'a, I>(automatons: I) -> Result @@ -24,7 +24,7 @@ impl FastAutomaton { { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { - new_automaton.alternate(&automaton)?; + new_automaton.union_mut(automaton)?; } Ok(new_automaton) } @@ -136,7 +136,7 @@ impl FastAutomaton { * - the start states can't be merged if they have incoming edges * - the accept states can't be merged if they have outgoing edges */ - fn alternate(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + pub(crate) fn union_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() || self.is_total() { return Ok(()); } else if other.is_total() { diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 6de4339..1dbd644 100644 --- 
a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -15,7 +15,7 @@ impl FastAutomaton { where I: IntoIterator, { - Self::build_concat(std::iter::once(self).chain(others.into_iter())) + Self::build_concat(std::iter::once(self).chain(others)) } pub(crate) fn build_concat<'a, I>(automatons: I) -> Result @@ -24,108 +24,13 @@ impl FastAutomaton { { let mut new_automaton = FastAutomaton::new_empty_string(); for automaton in automatons { - new_automaton.concat_(&automaton)?; + new_automaton.concat_mut(automaton)?; } Ok(new_automaton) } - pub fn repeat(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { - if let Some(max) = max_opt { - if min > max { - self.make_empty(); - return Ok(()); - } - } - - let automaton_to_repeat = self.clone(); - - if min == 0 && self.in_degree(self.start_state) != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); - } - self.start_state = new_state; - - if max_opt.is_none() { - for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); - } - self.accept(self.start_state); - return Ok(()); - } - } - - if let Some(max) = max_opt { - if min <= 1 && max == 1 { - if min == 0 { - self.accept_states.insert(self.start_state); - } - return Ok(()); - } - } - - let iter = if min == 0 { 0..0 } else { 0..min - 1 }; - for _ in iter { - self.concat_(&automaton_to_repeat)?; - } - - if max_opt.is_none() { - let mut automaton_to_repeat = automaton_to_repeat.clone(); - - let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); - if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 - { - automaton_to_repeat.add_epsilon(accept_state, 
automaton_to_repeat.start_state); - let old_start_state = automaton_to_repeat.start_state; - automaton_to_repeat.start_state = accept_state; - automaton_to_repeat.remove_state(old_start_state); - } else { - let t = Self::transitions_from_state_set( - &automaton_to_repeat.transitions, - automaton_to_repeat.start_state, - ); - let transitions = - Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); - - for state in automaton_to_repeat.accept_states.clone() { - for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); - } - } - - automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); - } - automaton_to_repeat.cyclic = true; - - if min == 0 { - self.apply_model(&automaton_to_repeat); - } else { - self.concat_(&automaton_to_repeat)?; - } - - return Ok(()); - } - - let mut end_states = self.accept_states.iter().cloned().collect::>(); - for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat_(&automaton_to_repeat)?; - end_states.extend(self.accept_states.iter()); - } - self.accept_states.extend(end_states); - if min == 0 { - self.accept(self.start_state); - } - Ok(()) - } - - fn concat_(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { if other.is_empty() { return Ok(()); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 3bf07fd..2664aec 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -18,7 +18,7 @@ impl FastAutomaton { let mut result = Cow::Borrowed(self); for other in others { - result = result.intersection_(other)?; + result = result.intersection_internal(other)?; if result.is_empty() { break; @@ -28,7 +28,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - fn intersection_<'a>( + fn intersection_internal<'a>( &self, other: &'a FastAutomaton, 
) -> Result, EngineError> { diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 7c7c0f1..bf0523e 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -9,6 +9,7 @@ mod concatenate; mod determinize; mod intersection; mod subtraction; +mod repeat; impl FastAutomaton { pub fn remove_dead_transitions(&mut self) { diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs new file mode 100644 index 0000000..f451678 --- /dev/null +++ b/src/fast_automaton/operation/repeat.rs @@ -0,0 +1,107 @@ +use super::*; + +impl FastAutomaton { + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + let mut automaton = self.clone(); + if let Err(error) = automaton.repeat_mut(min, max_opt) { + Err(error) + } else { + Ok(automaton) + } + } + + pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { + if let Some(max) = max_opt { + if min > max { + self.make_empty(); + return Ok(()); + } + } + + let automaton_to_repeat = self.clone(); + + if min == 0 && self.in_degree(self.start_state) != 0 { + let new_state = self.new_state(); + if self.is_accepted(&self.start_state) { + self.accept(new_state); + } + + for to_state in self.transitions_from_state(&self.start_state) { + self.add_epsilon(new_state, to_state); + } + self.start_state = new_state; + + if max_opt.is_none() { + for accept_state in self.accept_states.clone() { + self.add_epsilon(accept_state, self.start_state); + } + self.accept(self.start_state); + return Ok(()); + } + } + + if let Some(max) = max_opt { + if min <= 1 && max == 1 { + if min == 0 { + self.accept_states.insert(self.start_state); + } + return Ok(()); + } + } + + let iter = if min == 0 { 0..0 } else { 0..min - 1 }; + for _ in iter { + self.concat_mut(&automaton_to_repeat)?; + } + + if max_opt.is_none() { + let mut automaton_to_repeat = automaton_to_repeat.clone(); + + let accept_state = 
*automaton_to_repeat.accept_states.iter().next().unwrap(); + if automaton_to_repeat.accept_states.len() == 1 + && automaton_to_repeat.out_degree(accept_state) == 0 + && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 + { + automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); + let old_start_state = automaton_to_repeat.start_state; + automaton_to_repeat.start_state = accept_state; + automaton_to_repeat.remove_state(old_start_state); + } else { + let t = Self::transitions_from_state_set( + &automaton_to_repeat.transitions, + automaton_to_repeat.start_state, + ); + let transitions = + Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); + + for state in automaton_to_repeat.accept_states.clone() { + for &(to_state, condition) in &transitions { + automaton_to_repeat.add_transition_to(state, *to_state, condition); + } + } + + automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); + } + automaton_to_repeat.cyclic = true; + + if min == 0 { + self.apply_model(&automaton_to_repeat); + } else { + self.concat_mut(&automaton_to_repeat)?; + } + + return Ok(()); + } + + let mut end_states = self.accept_states.iter().cloned().collect::>(); + for _ in cmp::max(min, 1)..max_opt.unwrap() { + self.concat_mut(&automaton_to_repeat)?; + end_states.extend(self.accept_states.iter()); + } + self.accept_states.extend(end_states); + if min == 0 { + self.accept(self.start_state); + } + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 359fb65..17f26c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,5 @@ use std::{ - borrow::Cow, - collections::{HashMap, HashSet}, - hash::BuildHasherDefault, + borrow::Cow, collections::{HashMap, HashSet}, fmt::Display, hash::BuildHasherDefault }; use cardinality::Cardinality; @@ -38,6 +36,15 @@ pub enum Term { Automaton(FastAutomaton), } +impl Display for Term { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + 
Term::RegularExpression(regular_expression) => write!(f, "{regular_expression}"), + Term::Automaton(fast_automaton) => write!(f, "{fast_automaton}"), + } + } +} + impl Term { /// Create a term based on the given pattern. /// @@ -52,7 +59,67 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(regex)?)) } - /// Compute the union of the given collection of terms. + /// Compute the concatenation of the current term with the given list of terms. + /// Returns the resulting term. + /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term1 = Term::from_regex("abc").unwrap(); + /// let term2 = Term::from_regex("d.").unwrap(); + /// let term3 = Term::from_regex(".*").unwrap(); + /// + /// let concat = term1.concat(&[term2, term3]).unwrap(); + /// + /// if let Term::RegularExpression(regex) = concat { + /// assert_eq!("abcd.+", regex.to_string()); + /// } + /// ``` + pub fn concat(&self, terms: &[Term]) -> Result { + Self::check_number_of_terms(terms)?; + + let mut return_regex = RegularExpression::new_empty(); + let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; + match self { + Term::RegularExpression(regular_expression) => { + return_regex = regular_expression.clone() + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = fast_automaton.clone(); + } + } + for term in terms { + if has_automaton { + return_automaton = return_automaton.concat(term.get_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.concat(regular_expression, true); + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.concat(fast_automaton)?; + } + } + } + } + + if !has_automaton { + Ok(Term::RegularExpression(return_regex)) + } else { + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) + } else { + 
Ok(Term::Automaton(return_automaton)) + } + } + } + + /// Compute the union of the current term with the given collection of terms. /// Returns the resulting term. /// /// # Example: @@ -73,52 +140,53 @@ impl Term { pub fn union(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; - let mut regex_list = Vec::with_capacity(terms.len()); - let mut automaton_list = Vec::with_capacity(terms.len()); - for operand in terms { - match operand { - Term::RegularExpression(regex) => { - if regex.is_total() { - return Ok(Term::new_total()); - } - regex_list.push(regex); - } - Term::Automaton(automaton) => { - if automaton.is_total() { - return Ok(Term::new_total()); - } - automaton_list.push(automaton); - } - } + if self.is_total() { + return Ok(Term::new_total()); } let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.union_all(regex_list); + return_regex = regular_expression.clone() } Term::Automaton(fast_automaton) => { - return_automaton = fast_automaton.union_all(automaton_list)?; + has_automaton = true; + return_automaton = fast_automaton.clone(); + } + } + for term in terms { + if term.is_total() { + return Ok(Term::new_total()); + } + if has_automaton { + return_automaton = return_automaton.union(term.get_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.union(regular_expression); + } + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.union(fast_automaton)?; + } + } } } - if return_automaton.is_empty() { + if !has_automaton { Ok(Term::RegularExpression(return_regex)) } else { - if !return_regex.is_empty() { - return_automaton = return_automaton.union(&return_regex.to_automaton()?)?; - } - - if let Some(regex) = 
return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } } } - /// Compute the intersection of the given collection of terms. + /// Compute the intersection of the current term with the given collection of terms. /// Returns the resulting term. /// /// # Example: @@ -139,27 +207,30 @@ impl Term { pub fn intersection(&self, terms: &[Term]) -> Result { Self::check_number_of_terms(terms)?; + if self.is_empty() { + return Ok(Term::new_empty()); + } + let mut automaton_list = Vec::with_capacity(terms.len()); - for operand in terms { - let automaton = operand.get_automaton()?; - if automaton.is_empty() { + for term in terms { + if term.is_empty() { return Ok(Term::new_empty()); } - automaton_list.push(automaton); + automaton_list.push(term.get_automaton()?); } let return_automaton = self .get_automaton()? .intersection_all(automaton_list.iter().map(Cow::as_ref))?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } } - /// Compute the subtraction/difference of the two given terms. + /// Compute the subtraction of the current term and the given `subtrahend`. /// Returns the resulting term. 
/// /// # Example: @@ -183,8 +254,8 @@ impl Term { Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } @@ -196,7 +267,45 @@ impl Term { self.subtraction(subtrahend) } - /// Returns the Details of the given term. + /// Returns the repetition of the current term, + /// between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_regex("abc").unwrap(); + /// + /// let repeat = term.repeat(1, None).unwrap(); + /// + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc)+", regex.to_string()); + /// } + /// + /// let repeat = term.repeat(3, Some(5)).unwrap(); + /// + /// if let Term::RegularExpression(regex) = repeat { + /// assert_eq!("(abc){3,5}", regex.to_string()); + /// } + /// ``` + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + match self { + Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( + regular_expression.repeat(min, max_opt), + )), + Term::Automaton(fast_automaton) => { + let repeat_automaton = fast_automaton.repeat(min, max_opt)?; + Ok(if let Some(repeat_regex) = repeat_automaton.to_regex() { + Term::RegularExpression(repeat_regex) + } else { + Term::Automaton(repeat_automaton) + }) + } + } + } + + /// Returns the details of the current term, including cardinality, length, and emptiness. /// /// # Example: /// @@ -250,7 +359,8 @@ impl Term { .collect()) } - /// Compute if the two given terms are equivalent. + /// Compute whether the current term and the given term are equivalent. 
+ /// Returns `true` if both terms accept the same language. /// /// # Example: /// @@ -272,7 +382,8 @@ impl Term { automaton_1.is_equivalent_of(&automaton_2) } - /// Compute if the first term is a subset of the second one. + /// Compute whether the current term is a subset of the given term. + /// Returns `true` if all strings matched by the current term are also matched by the given term. /// /// # Example: /// @@ -327,13 +438,31 @@ impl Term { }) } - fn new_empty() -> Self { + /// Create a term that matches the empty language. + pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) } - fn new_total() -> Self { + /// Create a term that matches all possible strings. + pub fn new_total() -> Self { Term::RegularExpression(RegularExpression::new_total()) } + + /// Check if the current term matches the empty language. + pub fn is_empty(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty(), + } + } + + /// Check if the current term matches all possible strings. + pub fn is_total(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_total(), + Term::Automaton(fast_automaton) => fast_automaton.is_total(), + } + } } /// Represents details about a [Term]. 
@@ -448,12 +577,45 @@ mod tests { } #[test] - fn test__() -> Result<(), String> { - let term = Term::from_regex("(abc|de){2}").unwrap(); - - let strings = term.generate_strings(3).unwrap(); - - println!("strings={:?}", strings); + fn test__() -> Result<(), EngineError> { + // Create terms from regex + let t1 = Term::from_regex("abc.*")?; + let t2 = Term::from_regex(".*xyz")?; + + // Concatenate + let concat = t1.concat(&[t2])?; + assert_eq!(concat.to_string(), "abc.*xyz"); + + // Union + let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) + assert_eq!(union.to_string(), "(abc.*|fgh)"); + + // Intersection + let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy + assert_eq!(inter.to_string(), "(ab|xy)xy"); + + // Subtraction + let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; + assert_eq!(diff.to_string(), "a+"); + + // Repetition + let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} + assert_eq!(rep.to_string(), "(abc){2,4}"); + + // Analyze + let details = rep.get_details()?; + assert_eq!(details.get_length(), &(Some(6), Some(12))); + assert!(!details.is_empty()); + + // Generate examples + let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; + println!("Some matches: {:?}", samples); + + // Equivalence & subset + let a = Term::from_regex("a+")?; + let b = Term::from_regex("a*")?; + assert!(!a.are_equivalent(&b)?); + assert!(a.is_subset_of(&b)?); Ok(()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index c131d2b..59ceb29 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -44,21 +44,21 @@ impl Display for RegularExpression { multiplicator_part = String::from("?"); } else if let Some(max) = max_opt { if max == min { - multiplicator_part = format!("{{{}}}", max); + multiplicator_part = format!("{{{max}}}"); } else { - multiplicator_part = format!("{{{},{}}}", min, max); + multiplicator_part = format!("{{{min},{max}}}"); } } else { - 
multiplicator_part = format!("{{{},}}", min); + multiplicator_part = format!("{{{min},}}"); } match **regular_expression { RegularExpression::Repetition(_, _, _) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } RegularExpression::Concat(_) => { - format!("({}){}", regex_part, multiplicator_part) + format!("({regex_part}){multiplicator_part}") } - _ => format!("{}{}", regex_part, multiplicator_part), + _ => format!("{regex_part}{multiplicator_part}"), } } RegularExpression::Concat(concat) => { @@ -82,11 +82,11 @@ impl Display for RegularExpression { if alternation.len() == 1 { sb } else { - format!("({})", sb) + format!("({sb})") } } }; - write!(f, "{}", str) + write!(f, "{str}") } } @@ -130,7 +130,7 @@ impl RegularExpression { RegularExpression::Character(range) => FastAutomaton::make_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; - automaton.repeat(*min, *max_opt)?; + automaton.repeat_mut(*min, *max_opt)?; Ok(automaton) } RegularExpression::Concat(concat) => { @@ -138,7 +138,6 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - println!("{:?}", concats); FastAutomaton::build_concat(&concats) } RegularExpression::Alternation(alternation) => { diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index b01ac78..382c885 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -3,72 +3,7 @@ use super::*; mod concat; mod simplify; mod union; - -impl RegularExpression { - pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { - if self.is_total() { - return RegularExpression::new_total(); - } else if self.is_empty() { - return RegularExpression::new_empty(); - } else if self.is_empty_string() { - return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { - return 
RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { - return self.clone(); - } - } - - match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) - } else { - None - }; - - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) - } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } - } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), - } - } -} +mod repeat; #[cfg(test)] mod tests { diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs new file mode 100644 index 0000000..00b9685 --- /dev/null +++ b/src/regex/operation/repeat.rs @@ -0,0 +1,67 @@ +use super::*; + +impl RegularExpression { + pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + if self.is_total() { + return RegularExpression::new_total(); + } else if self.is_empty() { + return RegularExpression::new_empty(); + } else if self.is_empty_string() { + return Self::new_empty_string(); + } else if let 
Some(max) = max_opt { + if max < min || max == 0 { + return RegularExpression::new_empty_string(); + } else if min == 1 && max == 1 { + return self.clone(); + } + } + + match self { + RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { + let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { + Some(max * o_max) + } else { + None + }; + + let o_min = *o_min; + if let Some(o_max) = o_max_opt { + let o_max = *o_max; + if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { + RegularExpression::Repetition( + regular_expression.clone(), + min * o_min, + new_max, + ) + } else if o_min == o_max && o_min > 1 { + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } else { + let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); + if r > cmp::max(2, min) as f64 { + return RegularExpression::Repetition( + Box::new(self.clone()), + min, + max_opt, + ); + } + + RegularExpression::Repetition( + regular_expression.clone(), + min * o_min, + new_max, + ) + } + } else if o_max_opt.is_none() + || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) + || o_max_opt.is_some() && o_max_opt.unwrap() == 1 + || max_opt.is_none() && o_min == 0 + { + RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) + } else { + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } + } + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + } + } +} \ No newline at end of file diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 9589b4a..62789e6 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -116,8 +116,7 @@ impl RegularExpression { } } else { panic!( - "Not character and repetition {:?} {:?}", - this_character, that_repetition + "Not character and repetition {this_character:?} {that_repetition:?}" ) } } diff --git a/src/regex/serializer.rs b/src/regex/serializer.rs index 
83fd99f..0832756 100644 --- a/src/regex/serializer.rs +++ b/src/regex/serializer.rs @@ -16,10 +16,7 @@ impl<'de> serde::Deserialize<'de> for RegularExpression { where D: Deserializer<'de>, { - let regex_string = match String::deserialize(deserializer) { - Ok(str) => str, - Err(err) => return Err(err), - }; + let regex_string = String::deserialize(deserializer)?; match RegularExpression::new(®ex_string) { Ok(regex) => Ok(regex), Err(err) => Err(de::Error::custom(err.to_string())), diff --git a/src/traits.rs b/src/traits.rs deleted file mode 100644 index e69de29..0000000 From 9cf30a632129bfeb0c05d42526bfe0c083cf9e57 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 10 Jul 2025 22:17:34 +0200 Subject: [PATCH 04/62] WIP --- README.md | 74 +++- src/error/mod.rs | 23 +- src/execution_profile.rs | 382 +++++++++++-------- src/fast_automaton/convert/to_regex/mod.rs | 4 +- src/fast_automaton/generate.rs | 6 +- src/fast_automaton/operation/determinize.rs | 14 +- src/fast_automaton/operation/intersection.rs | 6 +- src/lib.rs | 119 +++--- src/regex/mod.rs | 8 +- 9 files changed, 378 insertions(+), 258 deletions(-) diff --git a/README.md b/README.md index 2d2bff0..040684f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Ideal for constraint solvers, code generators, test-case generators, and any use - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. - Check **equivalence** and **subset** relations between terms. - **String Generation**: Generate example strings matching a term, for testing or sampling. -- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound cost and resource usage. +- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. 
## Installation Add the following line in your `Cargo.toml`: @@ -23,45 +23,89 @@ regexsolver = "1" ## Examples ```rust +use regexsolver::Term; + // Create terms from regex -let t1 = Term::from_regex("abc.*")?; -let t2 = Term::from_regex(".*xyz")?; +let t1 = Term::from_regex("abc.*").unwrap(); +let t2 = Term::from_regex(".*xyz").unwrap(); // Concatenate -let concat = t1.concat(&[t2])?; +let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) +let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection -let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy +let inter = Term::from_regex("(ab|xy){2}") + .unwrap() + .intersection(&[Term::from_regex(".*xy").unwrap()]) + .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction -let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; +let diff = Term::from_regex("a*") + .unwrap() + .subtraction(&Term::from_regex("").unwrap()) + .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} +let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze -let details = rep.get_details()?; +let details = rep.get_details().unwrap(); assert_eq!(details.get_length(), &(Some(6), Some(12))); assert!(!details.is_empty()); // Generate examples -let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; +let samples = Term::from_regex("(x|y){1,3}") + .unwrap() + .generate_strings(5) + .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset -let a = Term::from_regex("a+")?; -let b = Term::from_regex("a*")?; -assert!(!a.are_equivalent(&b)?); -assert!(a.is_subset_of(&b)?); +let a = 
Term::from_regex("a+").unwrap(); +let b = Term::from_regex("a*").unwrap(); +assert!(!a.are_equivalent(&b).unwrap()); +assert!(a.is_subset_of(&b).unwrap()); ``` ## Execution Profiles -By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap time, memory or term count: +By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap execution time and maximum number of states in used automata. + +### Example: Limit the execution time +```rust +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; + +let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + +let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(5) // We set the limit (5ms) + .build(); + +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +}); +``` + +### Example: Limit the number of states +```rust +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; + +let term1 = Term::from_regex(".*abcdef.*").unwrap(); +let term2 = Term::from_regex(".*defabc.*").unwrap(); + +let execution_profile = ExecutionProfileBuilder::new() + .max_number_of_states(5) // We set the limit + .build(); + +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +}); +``` \ No newline at end of file diff --git a/src/error/mod.rs b/src/error/mod.rs index 6447ebe..91085b3 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -15,8 +15,6 @@ pub enum EngineError { AutomatonHasTooManyStates, /// The regular expression can not be parsed. RegexSyntaxError(String), - /// Too many terms are used in the operation. 
- TooMuchTerms(usize, usize), /// The provided range can not be built from the spanning set. ConditionInvalidRange, /// The provided index is out of bound of the condition. @@ -30,13 +28,21 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => write!(f, "The given automaton should be deterministic."), - EngineError::AutomatonHasTooManyStates => write!(f, "The automaton has too many states."), + EngineError::AutomatonShouldBeDeterministic => { + write!(f, "The given automaton should be deterministic.") + } + EngineError::AutomatonHasTooManyStates => { + write!(f, "The automaton has too many states.") + } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), - EngineError::TooMuchTerms(max, got) => write!(f, "Too many terms are used in this operation, the maximum allowed for your plan is {max} and you used {got}."), - EngineError::TokenError(err) => write!(f, "{err}."), - EngineError::ConditionInvalidRange => write!(f, "The provided range can not be built from the spanning set."), - EngineError::ConditionIndexOutOfBound => write!(f, "The provided index is out of bound of the condition."), + EngineError::TokenError(err) => write!(f, "{err}."), + EngineError::ConditionInvalidRange => write!( + f, + "The provided range can not be built from the spanning set." 
+ ), + EngineError::ConditionIndexOutOfBound => { + write!(f, "The provided index is out of bound of the condition.") + } } } } @@ -53,7 +59,6 @@ impl EngineError { EngineError::AutomatonShouldBeDeterministic => true, EngineError::AutomatonHasTooManyStates => false, EngineError::RegexSyntaxError(_) => false, - EngineError::TooMuchTerms(_, _) => false, EngineError::TokenError(_) => false, EngineError::ConditionInvalidRange => true, EngineError::ConditionIndexOutOfBound => true, diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 2ae8e2b..008cffb 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -4,102 +4,76 @@ use crate::error::EngineError; /// Hold settings about limitations and constraints of operations execution within the engine. /// -/// To apply the settings on the current thread you need to call the following function: -/// ``` -/// use regexsolver::execution_profile::{ExecutionProfile, ThreadLocalParams}; -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// -/// // Store the settings on the current thread. 
-/// ThreadLocalParams::init_profile(&execution_profile); -/// ``` -/// /// # Examples: /// /// ## Limiting the number of states /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); -/// -/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); -/// ``` +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// ## Limiting the number of terms -/// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// let term3 = Term::from_regex(".*hij.*").unwrap(); +/// let term1 = Term::from_regex(".*abcdef.*").unwrap(); +/// let term2 = Term::from_regex(".*defabc.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 2, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .max_number_of_states(5) +/// .build(); /// -/// assert_eq!(EngineError::TooMuchTerms(2,3), term1.intersection(&[term2, term3]).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +/// }); /// ``` /// /// ## Limiting the execution time /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, 
ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// use std::time::SystemTime; /// /// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: Some(SystemTime::now()), -/// execution_timeout: 1, -/// max_number_of_terms: 50, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .execution_timeout(5) // 5ms +/// .build(); /// -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(100).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +/// }); /// ``` +#[derive(Clone, Debug)] pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. - pub max_number_of_states: usize, + max_number_of_states: Option, /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - pub start_execution_time: Option, + start_execution_time: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - pub execution_timeout: u128, - /// The maximum number of terms that an operation can have. - pub max_number_of_terms: usize, + execution_timeout: Option, +} + +impl PartialEq for ExecutionProfile { + fn eq(&self, other: &ExecutionProfile) -> bool { + self.max_number_of_states == other.max_number_of_states + && self.execution_timeout == other.execution_timeout + } } impl ExecutionProfile { + pub fn get() -> ExecutionProfile { + ThreadLocalParams::get_execution_profile() + } + /// Assert that `execution_timeout` is not exceeded. 
/// - /// Return empty if `execution_timeout` is not exceeded or if `start_execution_time` is not set. + /// Return empty if `execution_timeout` is not exceeded. /// /// Return [`EngineError::OperationTimeOutError`] otherwise. - pub fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let Some(start) = self.start_execution_time { + pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { + if let (Some(start), Some(execution_timeout)) = + (self.start_execution_time, self.execution_timeout) + { let run_duration = SystemTime::now() .duration_since(start) .expect("Time went backwards") .as_millis(); - if run_duration > self.execution_timeout { + if run_duration > execution_timeout { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -108,32 +82,103 @@ impl ExecutionProfile { Ok(()) } } + + /// Assert that `max_number_of_states` is not exceeded. + /// + /// Return empty if `max_number_of_states` is not exceeded. + /// + /// Return [`EngineError::AutomatonHasTooManyStates`] otherwise. 
+ pub(crate) fn assert_max_number_of_states( + &self, + number_of_states: usize, + ) -> Result<(), EngineError> { + if let Some(max_number_of_states) = self.max_number_of_states { + if number_of_states >= max_number_of_states { + return Err(EngineError::AutomatonHasTooManyStates); + } + } + Ok(()) + } + + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn with_max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn set(&self) -> &Self { + self + } + + pub fn run(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + let mut execution_profile = self.clone(); + execution_profile.start_execution_time = Some(SystemTime::now()); + + ThreadLocalParams::set_execution_profile(&execution_profile); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } -/// Hold [`ExecutionProfile`] on the current thread. -/// -/// The default [`ExecutionProfile`] is the following: -/// ``` -/// use regexsolver::execution_profile::ExecutionProfile; -/// -/// ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1500, -/// max_number_of_terms: 50, -/// }; -/// ``` -pub struct ThreadLocalParams; +pub struct ExecutionProfileBuilder { + /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. + max_number_of_states: Option, + /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. 
+ execution_timeout: Option, +} +impl Default for ExecutionProfileBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ExecutionProfileBuilder { + pub fn new() -> Self { + Self { + max_number_of_states: None, + execution_timeout: None, + } + } + + pub fn execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + pub fn max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + pub fn build(self) -> ExecutionProfile { + ExecutionProfile { + max_number_of_states: self.max_number_of_states, + execution_timeout: self.execution_timeout, + start_execution_time: None, + } + } +} + +struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { - static MAX_NUMBER_OF_STATES: RefCell = const { RefCell::new(8192) }; + static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell = const { RefCell::new(1500) }; - static MAX_NUMBER_OF_TERMS: RefCell = const { RefCell::new(50) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; } /// Store on the current thread [`ExecutionProfile`]. 
- pub fn init_profile(profile: &ExecutionProfile) { + fn set_execution_profile(profile: &ExecutionProfile) { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| { *cell.borrow_mut() = profile.max_number_of_states; }); @@ -145,62 +190,64 @@ impl ThreadLocalParams { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { *cell.borrow_mut() = profile.execution_timeout; }); - - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| { - *cell.borrow_mut() = profile.max_number_of_terms; - }); } - pub fn get_max_number_of_states() -> usize { + fn get_max_number_of_states() -> Option { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - pub fn get_start_execution_time() -> Option { + fn get_start_execution_time() -> Option { ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) } - pub fn get_execution_timeout() -> u128 { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } - pub fn get_max_number_of_terms() -> usize { - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| *cell.borrow()) - } - /// Return the [`ExecutionProfile`] stored on the current thread. 
- pub fn get_execution_profile() -> ExecutionProfile { + fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), start_execution_time: Self::get_start_execution_time(), execution_timeout: Self::get_execution_timeout(), - max_number_of_terms: Self::get_max_number_of_terms(), } } } #[cfg(test)] mod tests { - use crate::{regex::RegularExpression, Term}; + use crate::{Term, regex::RegularExpression}; use super::*; #[test] - fn test_execution() -> Result<(), String> { - let execution_profile = ExecutionProfile { - max_number_of_states: 1, - start_execution_time: None, - execution_timeout: 1000, - max_number_of_terms: 10, - }; - ThreadLocalParams::init_profile(&execution_profile); + fn test_execution_get() -> Result<(), String> { + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(1000) + .max_number_of_states(8192) + .build(); + + execution_profile.run(|| { + assert_eq!(execution_profile, ExecutionProfile::get()); + }); - let regex = RegularExpression::new("test").unwrap(); + Ok(()) + } - assert!(regex.to_automaton().is_err()); - assert_eq!( - EngineError::AutomatonHasTooManyStates, - regex.to_automaton().unwrap_err() - ); + #[test] + fn test_execution() -> Result<(), String> { + ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build() + .run(|| { + let regex = RegularExpression::new("test").unwrap(); + + assert!(regex.to_automaton().is_err()); + assert_eq!( + EngineError::AutomatonHasTooManyStates, + regex.to_automaton().unwrap_err() + ); + }); Ok(()) } @@ -209,27 +256,26 @@ mod tests { fn test_execution_timeout_generate_strings() -> Result<(), String> { let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 10; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 
50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(100).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(100).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 50); + }); + Ok(()) } @@ -238,27 +284,26 @@ mod tests { let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 50; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.difference(&term2).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.difference(&term2).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + 
.as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 25); + }); + Ok(()) } @@ -267,27 +312,26 @@ mod tests { let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.intersection(&[term2]).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.intersection(&[term2]).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 50); + }); + Ok(()) } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 2d84ff8..17d539f 100644 --- 
a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -7,7 +7,7 @@ use ahash::{HashMapExt, HashSetExt}; use log::warn; use nohash_hasher::IntMap; -use crate::{error::EngineError, execution_profile::ThreadLocalParams, regex::RegularExpression}; +use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; use super::{FastAutomaton, IntSet, Range, State}; @@ -248,7 +248,7 @@ impl FastAutomaton { if self.is_empty() { return Some(RegularExpression::new_empty()); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); if let Ok(graph) = StateEliminationAutomaton::new(self) { if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { let regex = regex?; diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 638ba11..0efa0e3 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,6 +1,6 @@ use std::cmp; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use ahash::AHashSet; use super::*; @@ -13,7 +13,7 @@ impl FastAutomaton { let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let mut ranges_cache: AHashMap<&Condition, Range> = AHashMap::with_capacity(self.get_number_of_states()); @@ -98,7 +98,7 @@ mod tests { assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); assert_generate_strings( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", - 500 + 500, ); assert_generate_strings("[0-9]+[A-Z]*", 500); assert_generate_strings("a+(ba+)*", 200); diff --git a/src/fast_automaton/operation/determinize.rs 
b/src/fast_automaton/operation/determinize.rs index 3d4057b..1cf7a88 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,6 +1,6 @@ use ahash::HashMapExt; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -9,7 +9,7 @@ impl FastAutomaton { if self.deterministic { return Ok(self.clone()); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let ranges = self.get_ranges()?; @@ -125,9 +125,11 @@ mod tests { deterministic_automaton.get_number_of_states() ); assert!(deterministic_automaton.is_determinitic()); - assert!(automaton - .subtraction(&deterministic_automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&deterministic_automaton) + .unwrap() + .is_empty() + ); } } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 2664aec..a2b381e 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -39,7 +39,7 @@ impl FastAutomaton { } else if other.is_total() { return Ok(Cow::Owned(self.clone())); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -105,7 +105,7 @@ impl FastAutomaton { } else if self.is_total() || other.is_total() { return Ok(true); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); 
diff --git a/src/lib.rs b/src/lib.rs index 17f26c7..9c55997 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,10 +1,12 @@ use std::{ - borrow::Cow, collections::{HashMap, HashSet}, fmt::Display, hash::BuildHasherDefault + borrow::Cow, + collections::{HashMap, HashSet}, + fmt::Display, + hash::BuildHasherDefault, }; use cardinality::Cardinality; use error::EngineError; -use execution_profile::ThreadLocalParams; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; use regex::RegularExpression; @@ -78,8 +80,6 @@ impl Term { /// } /// ``` pub fn concat(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); let mut has_automaton = false; @@ -110,12 +110,10 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) + } else if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } } @@ -138,8 +136,6 @@ impl Term { /// } /// ``` pub fn union(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - if self.is_total() { return Ok(Term::new_total()); } @@ -177,12 +173,10 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) + } else if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) } else { - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } } @@ -205,8 +199,6 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - if self.is_empty() { return 
Ok(Term::new_empty()); } @@ -405,19 +397,6 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } - fn check_number_of_terms(terms: &[Term]) -> Result<(), EngineError> { - let number_of_terms = terms.len() + 1; - let max_number_of_terms = ThreadLocalParams::get_max_number_of_terms(); - if number_of_terms > max_number_of_terms { - Err(EngineError::TooMuchTerms( - max_number_of_terms, - number_of_terms, - )) - } else { - Ok(()) - } - } - fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -500,7 +479,7 @@ impl Details { #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{execution_profile::ExecutionProfileBuilder, regex::RegularExpression}; use super::*; @@ -577,45 +556,93 @@ mod tests { } #[test] - fn test__() -> Result<(), EngineError> { + fn test_readme_code_1() -> Result<(), String> { // Create terms from regex - let t1 = Term::from_regex("abc.*")?; - let t2 = Term::from_regex(".*xyz")?; + let t1 = Term::from_regex("abc.*").unwrap(); + let t2 = Term::from_regex(".*xyz").unwrap(); // Concatenate - let concat = t1.concat(&[t2])?; + let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union - let union = t1.union(&[Term::from_regex("fgh")?])?; // (abc.*|fgh) + let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection - let inter = Term::from_regex("(ab|xy){2}")?.intersection(&[Term::from_regex(".*xy")?])?; // (ab|xy)xy + let inter = Term::from_regex("(ab|xy){2}") + .unwrap() + .intersection(&[Term::from_regex(".*xy").unwrap()]) + .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction - let diff = Term::from_regex("a*")?.subtraction(&Term::from_regex("")?)?; + let diff = Term::from_regex("a*") + .unwrap() + .subtraction(&Term::from_regex("").unwrap()) + .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = 
Term::from_regex("abc")?.repeat(2, Some(4))?; // (abc){2,4} + let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze - let details = rep.get_details()?; + let details = rep.get_details().unwrap(); assert_eq!(details.get_length(), &(Some(6), Some(12))); assert!(!details.is_empty()); // Generate examples - let samples = Term::from_regex("(x|y){1,3}")?.generate_strings(5)?; + let samples = Term::from_regex("(x|y){1,3}") + .unwrap() + .generate_strings(5) + .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset - let a = Term::from_regex("a+")?; - let b = Term::from_regex("a*")?; - assert!(!a.are_equivalent(&b)?); - assert!(a.is_subset_of(&b)?); + let a = Term::from_regex("a+").unwrap(); + let b = Term::from_regex("a*").unwrap(); + assert!(!a.are_equivalent(&b).unwrap()); + assert!(a.is_subset_of(&b).unwrap()); + + Ok(()) + } + + #[test] + fn test_readme_code_2() -> Result<(), String> { + let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(5) // We set the limit (5ms) + .build(); + + // We run the operation with the defined limitation + execution_profile.run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(1000).unwrap_err() + ); + }); + + Ok(()) + } + + #[test] + fn test_readme_code_3() -> Result<(), String> { + let term1 = Term::from_regex(".*abcdef.*").unwrap(); + let term2 = Term::from_regex(".*defabc.*").unwrap(); + + let execution_profile = ExecutionProfileBuilder::new() + .max_number_of_states(5) // We set the limit + .build(); + + // We run the operation with the defined limitation + execution_profile.run(|| { + assert_eq!( + EngineError::AutomatonHasTooManyStates, + term1.intersection(&[term2]).unwrap_err() + ); + }); Ok(()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 59ceb29..4965cad 100644 --- a/src/regex/mod.rs +++ 
b/src/regex/mod.rs @@ -1,7 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::Range; -use execution_profile::ThreadLocalParams; +use crate::{Range, execution_profile::ExecutionProfile}; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -123,9 +122,8 @@ impl RegularExpression { } pub fn to_automaton(&self) -> Result { - if self.get_number_of_states_in_nfa() >= ThreadLocalParams::get_max_number_of_states() { - return Err(EngineError::AutomatonHasTooManyStates); - } + ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; + match self { RegularExpression::Character(range) => FastAutomaton::make_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { From 671f3a38f5e8301c78e0d9a5fd319c830f56901b Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 10 Jul 2025 22:19:21 +0200 Subject: [PATCH 05/62] WIP --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 040684f..6ad02c5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) - A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. + +A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. 
From 75c06b3ccc87d89b36d40f3cfb5d653e91087e61 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 27 Jul 2025 20:19:07 +0200 Subject: [PATCH 06/62] add parallel intersection --- Cargo.toml | 3 +- README.md | 27 ++-- src/error/mod.rs | 7 ++ src/execution_profile.rs | 26 ++++ src/fast_automaton/mod.rs | 11 ++ src/fast_automaton/operation/intersection.rs | 64 ++++++++-- src/lib.rs | 124 ++++++++----------- 7 files changed, 170 insertions(+), 92 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7147509..c691486 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ lazy_static = "1.4.0" regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } +rayon = "1.10.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } @@ -47,4 +48,4 @@ serde = [ [[bench]] name = "my_benchmark" -harness = false \ No newline at end of file +harness = false diff --git a/README.md b/README.md index 6ad02c5..3c89a0f 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. - -Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations at scale. +Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. ## Key Features - **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. @@ -15,6 +14,8 @@ Ideal for constraint solvers, code generators, test-case generators, and any use - **String Generation**: Generate example strings matching a term, for testing or sampling. - **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. 
+This library also exposes the `regex` and `fast_automaton` modules for advanced use, providing low-level APIs for direct pattern and automaton operations. + ## Installation Add the following line in your `Cargo.toml`: ```toml @@ -35,7 +36,7 @@ let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) +let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection @@ -53,13 +54,12 @@ let diff = Term::from_regex("a*") assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} +let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze -let details = rep.get_details().unwrap(); -assert_eq!(details.get_length(), &(Some(6), Some(12))); -assert!(!details.is_empty()); +assert_eq!(rep.get_length(), (Some(6), Some(12))); +assert!(!rep.is_empty()); // Generate examples let samples = Term::from_regex("(x|y){1,3}") @@ -76,7 +76,7 @@ assert!(a.is_subset_of(&b).unwrap()); ``` ## Execution Profiles -By default, all operations run without limits. For heavy or untrusted patterns, use an `ExecutionProfile` to cap execution time and maximum number of states in used automata. +By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. 
### Example: Limit the execution time ```rust @@ -109,4 +109,13 @@ let execution_profile = ExecutionProfileBuilder::new() execution_profile.run(|| { assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); }); -``` \ No newline at end of file +``` + +## Usage with other programming languages + +If you want to use this library with other programming languages, we provide a wide range of wrappers: +- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) +- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) +- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) + +For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). \ No newline at end of file diff --git a/src/error/mod.rs b/src/error/mod.rs index 91085b3..e88d1e1 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -21,6 +21,8 @@ pub enum EngineError { ConditionIndexOutOfBound, /// There is an error with one of the token. TokenError(TokenError), + /// Computing the cardinality of the provided automaton failed. + CannotComputeAutomatonCardinality, } impl fmt::Display for EngineError { @@ -43,6 +45,10 @@ impl fmt::Display for EngineError { EngineError::ConditionIndexOutOfBound => { write!(f, "The provided index is out of bound of the condition.") } + EngineError::CannotComputeAutomatonCardinality => write!( + f, + "Computing the cardinality of the provided automaton failed." 
+ ), } } } @@ -62,6 +68,7 @@ impl EngineError { EngineError::TokenError(_) => false, EngineError::ConditionInvalidRange => true, EngineError::ConditionIndexOutOfBound => true, + EngineError::CannotComputeAutomatonCardinality => false, } } } diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 008cffb..de3c485 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -55,6 +55,7 @@ impl PartialEq for ExecutionProfile { } impl ExecutionProfile { + /// Retrieve the current thread-local execution profile. pub fn get() -> ExecutionProfile { ThreadLocalParams::get_execution_profile() } @@ -114,6 +115,7 @@ impl ExecutionProfile { self } + /// Run the given closure with this profile at thread level, setting its start time to now. pub fn run(&self, f: F) -> R where F: FnOnce() -> R, @@ -128,6 +130,19 @@ impl ExecutionProfile { ThreadLocalParams::set_execution_profile(&initial_execution_profile); result } + + /// Like [`run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. 
+ pub fn apply(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + ThreadLocalParams::set_execution_profile(self); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } pub struct ExecutionProfileBuilder { @@ -220,6 +235,17 @@ mod tests { use super::*; + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } + #[test] fn test_execution_get() -> Result<(), String> { let execution_profile = ExecutionProfileBuilder::new() diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 224b150..a4da641 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -314,4 +314,15 @@ mod tests { assert!(automaton.is_total()); Ok(()) } + + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index a2b381e..1498d7f 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,24 +1,29 @@ use std::borrow::Cow; +use rayon::prelude::*; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ExecutionProfile}; +use crate::{ + error::EngineError, + execution_profile::{ExecutionProfile}, +}; use super::*; impl FastAutomaton { pub fn intersection(&self, other: &FastAutomaton) -> Result { - self.intersection_all([other]) + FastAutomaton::intersection_all([self, other]) } - pub fn intersection_all<'a, I>(&'a self, others: I) -> Result + pub fn intersection_all<'a, I>(automatons: I) -> Result where I: IntoIterator, { - let mut result = Cow::Borrowed(self); + let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); - for 
other in others { - result = result.intersection_internal(other)?; + for automaton in automatons { + result = result.intersection_internal(automaton)?; if result.is_empty() { break; @@ -28,6 +33,22 @@ impl FastAutomaton { Ok(result.into_owned()) } + pub fn intersection_all_par<'a, I>(others: I) -> Result + where + I: IntoParallelIterator, + { + let execution_profile = ExecutionProfile::get(); + + let total = FastAutomaton::new_total(); + + others.into_par_iter().cloned().map(Result::Ok).try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) + } + fn intersection_internal<'a>( &self, other: &'a FastAutomaton, @@ -182,7 +203,7 @@ impl FastAutomaton { #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { @@ -282,4 +303,33 @@ mod tests { assert!(intersection.match_string("avb@gmail.com")); Ok(()) } + + #[test] + fn test_intersection_par() -> Result<(), String> { + let c = 12; + let mut automaton_list = Vec::with_capacity(c); + + for i in 0..c { + automaton_list.push( + RegularExpression::new(&format!(".*{i}.*")) + .unwrap() + .to_automaton() + .unwrap(), + ) + } + + // FastAutomaton::intersection_all(automaton_list.iter().collect::>()); + + // 3.76 + // 4.47 + // 3.84 + + let _ = FastAutomaton::intersection_all_par(automaton_list.iter().collect::>()); + + // 0.59 + // 0.55 + // 0.53 + + Ok(()) + } } diff --git a/src/lib.rs b/src/lib.rs index 9c55997..232c602 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,11 +9,14 @@ use cardinality::Cardinality; use error::EngineError; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; +use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use 
crate::execution_profile::ExecutionProfile; + pub mod cardinality; pub mod error; pub mod execution_profile; @@ -199,21 +202,33 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - if self.is_empty() { + if self.is_empty() || terms.iter().any(|t| t.is_empty()) { return Ok(Term::new_empty()); } - let mut automaton_list = Vec::with_capacity(terms.len()); - for term in terms { - if term.is_empty() { - return Ok(Term::new_empty()); - } - automaton_list.push(term.get_automaton()?); - } + let parallel = terms.len() > 3; - let return_automaton = self - .get_automaton()? - .intersection_all(automaton_list.iter().map(Cow::as_ref))?; + let mut automaton_list = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.get_automaton())) + .collect::, _>>()? + } else { + terms + .iter() + .map(Term::get_automaton) + .collect::, _>>()? + }; + automaton_list.push(self.get_automaton()?); + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::intersection_all_par(automaton_list) + } else { + FastAutomaton::intersection_all(automaton_list) + }?; if let Some(return_regex) = return_automaton.to_regex() { Ok(Term::RegularExpression(return_regex)) @@ -297,39 +312,6 @@ impl Term { } } - /// Returns the details of the current term, including cardinality, length, and emptiness. 
- /// - /// # Example: - /// - /// ``` - /// use regexsolver::{Term, cardinality::Cardinality}; - /// - /// let term = Term::from_regex("(abc|de)").unwrap(); - /// - /// let details = term.get_details().unwrap(); - /// - /// assert_eq!(Some(Cardinality::Integer(2)), *details.get_cardinality()); - /// assert_eq!((Some(2), Some(3)), *details.get_length()); - /// assert!(!details.is_empty()); - /// assert!(!details.is_total()); - /// ``` - pub fn get_details(&self) -> Result { - match self { - Term::RegularExpression(regex) => Ok(Details { - cardinality: Some(regex.get_cardinality()), - length: regex.get_length(), - empty: regex.is_empty(), - total: regex.is_total(), - }), - Term::Automaton(automaton) => Ok(Details { - cardinality: automaton.get_cardinality(), - length: automaton.get_length(), - empty: automaton.is_empty(), - total: automaton.is_total(), - }), - } - } - /// Generate strings matched by the given term. /// /// # Example: @@ -442,38 +424,31 @@ impl Term { Term::Automaton(fast_automaton) => fast_automaton.is_total(), } } -} - -/// Represents details about a [Term]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", rename = "details"))] -pub struct Details { - cardinality: Option>, - length: (Option, Option), - empty: bool, - total: bool, -} - -impl Details { - /// Return the number of unique strings matched. - pub fn get_cardinality(&self) -> &Option> { - &self.cardinality - } - /// Return the minimum and the maximum length of matched strings. - pub fn get_length(&self) -> &(Option, Option) { - &self.length + pub fn get_length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.get_length(), + Term::Automaton(automaton) => automaton.get_length(), + } } - /// Return `true` if it does not match any string. 
- pub fn is_empty(&self) -> bool { - self.empty - } + pub fn get_cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::Automaton(automaton) => { + let cardinality = if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }; - /// Return `true` if it match all possible strings. - pub fn is_total(&self) -> bool { - self.total + if let Some(cardinality) = cardinality { + Ok(cardinality) + } else { + Err(EngineError::CannotComputeAutomatonCardinality) + } + } + } } } @@ -588,9 +563,8 @@ mod tests { assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze - let details = rep.get_details().unwrap(); - assert_eq!(details.get_length(), &(Some(6), Some(12))); - assert!(!details.is_empty()); + assert_eq!(rep.get_length(), (Some(6), Some(12))); + assert!(!rep.is_empty()); // Generate examples let samples = Term::from_regex("(x|y){1,3}") From b37ef6599e3cbc9769735874c4defe0d5949130c Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 28 Jul 2025 21:23:54 +0200 Subject: [PATCH 07/62] WIP --- src/execution_profile.rs | 2 +- src/fast_automaton/operation/alternation.rs | 42 ++++-- src/fast_automaton/operation/concatenate.rs | 11 +- src/fast_automaton/operation/intersection.rs | 13 +- src/lib.rs | 134 ++++++++++++------- src/regex/mod.rs | 4 +- src/regex/operation/union.rs | 8 +- 7 files changed, 140 insertions(+), 74 deletions(-) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index de3c485..86045d5 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -355,7 +355,7 @@ mod tests { .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 50); + assert!(run_duration <= execution_timeout_in_ms + 100); }); Ok(()) diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs 
index 84c8749..fe1ab80 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/alternation.rs @@ -1,24 +1,18 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; +use rayon::prelude::*; -use crate::error::EngineError; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { pub fn union(&self, other: &FastAutomaton) -> Result { - Self::build_union([self, other]) + Self::union_all([self, other]) } - pub fn union_all<'a, I>(&'a self, others: I) -> Result - where - I: IntoIterator, - { - Self::build_union(std::iter::once(self).chain(others)) - } - - pub(crate) fn build_union<'a, I>(automatons: I) -> Result + pub fn union_all<'a, I>(automatons: I) -> Result where I: IntoIterator, { @@ -29,6 +23,34 @@ impl FastAutomaton { Ok(new_automaton) } + pub fn union_all_par<'a, I>(automatons: I) -> Result + where + I: IntoParallelIterator, + { + let execution_profile = ExecutionProfile::get(); + + let empty = FastAutomaton::new_empty(); + + automatons.into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ).try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) + } + fn prepare_start_states( &mut self, other: &FastAutomaton, diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs index 1dbd644..b22ad2d 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concatenate.rs @@ -8,17 +8,10 @@ use super::*; impl FastAutomaton { pub fn concat(&self, other: &FastAutomaton) -> Result { - Self::build_concat([self, other]) + Self::concat_all([self, other]) } - pub fn concat_all<'a, I>(&'a self, others: I) -> Result - where - I: IntoIterator, - { - Self::build_concat(std::iter::once(self).chain(others)) - } - - pub(crate) fn 
build_concat<'a, I>(automatons: I) -> Result + pub fn concat_all<'a, I>(automatons: I) -> Result where I: IntoIterator, { diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 1498d7f..8f3e4a3 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - pub fn intersection_all_par<'a, I>(others: I) -> Result + pub fn intersection_all_par<'a, I>(automatons: I) -> Result where I: IntoParallelIterator, { @@ -41,7 +41,14 @@ impl FastAutomaton { let total = FastAutomaton::new_total(); - others.into_par_iter().cloned().map(Result::Ok).try_reduce( + automatons.into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( || total.clone(), |acc, next| { execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) @@ -306,7 +313,7 @@ mod tests { #[test] fn test_intersection_par() -> Result<(), String> { - let c = 12; + let c = 14; let mut automaton_list = Vec::with_capacity(c); for i in 0..c { diff --git a/src/lib.rs b/src/lib.rs index 232c602..3cf1c82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,43 +143,45 @@ impl Term { return Ok(Term::new_total()); } - let mut return_regex = RegularExpression::new_empty(); - let mut return_automaton = FastAutomaton::new_empty(); - let mut has_automaton = false; - match self { - Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone() - } - Term::Automaton(fast_automaton) => { - has_automaton = true; - return_automaton = fast_automaton.clone(); - } - } - for term in terms { - if term.is_total() { - return Ok(Term::new_total()); - } - if has_automaton { - return_automaton = return_automaton.union(term.get_automaton()?.as_ref())?; - } else { - match term { - Term::RegularExpression(regular_expression) => { - 
return_regex = return_regex.union(regular_expression); - } - Term::Automaton(fast_automaton) => { - has_automaton = true; - return_automaton = return_regex.to_automaton()?.union(fast_automaton)?; - } + let mut has_automaton = matches!(self, Term::Automaton(_)); + if !has_automaton { + for term in terms { + if term.is_total() { + return Ok(Term::new_total()); + } + if matches!(term, Term::Automaton(_)) { + has_automaton = true; + break; } } } - if !has_automaton { - Ok(Term::RegularExpression(return_regex)) - } else if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) + if has_automaton { + let parallel = terms.len() > 3; + + let automaton_list = self.get_automata(terms, parallel)?; + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + let return_automaton = if parallel { + FastAutomaton::union_all_par(automaton_list) + } else { + FastAutomaton::union_all(automaton_list) + }?; + + if let Some(return_regex) = return_automaton.to_regex() { + Ok(Term::RegularExpression(return_regex)) + } else { + Ok(Term::Automaton(return_automaton)) + } } else { - Ok(Term::Automaton(return_automaton)) + let regexes_list = self.get_regexes(terms)?; + + let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); + + Ok(Term::RegularExpression(RegularExpression::union_all( + regexes_list, + ))) } } @@ -208,19 +210,7 @@ impl Term { let parallel = terms.len() > 3; - let mut automaton_list = if parallel { - let execution_profile = ExecutionProfile::get(); - terms - .par_iter() - .map(|a| execution_profile.apply(|| a.get_automaton())) - .collect::, _>>()? - } else { - terms - .iter() - .map(Term::get_automaton) - .collect::, _>>()? 
- }; - automaton_list.push(self.get_automaton()?); + let automaton_list = self.get_automata(terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); @@ -392,6 +382,47 @@ impl Term { } } + fn get_automata<'a>( + &'a self, + terms: &'a [Term], + parallel: bool, + ) -> Result>, EngineError> { + let mut automaton_list = Vec::with_capacity(terms.len() + 1); + automaton_list.push(self.get_automaton()?); + + let mut terms_automata = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.get_automaton())) + .collect::, _>>() + } else { + terms + .iter() + .map(Term::get_automaton) + .collect::, _>>() + }?; + automaton_list.append(&mut terms_automata); + + Ok(automaton_list) + } + + fn get_regexes<'a>( + &'a self, + terms: &'a [Term], + ) -> Result>, EngineError> { + let mut regex_list = Vec::with_capacity(terms.len() + 1); + regex_list.push(self.get_regex()?); + + let mut terms_regexes = terms + .iter() + .map(Term::get_regex) + .collect::, _>>()?; + regex_list.append(&mut terms_regexes); + + Ok(regex_list) + } + fn get_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -399,6 +430,19 @@ impl Term { }) } + fn get_regex(&self) -> Result, EngineError> { + Ok(match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => { + if let Some(regex) = automaton.to_regex() { + Cow::Owned(regex) + } else { + todo!() + } + } + }) + } + /// Create a term that matches the empty language. 
pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 4965cad..848842f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -136,14 +136,14 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::build_concat(&concats) + FastAutomaton::concat_all(&concats) } RegularExpression::Alternation(alternation) => { let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { alternates.push(c.to_automaton()?); } - FastAutomaton::build_union(&alternates) + FastAutomaton::union_all(&alternates) } } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 62789e6..65b34f4 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -4,16 +4,16 @@ use super::*; impl RegularExpression { pub fn union(&self, other: &RegularExpression) -> RegularExpression { - self.union_all([other]) + Self::union_all([self, other]) } - pub fn union_all<'a, I>(&'a self, others: I) -> RegularExpression + pub fn union_all<'a, I>(regexes: I) -> RegularExpression where I: IntoIterator, { - let mut result = Cow::Borrowed(self); + let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); - for other in others { + for other in regexes { result = result.union_(other); if result.is_total() { From 3ea0dec5f7d25545a223138f9c50eaff5db43a46 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 29 Jul 2025 21:54:57 +0200 Subject: [PATCH 08/62] update readme --- README.md | 95 +++++++++++---- src/execution_profile.rs | 10 +- src/fast_automaton/analyze/mod.rs | 5 + src/lib.rs | 190 +++++++++++++++++------------- 4 files changed, 193 insertions(+), 107 deletions(-) diff --git a/README.md b/README.md index 3c89a0f..c6b28c5 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,18 @@ - # RegexSolver [![Crates.io 
Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -A high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. -Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. +**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. -## Key Features -- **Dual Representation**: Work interchangeably with regex syntax or compiled automata via the `Term` enum. -- **Set Operations**: Concatenate, union, intersect, subtract, and repeat regex/automaton terms. -- **Analysis & Properties**: - - Compute language **cardinality**, **length bounds**, **emptiness**, and **totality**. - - Check **equivalence** and **subset** relations between terms. -- **String Generation**: Generate example strings matching a term, for testing or sampling. -- **Performance & Tuning**: Pluggable `ExecutionProfile` to bound time and resource usage. +## Installation -This library also exposes the `regex` and `fast_automaton` modules for advanced use, providing low-level APIs for direct pattern and automaton operations. +Add to your `Cargo.toml`: -## Installation -Add the following line in your `Cargo.toml`: ```toml [dependencies] regexsolver = "1" ``` -## Examples + +## Example ```rust use regexsolver::Term; @@ -75,10 +65,54 @@ assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` -## Execution Profiles +## API + +### Term + +`Term` is an enum designed to represent either a regular expression pattern or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. 
It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. + +| Method | Return | Description | +| -------- | ------- | ------- | +| `Term::new_empty()` | `Term` | Create a term that matches the empty language. | +| `Term::new_total()` | `Term` | Create a term that matches all possible strings. | +| `Term::new_empty_string()` | `Term` | Create a term that only match the empty string `""`. | +| `Term::from_pattern(pattern: &str)` | `Result` | Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. | +| `Term::from_regex(regex: RegularExpression)` | `Term` | Create a new `Term` holding the provided `RegularExpression`. | +| `Term::from_automaton(automaton: FastAutomaton)` | `Term` | Create a new `Term` holding the provided `FastAutomaton`. | +| `self.concat(terms: &[Term])` | `Result` | Compute the concatenation of the given collection of terms. Returns the resulting term. | +| `self.union(terms: &[Term])` | `Result` | Compute the union of the given collection of terms. Returns the resulting term. | +| `self.intersection(terms: &[Term])` | `Result` | Compute the intersection of the given collection of terms. Returns the resulting term. | +| `self.subtraction(subtrahend: &Term)` | `Result` | Compute the subtraction/difference of the two given terms. Returns the resulting term. | +| `self.difference(subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | +| `self.repeat(min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `self.generate_strings(count: usize)` | `Result, EngineError>` | Generate the given count of strings matched by the given term. | +| `self.are_equivalent(term: &Term)` | `Result` | Compute whether the current term and the given term are equivalent. 
Returns `true` if both terms accept the same language. | +| `self.is_subset_of(term: &Term)` | `Result` | Compute whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | +| `self.is_empty()` | `bool` | Check if the current term matches the empty language. | +| `self.is_total()` | `bool` | Check if the current term matches all possible strings. | +| `self.is_empty_string()` | `bool` | Check if the current term only match the empty string `""`. | +| `self.get_length()` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `self.get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | + + +### FastAutomaton + +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used, not all automaton can be converted to a regular expression. + + + +### RegularExpression + +`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. + +## Error Handling + +## Bound Execution + By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. 
-### Example: Limit the execution time +### Time-Bounded Execution + ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; @@ -94,7 +128,8 @@ execution_profile.run(|| { }); ``` -### Example: Limit the number of states +### State-Limited Execution + ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; @@ -111,11 +146,31 @@ execution_profile.run(|| { }); ``` -## Usage with other programming languages + + +## Key Concepts & Limitations + +RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: + +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. +- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. +- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character . matches every possible unicode characters including the line feed (`\n`). +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. 
+- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. + +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing expressions. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. + +## Cross-Language Support + If you want to use this library with other programming languages, we provide a wide range of wrappers: - [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) - [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). \ No newline at end of file +For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). + +## License + +This project is licensed under the MIT License. 
diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 86045d5..708fbac 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -280,7 +280,7 @@ mod tests { #[test] fn test_execution_timeout_generate_strings() -> Result<(), String> { - let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 10; let start_time = SystemTime::now(); @@ -307,8 +307,8 @@ mod tests { #[test] fn test_execution_timeout_difference() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 50; let start_time = SystemTime::now(); @@ -335,8 +335,8 @@ mod tests { #[test] fn test_execution_timeout_intersection() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term1 = 
Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 56f0884..9340d7d 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -25,6 +25,11 @@ impl FastAutomaton { false } + #[inline] + pub fn is_empty_string(&self) -> bool { + self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.in_degree(self.start_state) == 0 + } + pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); diff --git a/src/lib.rs b/src/lib.rs index 3cf1c82..5bd5ea0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,17 +51,42 @@ impl Display for Term { } impl Term { - /// Create a term based on the given pattern. + /// Create a term that matches the empty language. + pub fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + /// Create a term that matches all possible strings. + pub fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } + + /// Create a term that only match the empty string `""`. 
+ pub fn new_empty_string() -> Self { + Term::RegularExpression(RegularExpression::new_empty_string()) + } + + /// Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex(".*abc.*").unwrap(); + /// let term = Term::from_pattern(".*abc.*").unwrap(); /// ``` - pub fn from_regex(regex: &str) -> Result { - Ok(Term::RegularExpression(RegularExpression::new(regex)?)) + pub fn from_pattern(pattern: &str) -> Result { + Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) + } + + /// Create a new `Term` holding the provided `RegularExpression`. + pub fn from_regex(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } + + /// Create a new `Term` holding the provided `FastAutomaton`. + pub fn from_automaton(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) } /// Compute the concatenation of the current term with the given list of terms. @@ -302,7 +327,7 @@ impl Term { } } - /// Generate strings matched by the given term. + /// Generate the given count of strings matched by the given term. /// /// # Example: /// @@ -369,6 +394,59 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } + + /// Check if the current term matches the empty language. + pub fn is_empty(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty(), + } + } + + /// Check if the current term matches all possible strings. + pub fn is_total(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_total(), + Term::Automaton(fast_automaton) => fast_automaton.is_total(), + } + } + + /// Check if the current term only match the empty string `""`. 
+ pub fn is_empty_string(&self) -> bool { + match self { + Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), + Term::Automaton(fast_automaton) => fast_automaton.is_empty_string(), + } + } + + /// Returns the minimum and maximum length of the possible matched strings. + pub fn get_length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.get_length(), + Term::Automaton(automaton) => automaton.get_length(), + } + } + + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + pub fn get_cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::Automaton(automaton) => { + let cardinality = if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }; + + if let Some(cardinality) = cardinality { + Ok(cardinality) + } else { + Err(EngineError::CannotComputeAutomatonCardinality) + } + } + } + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -442,58 +520,6 @@ impl Term { } }) } - - /// Create a term that matches the empty language. - pub fn new_empty() -> Self { - Term::RegularExpression(RegularExpression::new_empty()) - } - - /// Create a term that matches all possible strings. - pub fn new_total() -> Self { - Term::RegularExpression(RegularExpression::new_total()) - } - - /// Check if the current term matches the empty language. - pub fn is_empty(&self) -> bool { - match self { - Term::RegularExpression(regular_expression) => regular_expression.is_empty(), - Term::Automaton(fast_automaton) => fast_automaton.is_empty(), - } - } - - /// Check if the current term matches all possible strings. 
- pub fn is_total(&self) -> bool { - match self { - Term::RegularExpression(regular_expression) => regular_expression.is_total(), - Term::Automaton(fast_automaton) => fast_automaton.is_total(), - } - } - - pub fn get_length(&self) -> (Option, Option) { - match self { - Term::RegularExpression(regex) => regex.get_length(), - Term::Automaton(automaton) => automaton.get_length(), - } - } - - pub fn get_cardinality(&self) -> Result, EngineError> { - match self { - Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => { - let cardinality = if !automaton.is_determinitic() { - automaton.determinize()?.get_cardinality() - } else { - automaton.get_cardinality() - }; - - if let Some(cardinality) = cardinality { - Ok(cardinality) - } else { - Err(EngineError::CannotComputeAutomatonCardinality) - } - } - } - } } #[cfg(test)] @@ -504,8 +530,8 @@ mod tests { #[test] fn test_details() -> Result<(), String> { - let regex1 = Term::from_regex("a").unwrap(); - let regex2 = Term::from_regex("b").unwrap(); + let regex1 = Term::from_pattern("a").unwrap(); + let regex2 = Term::from_pattern("b").unwrap(); let details = regex1.intersection(&vec![regex2]); assert!(details.is_ok()); @@ -515,8 +541,8 @@ mod tests { #[test] fn test_subtraction_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("").unwrap(); let result = regex1.subtraction(®ex2); assert!(result.is_ok()); @@ -531,8 +557,8 @@ mod tests { #[test] fn test_subtraction_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); let result = regex1.subtraction(®ex2); assert!(result.is_ok()); @@ -547,21 +573,21 @@ mod tests { #[test] fn test_intersection_1() -> Result<(), 
String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("b*").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("b*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); let result = result.unwrap(); - assert_eq!(Term::from_regex("").unwrap(), result); + assert_eq!(Term::from_pattern("").unwrap(), result); Ok(()) } #[test] fn test_intersection_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); @@ -577,33 +603,33 @@ mod tests { #[test] fn test_readme_code_1() -> Result<(), String> { // Create terms from regex - let t1 = Term::from_regex("abc.*").unwrap(); - let t2 = Term::from_regex(".*xyz").unwrap(); + let t1 = Term::from_pattern("abc.*").unwrap(); + let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union - let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); // (abc.*|fgh) + let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection - let inter = Term::from_regex("(ab|xy){2}") + let inter = Term::from_pattern("(ab|xy){2}") .unwrap() - .intersection(&[Term::from_regex(".*xy").unwrap()]) + .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction - let diff = Term::from_regex("a*") + let diff = Term::from_pattern("a*") .unwrap() - .subtraction(&Term::from_regex("").unwrap()) + .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = 
Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} + let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze @@ -611,15 +637,15 @@ mod tests { assert!(!rep.is_empty()); // Generate examples - let samples = Term::from_regex("(x|y){1,3}") + let samples = Term::from_pattern("(x|y){1,3}") .unwrap() .generate_strings(5) .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset - let a = Term::from_regex("a+").unwrap(); - let b = Term::from_regex("a*").unwrap(); + let a = Term::from_pattern("a+").unwrap(); + let b = Term::from_pattern("a*").unwrap(); assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); @@ -628,7 +654,7 @@ mod tests { #[test] fn test_readme_code_2() -> Result<(), String> { - let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); + let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .execution_timeout(5) // We set the limit (5ms) @@ -647,8 +673,8 @@ mod tests { #[test] fn test_readme_code_3() -> Result<(), String> { - let term1 = Term::from_regex(".*abcdef.*").unwrap(); - let term2 = Term::from_regex(".*defabc.*").unwrap(); + let term1 = Term::from_pattern(".*abcdef.*").unwrap(); + let term2 = Term::from_pattern(".*defabc.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .max_number_of_states(5) // We set the limit From a47c77912c06b6080ef338242485f22658d30ab9 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 17:43:49 +0200 Subject: [PATCH 09/62] rename methods --- README.md | 206 +++++++++++++----- src/execution_profile.rs | 6 +- src/fast_automaton/analyze/cardinality.rs | 6 +- src/fast_automaton/analyze/length.rs | 4 +- src/fast_automaton/analyze/mod.rs | 8 +- src/fast_automaton/builder.rs | 123 ++++++----- 
src/fast_automaton/condition/converter.rs | 27 ++- src/fast_automaton/condition/mod.rs | 73 +++---- .../convert/to_regex/builder/mod.rs | 14 +- .../convert/to_regex/builder/scc.rs | 2 +- src/fast_automaton/convert/to_regex/mod.rs | 11 +- .../convert/to_regex/transform.rs | 2 +- src/fast_automaton/generate.rs | 6 +- src/fast_automaton/mod.rs | 191 ++++++++-------- .../operation/{concatenate.rs => concat.rs} | 16 +- src/fast_automaton/operation/determinize.rs | 4 +- src/fast_automaton/operation/intersection.rs | 28 +-- src/fast_automaton/operation/mod.rs | 8 +- src/fast_automaton/operation/repeat.rs | 16 +- src/fast_automaton/operation/subtraction.rs | 10 +- .../operation/{alternation.rs => union.rs} | 38 ++-- src/fast_automaton/spanning_set/mod.rs | 26 ++- src/lib.rs | 145 ++++++------ src/regex/builder.rs | 14 +- src/regex/mod.rs | 6 +- src/regex/operation/mod.rs | 40 ++-- src/tokenizer/embed_automaton.rs | 61 +++--- src/tokenizer/embed_regex.rs | 4 +- src/tokenizer/mod.rs | 8 +- src/tokenizer/range_tokenizer.rs | 14 +- 30 files changed, 607 insertions(+), 510 deletions(-) rename src/fast_automaton/operation/{concatenate.rs => concat.rs} (96%) rename src/fast_automaton/operation/{alternation.rs => union.rs} (87%) diff --git a/README.md b/README.md index c6b28c5..1385f96 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,20 @@ **RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. 
+## Table of Contents + + - [Installation](#installation) + - [Example](#example) + - [Key Concepts & Limitations](#key-concepts--limitations) + - [API](#api) + - [Term](#term) + - [FastAutomaton](#fastautomaton) + - [RegularExpression](#regularexpression) + - [Error Handling](#error-handling) + - [Bound Execution](#bound-execution) + - [Cross-Language Support](#cross-language-support) + - [License](#license) + ## Installation Add to your `Cargo.toml`: @@ -18,33 +32,33 @@ regexsolver = "1" use regexsolver::Term; // Create terms from regex -let t1 = Term::from_regex("abc.*").unwrap(); -let t2 = Term::from_regex(".*xyz").unwrap(); +let t1 = Term::from_pattern("abc.*").unwrap(); +let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); assert_eq!(concat.to_string(), "abc.*xyz"); // Union -let union = t1.union(&[Term::from_regex("fgh").unwrap()]).unwrap(); +let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); assert_eq!(union.to_string(), "(abc.*|fgh)"); // Intersection -let inter = Term::from_regex("(ab|xy){2}") +let inter = Term::from_pattern("(ab|xy){2}") .unwrap() - .intersection(&[Term::from_regex(".*xy").unwrap()]) + .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy assert_eq!(inter.to_string(), "(ab|xy)xy"); // Subtraction -let diff = Term::from_regex("a*") +let diff = Term::from_pattern("a*") .unwrap() - .subtraction(&Term::from_regex("").unwrap()) + .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); assert_eq!(diff.to_string(), "a+"); // Repetition -let rep = Term::from_regex("abc").unwrap().repeat(2, Some(4)).unwrap(); +let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze @@ -52,58 +66,161 @@ assert_eq!(rep.get_length(), (Some(6), Some(12))); assert!(!rep.is_empty()); // Generate examples -let samples = Term::from_regex("(x|y){1,3}") +let samples = Term::from_pattern("(x|y){1,3}") .unwrap()
.generate_strings(5) .unwrap(); println!("Some matches: {:?}", samples); // Equivalence & subset -let a = Term::from_regex("a+").unwrap(); -let b = Term::from_regex("a*").unwrap(); +let a = Term::from_pattern("a+").unwrap(); +let b = Term::from_pattern("a*").unwrap(); assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` +## Key Concepts & Limitations + +RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. +- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. +- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character `.` matches every possible unicode characters including the line feed (`\n`). +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. +- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. + +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. 
As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. + ## API ### Term -`Term` is an enum designed to represent either a regular expression pattern or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +#### Build +| Method | Return | Description | +| -------- | ------- | ------- | +| `new_empty()` | `Term` | Creates a term that matches the empty language. | +| `new_total()` | `Term` | Creates a term that matches all possible strings. | +| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. | +| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | +| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | +| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | + +#### Manipulate +| Method | Return | Description | +| -------- | ------- | ------- | +| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given collection of terms. Returns the resulting term.
| +| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given collection of terms. Returns the resulting term. | +| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given collection of terms. Returns the resulting term. | +| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the subtraction/difference of the two given terms. Returns the resulting term. | +| `difference(&self, subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | + +#### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `Term::new_empty()` | `Term` | Create a term that matches the empty language. | -| `Term::new_total()` | `Term` | Create a term that matches all possible strings. | -| `Term::new_empty_string()` | `Term` | Create a term that only match the empty string `""`. | -| `Term::from_pattern(pattern: &str)` | `Result` | Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. | -| `Term::from_regex(regex: RegularExpression)` | `Term` | Create a new `Term` holding the provided `RegularExpression`. | -| `Term::from_automaton(automaton: FastAutomaton)` | `Term` | Create a new `Term` holding the provided `FastAutomaton`. | -| `self.concat(terms: &[Term])` | `Result` | Compute the concatenation of the given collection of terms. Returns the resulting term. | -| `self.union(terms: &[Term])` | `Result` | Compute the union of the given collection of terms. Returns the resulting term. | -| `self.intersection(terms: &[Term])` | `Result` | Compute the intersection of the given collection of terms. Returns the resulting term. | -| `self.subtraction(subtrahend: &Term)` | `Result` | Compute the subtraction/difference of the two given terms. 
Returns the resulting term. | -| `self.difference(subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | -| `self.repeat(min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | -| `self.generate_strings(count: usize)` | `Result, EngineError>` | Generate the given count of strings matched by the given term. | -| `self.are_equivalent(term: &Term)` | `Result` | Compute whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | -| `self.is_subset_of(term: &Term)` | `Result` | Compute whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | -| `self.is_empty()` | `bool` | Check if the current term matches the empty language. | -| `self.is_total()` | `bool` | Check if the current term matches all possible strings. | -| `self.is_empty_string()` | `bool` | Check if the current term only match the empty string `""`. | -| `self.get_length()` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `self.get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | +| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | +| `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. 
| +| `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | +| `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | +| `is_empty_string(&self)` | `bool` | Checks if the current term only matches the empty string `""`. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `to_automaton(&self)` | `Result, EngineError>` | Converts the current `Term` to a `FastAutomaton`. | +| `to_regex(&self)` | `Option>` | Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. | ### FastAutomaton -`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used, not all automaton can be converted to a regular expression. +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression. + +When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: +```rust +Condition::from_range(&range, &spanning_set); +``` +where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: +1. Merge an existing spanning set with another: +```rust +let new_set = SpanningSet::merge(&old_set, &other_set); +``` + +2.
Recompute from a list of ranges: +```rust +let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); +``` + +After constructing `new_set`, apply it to the automaton: +```rust +fast_automaton.apply_new_spanning_set(&new_set); +``` + +This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). + +#### Build +| Method | Return | Description | +| -------- | ------- | ------- | +| `new_empty()` | `FastAutomaton` | Create an automaton that matches the empty language. | +| `new_total()` | `FastAutomaton` | Create an automaton that matches all possible strings. | +| `new_empty_string()` | `FastAutomaton` | Create an automaton that only match the empty string `""`. | +| `new_from_range(range: &CharRange)` | `Result` | Create an automaton that matches one of the characters in the provided `CharRange`. | +| `new_state(&mut self)` | `State` | Create a new state in the automaton and returns its identifier. | +| `accept(&mut self, state: State)` | | Make the automaton accept the provided state as a valid final state. | +| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. | +| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Create a new epsilon transition between the two provided states. | +| `remove_state(&mut self, state: State)` | | Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. | +| `remove_states(&mut self, states: &IntSet)` | | Remove the provided states from the automaton. 
Remove all the transitions they are connected to. Panic if one of the state is used as a start state. | +| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Apply the provided spanning set to the automaton and project all of its conditions on it. | + +#### Manipulate +| Method | Return | Description | +| -------- | ------- | ------- | +| `union(&self, other: &FastAutomaton)` | `Result` | | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | +| `concat(&self, other: &FastAutomaton)` | `Result` | | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | +| `determinize(&self)` | `Result` | | +| `intersection(&self, other: &FastAutomaton)` | `Result` | | +| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | +| `complement(&mut self)` | `Result<(), EngineError>` | | +| `subtraction(&self, other: &FastAutomaton)` | `Result` | | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | | + +#### Analyze +| Method | Return | Description | +| -------- | ------- | ------- | +| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator of the states of the automaton. | +| `all_states_vec(&self)` | `Vec` | Returns a vector containing the states of the automaton. | +| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over all states directly reachable from the given state in one transition. | +| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector containing all states directly reachable from the given state in one transition. 
| +| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions to the provided state. | +| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions from the provided state. | +| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator containing the transitions from the provided state. | +| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator containing the transitions from the provided state. | +| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator containing the transitions from the provided state. | +| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition between the two provided states. | +| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a reference to the directed transition's condition between the two provided states. | +| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&mut Condition>` | Get a mutable reference to the directed transition's condition between the two provided states. | +| `get_start_state(&self)` | `State` | Returns the start state of the automaton. | +| `get_accept_states(&self)` | `&IntSet` | Get a reference to the set of accept (final) states of the automaton. | +| `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | +| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given `state` is one of the automaton's accept states. | +| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | +| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the provided state. 
| +| `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | | ### RegularExpression -`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. +`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. ## Error Handling @@ -116,7 +233,7 @@ By default, all operations run without limits. For heavy or untrusted patterns, ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .execution_timeout(5) // We set the limit (5ms) @@ -133,8 +250,8 @@ execution_profile.run(|| { ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_regex(".*abcdef.*").unwrap(); -let term2 = Term::from_regex(".*defabc.*").unwrap(); +let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +let term2 = Term::from_pattern(".*defabc.*").unwrap(); let execution_profile = ExecutionProfileBuilder::new() .max_number_of_states(5) // We set the limit @@ -146,21 +263,6 @@ execution_profile.run(|| { }); ``` - - -## Key Concepts & Limitations - -RegexSolver supports a subset of regular expressions that adhere to the principles of 
regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: - -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". -- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. -- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. -- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character . matches every possible unicode characters including the line feed (`\n`). -- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. -- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. - -RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing expressions. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. 
- ## Cross-Language Support diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 708fbac..3ba3a33 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -10,8 +10,8 @@ use crate::error::EngineError; /// ``` /// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// let term1 = Term::from_regex(".*abcdef.*").unwrap(); -/// let term2 = Term::from_regex(".*defabc.*").unwrap(); +/// let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +/// let term2 = Term::from_pattern(".*defabc.*").unwrap(); /// /// let execution_profile = ExecutionProfileBuilder::new() /// .max_number_of_states(5) @@ -27,7 +27,7 @@ use crate::error::EngineError; /// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// use std::time::SystemTime; /// -/// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +/// let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); /// /// let execution_profile = ExecutionProfileBuilder::new() /// .execution_timeout(5) // 5ms diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 04ea226..a2d6d91 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,9 +65,9 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.transitions_vec() { + for from_state in &self.all_states_vec() { in_degree.entry(*from_state).or_insert(0); - for to_state in self.transitions_from_state_iter(from_state) { + for to_state in self.direct_states_iter(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.transitions_from_state_iter(&from_state) { + for to_state in 
self.direct_states_iter(&from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 70eccbd..c03ee80 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -26,7 +26,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states_iter(&state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -53,7 +53,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.transitions_from_state_iter(&state) { + for to_state in self.direct_states_iter(&state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 9340d7d..de49b73 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -27,15 +27,15 @@ impl FastAutomaton { #[inline] pub fn is_empty_string(&self) -> bool { - self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.in_degree(self.start_state) == 0 + self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 } pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.transitions_iter() { - for (to_state, transition) in self.transitions_from_state_enumerate_iter(&from_state) { - if transition.is_empty() { + for from_state in self.all_states_iter() { + for (condition, to_state) in self.transitions_from_iter(from_state) { + if condition.is_empty() { continue; } match states_map.entry(*to_state) { diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index c597747..d6b69f4 100644 --- 
a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,6 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Create an automaton that matches the empty language. #[inline] pub fn new_empty() -> Self { Self { @@ -19,6 +20,7 @@ impl FastAutomaton { } } + /// Create an automaton that only match the empty string `""`. #[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); @@ -26,26 +28,18 @@ impl FastAutomaton { automaton } + /// Create an automaton that matches all possible strings. #[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); automaton.spanning_set = SpanningSet::new_total(); automaton.accept(automaton.start_state); - automaton.add_transition_to(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); automaton } - #[inline] - pub fn make_empty(&mut self) { - self.apply_model(&Self::new_empty()) - } - - #[inline] - pub fn make_total(&mut self) { - self.apply_model(&Self::new_total()) - } - - pub fn make_from_range(range: &Range) -> Result { + /// Create an automaton that matches one of the characters in the provided `CharRange`. 
+ pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { return Ok(automaton); @@ -55,44 +49,12 @@ impl FastAutomaton { let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); let condition = Condition::from_range(range, &spanning_set)?; automaton.spanning_set = spanning_set; - automaton.add_transition_to(0, new_state, &condition); + automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); Ok(automaton) } - pub fn apply_new_spanning_set( - &mut self, - new_spanning_set: &SpanningSet, - ) -> Result<(), EngineError> { - if new_spanning_set == &self.spanning_set { - return Ok(()); - } - let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.transitions_vec() { - for to_state in self.transitions_from_state(from_state) { - match self.transitions[*from_state].entry(to_state) { - Entry::Occupied(mut o) => { - o.insert(condition_converter.convert(o.get())?); - } - Entry::Vacant(_) => {} - }; - } - } - self.spanning_set = new_spanning_set.clone(); - Ok(()) - } - - #[inline] - pub fn apply_model(&mut self, model: &FastAutomaton) { - self.transitions = model.transitions.clone(); - self.start_state = model.start_state; - self.accept_states = model.accept_states.clone(); - self.removed_states = model.removed_states.clone(); - self.spanning_set = model.spanning_set.clone(); - self.deterministic = model.deterministic; - self.cyclic = model.cyclic; - } - + /// Create a new state in the automaton and returns its identifier. #[inline] pub fn new_state(&mut self) -> State { if let Some(new_state) = self.removed_states.clone().iter().next() { @@ -104,13 +66,15 @@ impl FastAutomaton { } } + /// Make the automaton accept the provided state as a valid final state. 
#[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); self.accept_states.insert(state); } - pub fn add_transition_to(&mut self, from_state: State, to_state: State, new_cond: &Condition) { + /// Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. + pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { self.assert_state_exists(to_state); @@ -121,7 +85,7 @@ impl FastAutomaton { if self.deterministic { let mut deterministic = true; - for (state, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, state) in self.transitions_from_iter(from_state) { if state == &to_state { continue; } @@ -147,7 +111,8 @@ impl FastAutomaton { }; } - pub fn add_epsilon(&mut self, from_state: State, to_state: State) { + /// Create a new epsilon transition between the two provided states. + pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; } @@ -157,12 +122,12 @@ impl FastAutomaton { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_state_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self.transitions_from_into_iter(&to_state).collect(); - for (state, cond) in transitions_to { + for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (s, c) in self.transitions_from_state_enumerate_iter(&from_state) { + for (c, s) in self.transitions_from_iter(from_state) { if state == *s { continue; } @@ -188,12 +153,11 @@ impl FastAutomaton { } } + /// Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. 
pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { - panic!( - "Can not remove the state {state}, it is still used as start state." - ); + panic!("Can not remove the state {state}, it is still used as start state."); } self.accept_states.remove(&state); self.transitions_in.remove(&state); @@ -219,6 +183,7 @@ impl FastAutomaton { } } + /// Remove the provided states from the automaton. Remove all the transitions they are connected to. Panic if one of the state is used as a start state. pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); @@ -226,9 +191,7 @@ impl FastAutomaton { for &state in states { if self.start_state == state { - panic!( - "Can not remove the state {state}, it is still used as start state." - ); + panic!("Can not remove the state {state}, it is still used as start state."); } if self.transitions.len() - 1 == state { self.transitions.remove(state); @@ -259,6 +222,50 @@ impl FastAutomaton { } } } + + /// Apply the provided spanning set to the automaton and project all of its conditions on it. 
+ pub fn apply_new_spanning_set( + &mut self, + new_spanning_set: &SpanningSet, + ) -> Result<(), EngineError> { + if new_spanning_set == &self.spanning_set { + return Ok(()); + } + let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; + for from_state in &self.all_states_vec() { + for to_state in self.direct_states_vec(from_state) { + match self.transitions[*from_state].entry(to_state) { + Entry::Occupied(mut o) => { + o.insert(condition_converter.convert(o.get())?); + } + Entry::Vacant(_) => {} + }; + } + } + self.spanning_set = new_spanning_set.clone(); + Ok(()) + } + + #[inline] + pub(crate) fn make_empty(&mut self) { + self.apply_model(&Self::new_empty()) + } + + #[inline] + pub(crate) fn make_total(&mut self) { + self.apply_model(&Self::new_total()) + } + + #[inline] + pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { + self.transitions = model.transitions.clone(); + self.start_state = model.start_state; + self.accept_states = model.accept_states.clone(); + self.removed_states = model.removed_states.clone(); + self.spanning_set = model.spanning_set.clone(); + self.deterministic = model.deterministic; + self.cyclic = model.cyclic; + } } #[cfg(test)] diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 89bb123..503d6ce 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -86,17 +86,16 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::range::AnyRange}; - - use crate::Range; + use regex_charclass::{char::Char, irange::{range::AnyRange}}; + use crate::CharRange; use super::*; fn get_from_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + 
CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -104,11 +103,11 @@ mod tests { fn get_to_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -127,7 +126,7 @@ mod tests { let total = Condition::total(&from_spanning_set); assert!(converter.convert(&total).unwrap().is_total()); - let range = Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')); + let range = CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -138,7 +137,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); + let range = CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -149,7 +148,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_ranges(&[ + let range = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), AnyRange::from(Char::new('\u{9}')..=Char::new('\u{9}')), 
]); diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 40415e3..08a439b 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -1,10 +1,9 @@ use std::hash::Hash; -use crate::Range; use fast_bit_vec::FastBitVec; use regex_charclass::{char::Char, CharacterClass}; -use crate::error::EngineError; +use crate::{error::EngineError, CharRange}; use super::spanning_set::SpanningSet; pub mod converter; @@ -43,7 +42,7 @@ impl Condition { )) } - pub fn from_range(range: &Range, spanning_set: &SpanningSet) -> Result { + pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); } else if range.is_total() { @@ -69,8 +68,8 @@ impl Condition { Ok(cond) } - pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { - let mut range = Range::empty(); + pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { + let mut range = CharRange::empty(); for (i, base) in spanning_set .get_spanning_ranges_with_rest() @@ -166,25 +165,25 @@ mod tests { fn get_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) } - fn get_test_cases_range() -> Vec { + fn get_test_cases_range() -> Vec { vec![ - Range::empty(), - Range::total(), - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_ranges(&[ + CharRange::empty(), + CharRange::total(), + 
CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{2}')), AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), ]), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ] } @@ -200,16 +199,16 @@ mod tests { assert!(total.is_total()); assert_eq!(vec![true, true, true, true], total.get_binary_representation()); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); assert_eq!(empty, total.complement()); @@ -219,18 +218,18 @@ mod tests { let empty = Condition::empty(&spanning_set); let total = Condition::total(&spanning_set); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!(vec![false], empty.get_binary_representation()); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); 
assert_eq!(vec![true], total.get_binary_representation()); @@ -252,7 +251,7 @@ mod tests { Ok(()) } - fn assert_range_convertion_to_range(range: &Range, spanning_set: &SpanningSet) { + fn assert_range_convertion_to_range(range: &CharRange, spanning_set: &SpanningSet) { let condition = Condition::from_range(range, spanning_set).unwrap(); let range_from_condition = condition.to_range(spanning_set).unwrap(); assert_eq!(range, &range_from_condition); @@ -267,11 +266,11 @@ mod tests { let current_spanning_set = get_spanning_set(); let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); let condition_converter = @@ -296,7 +295,7 @@ mod tests { } fn assert_project_to( - range: &Range, + range: &CharRange, currently_used_spanning_set: &SpanningSet, newly_used_spanning_set: &SpanningSet, condition_converter: &ConditionConverter, @@ -348,8 +347,8 @@ mod tests { } fn assert_union_intersection_complement( - range_1: &Range, - range_2: &Range, + range_1: &CharRange, + range_2: &CharRange, used_characters: &SpanningSet, ) { let condition_1 = Condition::from_range(range_1, used_characters).unwrap(); @@ -378,14 +377,14 @@ mod tests { #[test] fn test_1() -> Result<(), String> { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), - 
Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), - Range::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), + CharRange::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), ]; let spanning_set = SpanningSet::compute_spanning_set(&ranges); println!("{:?}", spanning_set); - let range1 = Range::new_from_ranges(&[ + let range1 = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{9}')), AnyRange::from(Char::new('\u{B}')..=Char::new('\u{63}')), AnyRange::from(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), @@ -393,7 +392,7 @@ mod tests { let condition1 = Condition::from_range(&range1, &spanning_set).unwrap(); assert_eq!(range1, condition1.to_range(&spanning_set).unwrap()); - let range2 = Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); + let range2 = CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); let condition2 = Condition::from_range(&range2, &spanning_set).unwrap(); assert_eq!(range2, condition2.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/builder/mod.rs index 648f733..0790851 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/builder/mod.rs @@ -2,7 +2,7 @@ use super::*; mod scc; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { if automaton.is_empty() { return Ok(None); @@ -19,15 +19,15 @@ impl StateEliminationAutomaton { let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); - for from_state in automaton.transitions_iter() { + for from_state in automaton.all_states_iter() { let new_from_state = *states_map .entry(from_state) .or_insert_with(|| 
state_elimination_automaton.new_state()); - for (to_state, condition) in - automaton.transitions_from_state_enumerate_into_iter(&from_state) + for (condition, to_state) in + automaton.transitions_from_iter(from_state) { let new_to_state = *states_map - .entry(to_state) + .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); state_elimination_automaton.add_transition_to( @@ -93,7 +93,7 @@ impl StateEliminationAutomaton { &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: GraphTransition, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -163,7 +163,7 @@ impl StateEliminationAutomaton { self.transitions[from_state].remove(&to_state); } - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { + pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { self.transitions.get(from_state)?.get(&to_state) } } diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs index 815188a..c99cbc5 100644 --- a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ b/src/fast_automaton/convert/to_regex/builder/scc.rs @@ -1,6 +1,6 @@ use super::*; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { let mut index = 0; let mut stack = Vec::new(); diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 17d539f..2469d03 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -5,11 +5,10 @@ use std::{ use ahash::{HashMapExt, HashSetExt}; use log::warn; -use nohash_hasher::IntMap; use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; -use super::{FastAutomaton, IntSet, Range, State}; +use super::*; mod builder; mod transform; @@ -45,13 
+44,13 @@ struct StateEliminationAutomaton { cyclic: bool, } -impl Display for StateEliminationAutomaton { +impl Display for StateEliminationAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.to_graph_dot(sb, None) } } -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { //#[cfg(test)] #[allow(dead_code)] #[inline] @@ -165,7 +164,7 @@ impl StateEliminationAutomaton { pub fn transitions_from_state_enumerate_iter( &self, from_state: &State, - ) -> impl Iterator)> { + ) -> impl Iterator)> { self.transitions[*from_state] .iter() .filter(|s| !self.removed_states.contains(s.0)) @@ -180,7 +179,7 @@ impl StateEliminationAutomaton { .collect() } - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { + pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs index aaeca76..4498578 100644 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ b/src/fast_automaton/convert/to_regex/transform.rs @@ -4,7 +4,7 @@ use crate::execution_profile::ExecutionProfile; use super::*; -impl StateEliminationAutomaton { +impl StateEliminationAutomaton { pub fn convert_to_regex( &self, execution_profile: &ExecutionProfile, diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 0efa0e3..7bbaf58 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -15,10 +15,10 @@ impl FastAutomaton { let execution_profile = ExecutionProfile::get(); - let mut ranges_cache: AHashMap<&Condition, Range> = + let mut ranges_cache: AHashMap<&Condition, CharRange> = AHashMap::with_capacity(self.get_number_of_states()); - let mut worklist: 
VecDeque<(Vec, usize)> = + let mut worklist: VecDeque<(Vec, usize)> = VecDeque::with_capacity(cmp::min(number, 1000)); let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); @@ -57,7 +57,7 @@ impl FastAutomaton { break; } } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&state) { + for (cond, to_state) in self.transitions_from_iter(state) { execution_profile.assert_not_timed_out()?; let range = match ranges_cache.entry(cond) { Entry::Occupied(o) => o.get().clone(), diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index a4da641..4b2475b 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -1,18 +1,25 @@ -use crate::Range; +use crate::error::EngineError; use ahash::{AHashMap, HashSetExt}; use condition::Condition; use regex_charclass::CharacterClass; use spanning_set::SpanningSet; -use std::collections::hash_map::Entry; use std::collections::VecDeque; +use std::collections::hash_map::Entry; use std::fmt::Display; -use crate::error::EngineError; -use crate::{IntMap, IntSet}; +use super::*; -pub(crate) type State = usize; pub(crate) type Transitions = IntMap; +/// The identifier of state in an [`FastAutomaton`] +pub type State = usize; + +/// A tuple containing the condition of a transition to a state. +pub type TransitionTo = (Condition, State); + +/// A tuple containing the condition of a transition from a state. 
+pub type TransitionFrom = (State, Condition); + mod analyze; mod builder; pub mod condition; @@ -40,7 +47,7 @@ impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; @@ -52,7 +59,7 @@ impl Display for FastAutomaton { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; writeln!(sb, "\tinitial -> {from_state}")?; } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { + for (cond, to_state) in self.transitions_from_iter(from_state) { writeln!( sb, "\t{from_state} -> {to_state} [label=\"{}\"]", @@ -76,80 +83,110 @@ impl FastAutomaton { } } + /// Returns the number of transitions to the provided state. #[inline] - pub fn in_degree(&self, state: State) -> usize { + pub fn state_in_degree(&self, state: State) -> usize { self.transitions_in .get(&state) .unwrap_or(&IntSet::new()) .len() } + /// Returns the number of transitions from the provided state. #[inline] - pub fn out_degree(&self, state: State) -> usize { + pub fn state_out_degree(&self, state: State) -> usize { self.transitions[state].len() } - pub fn in_transitions(&self, state: State) -> Vec<(usize, Condition)> { + /// Returns an iterator of the state of the automaton. + #[inline] + pub fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + /// Returns a vector containing the states of the automaton. + #[inline] + pub fn all_states_vec(&self) -> Vec { + self.all_states_iter().collect() + } + + /// Returns an iterator over all states directly reachable from the given state in one transition. 
+ #[inline] + pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { + self.transitions[*state] + .keys() + .cloned() + .filter(|s| !self.removed_states.contains(s)) + } + + /// Returns a vector containing all states directly reachable from the given state in one transition. + #[inline] + pub fn direct_states_vec(&self, state: &State) -> Vec { + self.direct_states_iter(state).collect() + } + + /// Returns a vector containing the transitions to the provided state. + pub fn transitions_to_vec(&self, state: State) -> Vec { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { - for (to_state, condition) in self.transitions_from_state_enumerate_vec(from_state) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { if to_state == state { in_transitions.push((*from_state, condition)); + break; } } } in_transitions } - pub fn in_states(&self, state: State) -> IntSet { - self.transitions_in - .get(&state) - .unwrap_or(&IntSet::new()) - .clone() - } - + /// Returns a vector containing the transitions from the provided state. #[inline] - pub fn transitions_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_vec(&self) -> Vec { - self.transitions_iter().collect() + pub fn transitions_from_vec(&self, state: State) -> Vec { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() } + /// Returns an iterator containing the transitions from the provided state. 
#[inline] - pub fn transitions_from_state_enumerate_iter( + pub fn transitions_from_iter( &self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] + state: State, + ) -> impl Iterator { + self.transitions[state] .iter() - .filter(|s| !self.removed_states.contains(s.0)) + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) } + /// Returns a mutable iterator containing the transitions from the provided state. #[inline] - pub fn transitions_from_state_enumerate_iter_mut( + pub fn transitions_from_iter_mut( &mut self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] + state: &State, + ) -> impl Iterator { + self.transitions[*state] .iter_mut() - .filter(|s| !self.removed_states.contains(s.0)) + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) } + /// Returns an owned iterator containing the transitions from the provided state. #[inline] - pub fn transitions_from_state_enumerate_vec( + pub fn transitions_from_into_iter( &self, - from_state: &State, - ) -> Vec<(State, Condition)> { - self.transitions[*from_state] - .iter() - .map(|(s, c)| (*s, c.clone())) - .filter(|s| !self.removed_states.contains(&s.0)) - .collect() + state: &State, + ) -> impl Iterator + '_ { + self.transitions[*state] + .clone() + .into_iter() + .map(|(s, c)| (c, s)) + .filter(|(_, state)| !self.removed_states.contains(state)) } + /// Returns `true` if there is a directed transition between the two provided states. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { @@ -172,91 +209,65 @@ impl FastAutomaton { .collect() } - #[inline] - pub fn transitions_from_state_enumerate_into_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions - .get(*from_state) // Assume transitions is a map; adjust accordingly. 
- .into_iter() // Creates an iterator over Option<&V> - .flat_map(|transitions| transitions.iter()) // Flattens into Iterator - .filter(move |(state, _)| !self.removed_states.contains(state)) // Filters out removed states - .map(|(state, condition)| (*state, condition.clone())) // Creates owned data; adjust if cloning is expensive - } - - #[inline] - pub fn transitions_from_state_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions[*from_state] - .keys() - .cloned() - .filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state(&self, from_state: &State) -> Vec { - self.transitions_from_state_iter(from_state).collect() - } - - #[inline] - pub fn transitions_from_state_into_iter<'a>( - &'a self, - from_state: &State, - ) -> impl Iterator + 'a { - self.transitions[*from_state] - .clone() - .into_iter() - .filter(|s| !self.removed_states.contains(&s.0)) - } - + // Returns the number of states in the automaton. #[inline] pub fn get_number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } + // Get a reference of the directed transtion's condition between the two provided states. #[inline] - pub fn get_condition(&self, from_state: &State, to_state: &State) -> Option<&Condition> { - self.transitions[*from_state].get(to_state) + pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { + self.transitions[from_state].get(&to_state) } + // Get a mutable reference of the directed transtion's condition between the two provided states. #[inline] - pub fn get_start_state(&self) -> State { - self.start_state + pub fn get_condition_mut( + &mut self, + from_state: State, + to_state: State, + ) -> Option<&mut Condition> { + self.transitions[from_state].get_mut(&to_state) } + /// Returns the start state of the automaton. 
#[inline] - pub fn get_removed_states(&self) -> &IntSet { - &self.removed_states + pub fn get_start_state(&self) -> State { + self.start_state } + // Get a reference to the set of accept (final) states of the automaton. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states } + /// Returns a reference to the automaton's spanning set. #[inline] pub fn get_spanning_set(&self) -> &SpanningSet { &self.spanning_set } + /// Returns `true` if the given `state` is one of the automaton's accept states. #[inline] pub fn is_accepted(&self, state: &State) -> bool { self.accept_states.contains(state) } + /// Returns `true` if the automaton is deterministic. #[inline] pub fn is_determinitic(&self) -> bool { self.deterministic } + /// Returns `true` if the automaton contains at least one cycle. #[inline] pub fn is_cyclic(&self) -> bool { self.cyclic } + /// Returns `true` if the automaton has the provided state. #[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) @@ -274,7 +285,7 @@ impl FastAutomaton { continue; } let curr_char = input.chars().nth(position).unwrap() as u32; - for (to_state, cond) in self.transitions_from_state_enumerate_iter(current_state) { + for (cond, to_state) in self.transitions_from_iter(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { if position + 1 == input.len() { if self.accept_states.contains(to_state) { diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concat.rs similarity index 96% rename from src/fast_automaton/operation/concatenate.rs rename to src/fast_automaton/operation/concat.rs index b22ad2d..71d97b0 100644 --- a/src/fast_automaton/operation/concatenate.rs +++ b/src/fast_automaton/operation/concat.rs @@ -11,9 +11,7 @@ impl FastAutomaton { Self::concat_all([self, other]) } - pub fn concat_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn 
concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); for automaton in automatons { @@ -41,12 +39,12 @@ impl FastAutomaton { BuildHasherDefault::default(), ); - let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 + let start_state_and_accept_states_not_mergeable = other.state_in_degree(other.start_state) > 0 && self .accept_states .iter() .cloned() - .any(|s| self.out_degree(s) > 0); + .any(|s| self.state_out_degree(s) > 0); let accept_states = self.accept_states.iter().cloned().collect::>(); @@ -67,7 +65,7 @@ impl FastAutomaton { } } - for from_state in other.transitions_iter() { + for from_state in other.all_states_iter() { let new_from_states = match new_states.entry(from_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -86,7 +84,7 @@ impl FastAutomaton { } }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { + for (condition, to_state) in other.transitions_from_iter(from_state) { let new_to_states = match new_states.entry(*to_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -107,7 +105,7 @@ impl FastAutomaton { let projected_condition = condition_converter.convert(condition)?; for new_from_state in new_from_states.iter() { for new_to_state in new_to_states.iter() { - self.add_transition_to( + self.add_transition( *new_from_state, *new_to_state, &projected_condition, @@ -120,7 +118,7 @@ impl FastAutomaton { if start_state_and_accept_states_not_mergeable { if let Some(&other_start_state) = new_states.get(&other.start_state) { for accept_state in &accept_states { - self.add_epsilon(*accept_state, other_start_state); + self.add_epsilon_transition(*accept_state, other_start_state); } } } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 1cf7a88..b0efb67 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ 
-39,7 +39,7 @@ impl FastAutomaton { for base in &ranges { for from_state in &states { - for (to_state, cond) in self.transitions_from_state_enumerate_iter(from_state) { + for (cond, to_state) in self.transitions_from_iter(*from_state) { if cond.has_intersection(base) { match new_states_to_add.binary_search(to_state) { Ok(_) => {} // element already in vector @ `pos` @@ -60,7 +60,7 @@ impl FastAutomaton { } }; - new_automaton.add_transition_to(r, q, base); + new_automaton.add_transition(r, q, base); } new_states_to_add.clear(); } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 8f3e4a3..5dac078 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -16,9 +16,7 @@ impl FastAutomaton { FastAutomaton::intersection_all([self, other]) } - pub fn intersection_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -33,9 +31,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - pub fn intersection_all_par<'a, I>(automatons: I) -> Result - where - I: IntoParallelIterator, + pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -102,8 +98,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -118,7 +114,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -168,8 +164,8 @@ 
impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -184,7 +180,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -195,11 +191,11 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_state_enumerate_iter(&state) - .map(|(&s, c)| match condition_converter.convert(c) { - Ok(condition) => Ok((s, condition)), + .transitions_from_iter(state) + .map(|(c, &s)| match condition_converter.convert(c) { + Ok(condition) => Ok((condition, s)), Err(err) => Err(err), }) .collect(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index bf0523e..a574a0e 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -4,20 +4,20 @@ use ahash::AHasher; use super::*; -mod alternation; -mod concatenate; +mod union; +mod concat; mod determinize; mod intersection; mod subtraction; mod repeat; impl FastAutomaton { - pub fn remove_dead_transitions(&mut self) { + pub(crate) fn remove_dead_transitions(&mut self) { if !self.is_empty() { let reacheable_states = self.get_reacheable_states(); let mut dead_states = IntSet::default(); - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { if !reacheable_states.contains(&from_state) { dead_states.insert(from_state); } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index f451678..2fd14db 100644 --- 
a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -20,20 +20,20 @@ impl FastAutomaton { let automaton_to_repeat = self.clone(); - if min == 0 && self.in_degree(self.start_state) != 0 { + if min == 0 && self.state_in_degree(self.start_state) != 0 { let new_state = self.new_state(); if self.is_accepted(&self.start_state) { self.accept(new_state); } - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); + for to_state in self.direct_states_vec(&self.start_state) { + self.add_epsilon_transition(new_state, to_state); } self.start_state = new_state; if max_opt.is_none() { for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); + self.add_epsilon_transition(accept_state, self.start_state); } self.accept(self.start_state); return Ok(()); @@ -59,10 +59,10 @@ impl FastAutomaton { let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 + && automaton_to_repeat.state_out_degree(accept_state) == 0 + && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 { - automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); + automaton_to_repeat.add_epsilon_transition(accept_state, automaton_to_repeat.start_state); let old_start_state = automaton_to_repeat.start_state; automaton_to_repeat.start_state = accept_state; automaton_to_repeat.remove_state(old_start_state); @@ -76,7 +76,7 @@ impl FastAutomaton { for state in automaton_to_repeat.accept_states.clone() { for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); + automaton_to_repeat.add_transition(state, *to_state, condition); } } diff --git a/src/fast_automaton/operation/subtraction.rs 
b/src/fast_automaton/operation/subtraction.rs index d513fbb..8d45ae7 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -17,9 +17,9 @@ impl FastAutomaton { ); let mut ranges = Vec::with_capacity(self.get_number_of_states()); - for from_state in self.transitions_iter() { + for from_state in self.all_states_iter() { let mut new_condition = Condition::empty(&self.spanning_set); - for (_, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, _) in self.transitions_from_iter(from_state) { new_condition = new_condition.union(condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -30,14 +30,14 @@ impl FastAutomaton { } for (from_state, condition) in &transitions_to_crash_state { - self.add_transition_to(*from_state, crash_state, condition); + self.add_transition(*from_state, crash_state, condition); ranges.push(condition.to_range(self.get_spanning_set())?); } let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); self.apply_new_spanning_set(&new_spanning_set)?; - if self.in_degree(crash_state) == 1 { + if self.state_in_degree(crash_state) == 1 { self.remove_state(crash_state); } Ok(()) @@ -47,7 +47,7 @@ impl FastAutomaton { self.totalize()?; let mut new_accept_states = IntSet::default(); - for state in self.transitions_iter() { + for state in self.all_states_iter() { if self.accept_states.contains(&state) { continue; } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/union.rs similarity index 87% rename from src/fast_automaton/operation/alternation.rs rename to src/fast_automaton/operation/union.rs index fe1ab80..ea9ad44 100644 --- a/src/fast_automaton/operation/alternation.rs +++ b/src/fast_automaton/operation/union.rs @@ -12,9 +12,7 @@ impl FastAutomaton { Self::union_all([self, other]) } - pub fn union_all<'a, I>(automatons: I) -> Result - where - I: IntoIterator, + pub fn union_all<'a, I: 
IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { @@ -23,9 +21,7 @@ impl FastAutomaton { Ok(new_automaton) } - pub fn union_all_par<'a, I>(automatons: I) -> Result - where - I: IntoParallelIterator, + pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -57,9 +53,9 @@ impl FastAutomaton { new_states: &mut IntMap, condition_converter: &ConditionConverter, ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.in_degree(self.start_state); - let other_start_state_in_degree = other.in_degree(other.start_state); + let mut imcomplete_states = IntSet::with_capacity(other.state_out_degree(other.start_state) + 1); + let self_start_state_in_degree = self.state_in_degree(self.start_state); + let other_start_state_in_degree = other.state_in_degree(other.start_state); if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { // The start states can be the same state without any consequence new_states.insert(other.start_state, self.start_state); @@ -71,9 +67,9 @@ impl FastAutomaton { self.accept(new_state); } - for (to_state, cond) in self.transitions_from_state_enumerate_vec(&self.start_state) + for (cond, to_state) in self.transitions_from_vec(self.start_state) { - self.add_transition_to(new_state, to_state, &cond); + self.add_transition(new_state, to_state, &cond); } self.start_state = new_state; } @@ -87,8 +83,8 @@ impl FastAutomaton { new_states.insert(other.start_state, new_state); imcomplete_states.insert(new_state); - for (other_to_state, cond) in - other.transitions_from_state_enumerate_vec(&other.start_state) + for (cond, other_to_state) in + other.transitions_from_vec(other.start_state) { let cond = condition_converter.convert(&cond)?; let to_state = match new_states.entry(other_to_state) { @@ -100,7 
+96,7 @@ impl FastAutomaton { new_state } }; - self.add_transition_to(self.start_state, to_state, &cond); + self.add_transition(self.start_state, to_state, &cond); } } } @@ -115,7 +111,7 @@ impl FastAutomaton { ) { let mut self_accept_states_without_outgoing_edges = vec![]; for &state in &self.accept_states { - if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { + if self.state_out_degree(state) == 0 && !imcomplete_states.contains(&state) { self_accept_states_without_outgoing_edges.push(state); } } @@ -127,8 +123,8 @@ impl FastAutomaton { self.accept(new_state); for &accept_state in &self_accept_states_without_outgoing_edges { - for (from_state, condition) in self.in_transitions(accept_state) { - self.add_transition_to(from_state, new_state, &condition); + for (from_state, condition) in self.transitions_to_vec(accept_state) { + self.add_transition(from_state, new_state, &condition); } self.remove_state(accept_state); } @@ -142,7 +138,7 @@ impl FastAutomaton { }; for &state in &other.accept_states { - if other.out_degree(state) == 0 { + if other.state_out_degree(state) == 0 { new_states .entry(state) .or_insert(accept_state_without_outgoing_edges); @@ -182,7 +178,7 @@ impl FastAutomaton { self.prepare_start_states(other, &mut new_states, &condition_converter)?; self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - for from_state in other.transitions_iter() { + for from_state in other.all_states_iter() { let new_from_state = match new_states.entry(from_state) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -191,7 +187,7 @@ impl FastAutomaton { new_state } }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { + for (condition, to_state) in other.transitions_from_iter(from_state) { let new_condition = condition_converter.convert(condition)?; let new_to_state = match new_states.entry(*to_state) { Entry::Occupied(o) => *o.get(), @@ -201,7 +197,7 @@ impl FastAutomaton { new_state } 
}; - self.add_transition_to(new_from_state, new_to_state, &new_condition); + self.add_transition(new_from_state, new_to_state, &new_condition); } } self.cyclic = self.cyclic || other.cyclic; diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 2aa2780..2e998b8 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -1,22 +1,24 @@ use std::slice::Iter; use ahash::AHashSet; -use regex_charclass::{char::Char, irange::RangeSet}; + #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -/// Contains a set of [`RangeSet`] that span all the transition of a [`crate::FastAutomaton`]. +use crate::CharRange; + +/// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] -pub struct SpanningSet(Vec>, RangeSet); +pub struct SpanningSet(Vec, CharRange); impl SpanningSet { pub fn new_empty() -> Self { - SpanningSet(vec![], RangeSet::total()) + SpanningSet(vec![], CharRange::total()) } pub fn new_total() -> Self { - SpanningSet(vec![RangeSet::total()], RangeSet::empty()) + SpanningSet(vec![CharRange::total()], CharRange::empty()) } pub fn is_empty(&self) -> bool { @@ -35,7 +37,7 @@ impl SpanningSet { } } - pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec> { + pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec { if self.1.is_empty() { self.0.clone() } else { @@ -45,7 +47,7 @@ impl SpanningSet { } } - pub fn get_spanning_ranges(&self) -> Iter> { + pub fn get_spanning_ranges(&self) -> Iter { self.0.iter() } @@ -53,11 +55,11 @@ impl SpanningSet { self.0.len() } - pub fn get_spanning_range(&self, i: usize) -> Option<&RangeSet> { + pub fn get_spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } - pub fn get_rest(&self) -> &RangeSet { + pub fn get_rest(&self) -> &CharRange { &self.1 } @@ -69,8 +71,8 @@ impl SpanningSet { 
Self::compute_spanning_set(&ranges) } - pub fn compute_spanning_set(ranges: &[RangeSet]) -> Self { - let mut spanning_ranges: Vec> = ranges.to_vec(); + pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { + let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); spanning_ranges.dedup(); @@ -105,7 +107,7 @@ impl SpanningSet { spanning_ranges.sort_unstable(); - let mut total = RangeSet::empty(); + let mut total = CharRange::empty(); for base in &spanning_ranges { total = total.union(base); } diff --git a/src/lib.rs b/src/lib.rs index 5bd5ea0..3de5f60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,9 +24,9 @@ pub mod fast_automaton; pub mod regex; pub mod tokenizer; -type IntMap = HashMap>>; -type IntSet = HashSet>>; -type Range = RangeSet; +pub type IntMap = HashMap>>; +pub type IntSet = HashSet>>; +pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. 
/// @@ -97,9 +97,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("d.").unwrap(); - /// let term3 = Term::from_regex(".*").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("d.").unwrap(); + /// let term3 = Term::from_pattern(".*").unwrap(); /// /// let concat = term1.concat(&[term2, term3]).unwrap(); /// @@ -122,7 +122,7 @@ impl Term { } for term in terms { if has_automaton { - return_automaton = return_automaton.concat(term.get_automaton()?.as_ref())?; + return_automaton = return_automaton.concat(term.to_automaton()?.as_ref())?; } else { match term { Term::RegularExpression(regular_expression) => { @@ -138,8 +138,6 @@ impl Term { if !has_automaton { Ok(Term::RegularExpression(return_regex)) - } else if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) } else { Ok(Term::Automaton(return_automaton)) } @@ -153,9 +151,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); - /// let term3 = Term::from_regex("fghi").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); + /// let term3 = Term::from_pattern("fghi").unwrap(); /// /// let union = term1.union(&[term2, term3]).unwrap(); /// @@ -194,13 +192,11 @@ impl Term { FastAutomaton::union_all(automaton_list) }?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } else { - let regexes_list = self.get_regexes(terms)?; + let regexes_list = self + .get_regexes(terms) + .expect("No automaton should be here so this operation is not supposed to fail."); let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); 
@@ -218,9 +214,9 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de){2}").unwrap(); - /// let term2 = Term::from_regex("de.*").unwrap(); - /// let term3 = Term::from_regex(".*abc").unwrap(); + /// let term1 = Term::from_pattern("(abc|de){2}").unwrap(); + /// let term2 = Term::from_pattern("de.*").unwrap(); + /// let term3 = Term::from_pattern(".*abc").unwrap(); /// /// let intersection = term1.intersection(&[term2, term3]).unwrap(); /// @@ -245,11 +241,7 @@ impl Term { FastAutomaton::intersection_all(automaton_list) }?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } /// Compute the subtraction of the current term and the given `subtrahend`. @@ -260,8 +252,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); /// /// let subtraction = term1.subtraction(&term2).unwrap(); /// @@ -270,17 +262,13 @@ impl Term { /// } /// ``` pub fn subtraction(&self, subtrahend: &Term) -> Result { - let minuend_automaton = self.get_automaton()?; - let subtrahend_automaton = subtrahend.get_automaton()?; + let minuend_automaton = self.to_automaton()?; + let subtrahend_automaton = subtrahend.to_automaton()?; let subtrahend_automaton = Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; - if let Some(return_regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(return_regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } /// See [`Self::subtraction`]. 
@@ -297,7 +285,7 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("abc").unwrap(); + /// let term = Term::from_pattern("abc").unwrap(); /// /// let repeat = term.repeat(1, None).unwrap(); /// @@ -318,11 +306,7 @@ impl Term { )), Term::Automaton(fast_automaton) => { let repeat_automaton = fast_automaton.repeat(min, max_opt)?; - Ok(if let Some(repeat_regex) = repeat_automaton.to_regex() { - Term::RegularExpression(repeat_regex) - } else { - Term::Automaton(repeat_automaton) - }) + Ok(Term::Automaton(repeat_automaton)) } } } @@ -334,7 +318,7 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("(abc|de){2}").unwrap(); + /// let term = Term::from_pattern("(abc|de){2}").unwrap(); /// /// let strings = term.generate_strings(3).unwrap(); /// @@ -342,7 +326,7 @@ impl Term { /// ``` pub fn generate_strings(&self, count: usize) -> Result, EngineError> { Ok(self - .get_automaton()? + .to_automaton()? .generate_strings(count)? 
.into_iter() .collect()) @@ -356,8 +340,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("(abc|de)*").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// /// assert!(!term1.are_equivalent(&term2).unwrap()); /// ``` @@ -366,8 +350,8 @@ impl Term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; + let automaton_1 = self.to_automaton()?; + let automaton_2 = that.to_automaton()?; automaton_1.is_equivalent_of(&automaton_2) } @@ -379,8 +363,8 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("de").unwrap(); - /// let term2 = Term::from_regex("(abc|de)").unwrap(); + /// let term1 = Term::from_pattern("de").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// /// assert!(term1.is_subset_of(&term2).unwrap()); /// ``` @@ -389,12 +373,11 @@ impl Term { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; + let automaton_1 = self.to_automaton()?; + let automaton_2 = that.to_automaton()?; automaton_1.is_subset_of(&automaton_2) } - /// Check if the current term matches the empty language. 
pub fn is_empty(&self) -> bool { match self { @@ -447,6 +430,20 @@ impl Term { } } + pub fn to_automaton(&self) -> Result, EngineError> { + Ok(match self { + Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), + Term::Automaton(automaton) => Cow::Borrowed(automaton), + }) + } + + pub fn to_regex(&self) -> Option> { + Some(match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()?), + }) + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -466,18 +463,18 @@ impl Term { parallel: bool, ) -> Result>, EngineError> { let mut automaton_list = Vec::with_capacity(terms.len() + 1); - automaton_list.push(self.get_automaton()?); + automaton_list.push(self.to_automaton()?); let mut terms_automata = if parallel { let execution_profile = ExecutionProfile::get(); terms .par_iter() - .map(|a| execution_profile.apply(|| a.get_automaton())) + .map(|a| execution_profile.apply(|| a.to_automaton())) .collect::, _>>() } else { terms .iter() - .map(Term::get_automaton) + .map(Term::to_automaton) .collect::, _>>() }?; automaton_list.append(&mut terms_automata); @@ -485,40 +482,17 @@ impl Term { Ok(automaton_list) } - fn get_regexes<'a>( - &'a self, - terms: &'a [Term], - ) -> Result>, EngineError> { + fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { let mut regex_list = Vec::with_capacity(terms.len() + 1); - regex_list.push(self.get_regex()?); + regex_list.push(self.to_regex()?); let mut terms_regexes = terms .iter() - .map(Term::get_regex) - .collect::, _>>()?; + .map(Term::to_regex) + .collect::>>()?; regex_list.append(&mut terms_regexes); - Ok(regex_list) - } - - fn get_automaton(&self) -> Result, EngineError> { - Ok(match self { - Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), - Term::Automaton(automaton) => Cow::Borrowed(automaton), - }) - } - - fn get_regex(&self) -> Result, EngineError> { - Ok(match self { - 
Term::RegularExpression(regex) => Cow::Borrowed(regex), - Term::Automaton(automaton) => { - if let Some(regex) = automaton.to_regex() { - Cow::Owned(regex) - } else { - todo!() - } - } - }) + Some(regex_list) } } @@ -629,7 +603,10 @@ mod tests { assert_eq!(diff.to_string(), "a+"); // Repetition - let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); // (abc){2,4} + let rep = Term::from_pattern("abc") + .unwrap() + .repeat(2, Some(4)) + .unwrap(); // (abc){2,4} assert_eq!(rep.to_string(), "(abc){2,4}"); // Analyze diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e8a354f..b1958a1 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -34,14 +34,14 @@ impl RegularExpression { pub fn new_total() -> Self { RegularExpression::Repetition( - Box::new(RegularExpression::Character(Range::total())), + Box::new(RegularExpression::Character(CharRange::total())), 0, None, ) } pub fn new_empty() -> Self { - RegularExpression::Character(Range::empty()) + RegularExpression::Character(CharRange::empty()) } pub fn new_empty_string() -> Self { @@ -56,7 +56,7 @@ impl RegularExpression { if let Ok(string) = String::from_utf8(literal.0.clone().into_vec()) { for char in string.chars() { regex_concat = regex_concat.concat( - &RegularExpression::Character(Range::new_from_range( + &RegularExpression::Character(CharRange::new_from_range( Char::new(char)..=Char::new(char), )), true, @@ -104,24 +104,24 @@ impl RegularExpression { } } - fn to_range_unicode(class_unicode: &ClassUnicode) -> Range { + fn to_range_unicode(class_unicode: &ClassUnicode) -> CharRange { let mut new_range = Vec::with_capacity(class_unicode.ranges().len()); for range in class_unicode.ranges() { new_range.push(AnyRange::from( Char::new(range.start())..=Char::new(range.end()), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } - fn to_range_bytes(class_bytes: &ClassBytes) -> Range { + fn to_range_bytes(class_bytes: &ClassBytes) -> CharRange { let 
mut new_range = Vec::with_capacity(class_bytes.ranges().len()); for range in class_bytes.ranges() { new_range.push(AnyRange::from( Char::new(range.start() as char)..=Char::new(range.end() as char), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 848842f..26e4c7f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,6 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::{Range, execution_profile::ExecutionProfile}; +use crate::execution_profile::ExecutionProfile; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -17,7 +17,7 @@ mod serializer; /// Represent a regular expression. #[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub enum RegularExpression { - Character(Range), + Character(CharRange), Repetition(Box, u32, Option), Concat(VecDeque), Alternation(Vec), @@ -125,7 +125,7 @@ impl RegularExpression { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; match self { - RegularExpression::Character(range) => FastAutomaton::make_from_range(range), + RegularExpression::Character(range) => FastAutomaton::new_from_range(range), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; automaton.repeat_mut(*min, *max_opt)?; diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 382c885..ae7da22 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -1,15 +1,16 @@ use super::*; mod concat; +mod repeat; mod simplify; mod union; -mod repeat; #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::RangeSet}; - use crate::regex::RegularExpression; + use regex_charclass::char::Char; + + use crate::{regex::RegularExpression, CharRange}; #[test] fn test_parse_and_simplify() -> Result<(), String> { @@ -37,8 +38,11 @@ mod tests { 
assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); - + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); Ok(()) } @@ -51,7 +55,7 @@ mod tests { #[test] fn test_repeat_simplify() -> Result<(), String> { assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(2), 3, @@ -59,7 +63,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(2), 2, @@ -67,7 +71,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 3, Some(3), 0, @@ -75,7 +79,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, Some(3), 1, @@ -83,7 +87,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 1, Some(2), 1, @@ -91,7 +95,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(3), 1, @@ -99,7 +103,7 @@ mod tests { ); 
assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 3, Some(4), 1, @@ -107,7 +111,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 7, Some(8), 1, @@ -115,7 +119,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, None, 3, @@ -123,7 +127,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 1, None, 0, @@ -131,7 +135,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 0, Some(1), 1, @@ -139,7 +143,7 @@ mod tests { ); assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), 2, Some(4), 2, @@ -150,7 +154,7 @@ mod tests { } fn assert_repeat_simplify( - range: &RangeSet, + range: &CharRange, min1: u32, max1: Option, min2: u32, diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 79697dd..602cb93 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{error::EngineError, fast_automaton::condition::Condition}; +use crate::{error::EngineError, fast_automaton::condition::Condition, CharRange}; use self::token::range_token::RangeToken; @@ -32,10 +32,7 @@ impl Tokenizer<'_> { vec.push(AutomatonToken::AcceptState) } - for (to_state, condition) in self - .automaton - .transitions_from_state_enumerate_iter(¤t_state) - { + for (condition, to_state) in self.automaton.transitions_from_iter(current_state) { if 
condition.is_empty() { continue; } @@ -73,7 +70,7 @@ impl Tokenizer<'_> { let mut from_state = None; let mut to_state = None; - let mut range = Range::empty(); + let mut range = CharRange::empty(); for token in vec { match token { AutomatonToken::Range(r) => { @@ -86,7 +83,7 @@ impl Tokenizer<'_> { if let Some(fs) = from_state { if let Some(ts) = to_state { Self::apply_transition(&mut automaton, fs, ts, &range)?; - range = Range::empty(); + range = CharRange::empty(); } to_state = Some((*s).into()); } else { @@ -107,7 +104,7 @@ impl Tokenizer<'_> { } from_state = None; to_state = None; - range = Range::empty(); + range = CharRange::empty(); } _ => return Err(EngineError::TokenError(TokenError::UnknownToken)), }; @@ -122,10 +119,10 @@ impl Tokenizer<'_> { automaton: &mut FastAutomaton, from_state: State, to_state: State, - range: &Range, + range: &CharRange, ) -> Result<(), EngineError> { let condition = Condition::from_range(range, automaton.get_spanning_set())?; - automaton.add_transition_to(from_state, to_state, &condition); + automaton.add_transition(from_state, to_state, &condition); Ok(()) } } @@ -150,7 +147,9 @@ mod tests { assert_embedding_convertion_for_fair( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); - assert_embedding_convertion_for_fair("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); + assert_embedding_convertion_for_fair( + 
"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); Ok(()) } @@ -181,14 +180,18 @@ mod tests { let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&unembedded_automaton) + .unwrap() + .is_empty() + ); + assert!( + unembedded_automaton + .subtraction(&automaton) + .unwrap() + .is_empty() + ); if !ignore_ai { // AI @@ -200,14 +203,18 @@ mod tests { let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); + assert!( + automaton + .subtraction(&unembedded_automaton) + .unwrap() + .is_empty() + ); + assert!( + unembedded_automaton + .subtraction(&automaton) + .unwrap() + .is_empty() + ); } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index cb581e6..3e05757 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::regex::RegularExpression; +use crate::{regex::RegularExpression, CharRange}; use self::token::regex_token::RegexToken; @@ -94,7 +94,7 @@ impl Tokenizer<'_> { vec: &[RegexToken], ) -> Result { let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; + 
let mut current_range: Option = None; let mut current_min = None; for i in 0..vec.len() { let token = vec[i]; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 2e3e4ed..3273b0e 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -2,8 +2,6 @@ use std::{cmp::Ordering, collections::VecDeque, vec}; use ahash::HashMapExt; use crate::fast_automaton::spanning_set::SpanningSet; -use crate::Range; - use crate::{ fast_automaton::{FastAutomaton, State}, IntMap, IntSet, @@ -43,9 +41,9 @@ impl Tokenizer<'_> { state_counter += 1; automaton - .transitions_from_state_enumerate_iter(¤t_state) - .filter(|(_, c)| !c.is_empty()) - .for_each(|(to_state, _)| { + .transitions_from_iter(current_state) + .filter(|(c, _)| !c.is_empty()) + .for_each(|(_, to_state)| { if !seen.contains(to_state) { worklist.push_front(*to_state); } diff --git a/src/tokenizer/range_tokenizer.rs b/src/tokenizer/range_tokenizer.rs index 3950033..e3b3c9c 100644 --- a/src/tokenizer/range_tokenizer.rs +++ b/src/tokenizer/range_tokenizer.rs @@ -1,3 +1,5 @@ +use crate::CharRange; + use self::token::range_token::RangeToken; use super::*; @@ -5,7 +7,7 @@ use super::*; #[derive(Debug)] pub struct RangeTokenizer<'a> { spanning_set: &'a SpanningSet, - total: Range, + total: CharRange, } impl RangeTokenizer<'_> { @@ -21,7 +23,7 @@ impl RangeTokenizer<'_> { } } - pub fn range_to_embedding(&self, range: &Range) -> Option> { + pub fn range_to_embedding(&self, range: &CharRange) -> Option> { if range == &self.total { return Some(vec![RangeToken::Total]); } else if !range.difference(&self.total).is_empty() { @@ -39,12 +41,12 @@ impl RangeTokenizer<'_> { Some(vec) } - pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { + pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { if vec.is_empty() { - return Some(Range::empty()); + return Some(CharRange::empty()); } - let mut range = Range::empty(); + let mut range = CharRange::empty(); if vec[0] == RangeToken::Total { return 
Some(self.total.clone()); } @@ -60,7 +62,7 @@ impl RangeTokenizer<'_> { Some(range) } - pub fn token_to_range(&self, token: &RangeToken) -> Option<&Range> { + pub fn token_to_range(&self, token: &RangeToken) -> Option<&CharRange> { match token { RangeToken::Total => Some(&self.total), RangeToken::Base(b) => self.spanning_set.get_spanning_range(*b), From bcc9d7d699dd66518a4e0ff32ba073c33a5542c4 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 18:11:27 +0200 Subject: [PATCH 10/62] Update description --- README.md | 26 ++++++++++---------- src/fast_automaton/operation/concat.rs | 2 ++ src/fast_automaton/operation/determinize.rs | 1 + src/fast_automaton/operation/intersection.rs | 3 +++ src/fast_automaton/operation/repeat.rs | 1 + src/fast_automaton/operation/subtraction.rs | 2 ++ src/fast_automaton/operation/union.rs | 3 +++ 7 files changed, 25 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1385f96..f009adc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - [Installation](#installation) - [Example](#example) - - [Key Concepts & Limitations](#key-concepts-limitations) + - [Key Concepts & Limitations](#key-concepts--limitations) - [API](#api) - [Term](#term) - [FastAutomaton](#fastautomaton) @@ -177,18 +177,18 @@ This design allows us to perform unions, intersections, and complements of trans #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `union(&self, other: &FastAutomaton)` | `Result` | | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | -| `concat(&self, other: &FastAutomaton)` | `Result` | | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| `determinize(&self)` | `Result` | | -| `intersection(&self, other: &FastAutomaton)` | `Result` | | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | | -| 
`intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | | -| `complement(&mut self)` | `Result<(), EngineError>` | | -| `subtraction(&self, other: &FastAutomaton)` | `Result` | | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | | +| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. | +| `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | +| `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | +| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. | +| `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. 
| +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | #### Analyze | Method | Return | Description | diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 71d97b0..6fa9d1b 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -7,10 +7,12 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. pub fn concat(&self, other: &FastAutomaton) -> Result { Self::concat_all([self, other]) } + /// Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index b0efb67..55d8c46 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,6 +5,7 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { + /// Determinize the automaton and returns it as a new `FastAutomaton`. pub fn determinize(&self) -> Result { if self.deterministic { return Ok(self.clone()); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 5dac078..29e5d49 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -12,10 +12,12 @@ use crate::{ use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the intersection of `self` and `other`. 
pub fn intersection(&self, other: &FastAutomaton) -> Result { FastAutomaton::intersection_all([self, other]) } + /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -31,6 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 2fd14db..cc4fb76 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,6 +1,7 @@ use super::*; impl FastAutomaton { + /// Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs index 8d45ae7..e4406e8 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -43,6 +43,7 @@ impl FastAutomaton { Ok(()) } + /// Complement the automaton, the automaton needs to be deterministic. pub fn complement(&mut self) -> Result<(), EngineError> { self.totalize()?; @@ -58,6 +59,7 @@ impl FastAutomaton { Ok(()) } + /// Returns a new `FastAutomaton` representing the subtraction of `self` and `other`. 
pub fn subtraction(&self, other: &FastAutomaton) -> Result { let mut complement = other.clone(); match complement.complement() { diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index ea9ad44..d83e7de 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -8,10 +8,12 @@ use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { + /// Returns a new `FastAutomaton` representing the union of `self` and `other`. pub fn union(&self, other: &FastAutomaton) -> Result { Self::union_all([self, other]) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); @@ -21,6 +23,7 @@ impl FastAutomaton { Ok(new_automaton) } + /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. 
pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); From 8d5b66e77e9430a1afc4d8b4c68abb401d755354 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 2 Aug 2025 21:19:46 +0200 Subject: [PATCH 11/62] Update docs --- README.md | 39 ++++++++++++++++++-- src/fast_automaton/analyze/cardinality.rs | 1 + src/fast_automaton/analyze/equivalence.rs | 1 + src/fast_automaton/analyze/length.rs | 1 + src/fast_automaton/analyze/mod.rs | 3 +- src/fast_automaton/analyze/subset.rs | 1 + src/fast_automaton/operation/intersection.rs | 3 +- src/lib.rs | 8 ++-- src/regex/analyze/affixes.rs | 4 +- src/regex/analyze/mod.rs | 2 + src/regex/analyze/number_of_states.rs | 14 +++---- src/regex/builder.rs | 12 ++++-- src/regex/mod.rs | 4 ++ src/regex/operation/concat.rs | 1 + src/regex/operation/repeat.rs | 1 + src/regex/operation/simplify.rs | 1 + src/regex/operation/union.rs | 12 +++--- 17 files changed, 79 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index f009adc..a808caf 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ println!("Some matches: {:?}", samples); // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); -assert!(!a.are_equivalent(&b).unwrap()); +assert!(!a.is_equivalent_of(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` @@ -121,7 +121,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | -| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. 
| +| `is_equivalent_of(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | | `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | | `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | @@ -185,7 +185,7 @@ This design allows us to perform unions, intersections, and complements of trans | `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | | `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. | | `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. 
| @@ -193,6 +193,10 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `is_empty(&self)` | `bool` | Checks if the current `FastAutomaton` matches the empty language. | +| `is_total(&self)` | `bool` | Checks if the current `FastAutomaton` matches all possible strings. | +| `is_empty_string(&self)` | `bool` | Checks if the current `FastAutomaton` only matches the empty string `""`. | +| `get_reacheable_states(&self)` | `IntSet` | Get a set of all reachable states from the start state. | | `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | | `all_states_iter(&self)` | `impl Iterator` | Returns an iterator of the states of the automaton. | @@ -215,13 +219,40 @@ This design allows us to perform unions, intersections, and complements of trans | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. | +| `is_equivalent_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | +| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. 
 Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | ### RegularExpression `RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. +#### Build +| Method | Return | Description | +| -------- | ------- | ------- | +| `new(pattern: &str)` | `Result` | Parses the provided pattern and return the resulting `RegularExpression`. | +| `new_empty()` | `RegularExpression` | Create a `RegularExpression` that matches the empty language. | +| `new_total()` | `RegularExpression` | Create a `RegularExpression` that matches all possible strings. | +| `new_empty_string()` | `RegularExpression` | Create a `RegularExpression` that only match the empty string `""`. | +| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a new `RegularExpression` representing the union of this expression with `other`. | +| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`.
| +| `simplify(&self)` | `RegularExpression` | Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. | + +#### Analyze +| Method | Return | Description | +| -------- | ------- | ------- | +| `is_empty(&self)` | `bool` | Checks if the current `RegularExpression` matches the empty language. | +| `is_total(&self)` | `bool` | Checks if the current `RegularExpression` matches all possible strings. | +| `is_empty_string(&self)` | `bool` | Checks if the current `RegularExpression` only match the empty string `""`. | +| `to_automaton(&self)` | `Result` | Convert the current `RegularExpression` to an equivalent `FastAutomaton`. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | + ## Error Handling ## Bound Execution diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index a2d6d91..57a346a 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,6 +3,7 @@ use std::hash::BuildHasherDefault; use super::*; impl FastAutomaton { + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). pub fn get_cardinality(&self) -> Option> { if self.is_empty() { return Some(Cardinality::Integer(0)); diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index d81294c..18a6f14 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,6 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. 
pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index c03ee80..638a93c 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,6 +1,7 @@ use super::*; impl FastAutomaton { + /// Returns the minimum and maximum length of the possible matched strings. pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index de49b73..7d5d7fa 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -30,6 +30,7 @@ impl FastAutomaton { self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 } + /// Get a set of all reacheable states from the start state. pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); @@ -66,7 +67,7 @@ impl FastAutomaton { live } - pub fn get_ranges(&self) -> Result, EngineError> { + pub(crate) fn get_ranges(&self) -> Result, EngineError> { self.spanning_set.get_spanning_ranges().map(|range| { Condition::from_range(range, &self.spanning_set) }).collect() diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 5705fc2..6eb8888 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,6 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. 
 pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 29e5d49..6987c31 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. + /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); @@ -126,6 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } + /// Returns `true` if the two automatons have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); diff --git a/src/lib.rs b/src/lib.rs index 3de5f60..0dc2ec9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -343,9 +343,9 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.is_equivalent_of(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { + pub fn is_equivalent_of(&self, that: &Term) -> Result { if self == that { return Ok(true); } @@ -430,6 +430,7 @@ impl Term { } } + /// Converts the current `Term` to a `FastAutomaton`. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -437,6 +438,7 @@ impl Term { }) } + /// Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted.
pub fn to_regex(&self) -> Option> { Some(match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -623,7 +625,7 @@ mod tests { // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); - assert!(!a.are_equivalent(&b).unwrap()); + assert!(!a.is_equivalent_of(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); Ok(()) diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 4213e3f..540580b 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - pub fn get_common_affixes( + pub(crate) fn get_common_affixes( &self, other: &RegularExpression, ) -> ( @@ -21,7 +21,7 @@ impl RegularExpression { (common_prefix, (self_regex, other_regex), common_suffix) } - pub fn get_common_affix( + pub(crate) fn get_common_affix( &self, other: &RegularExpression, is_prefix: bool, diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index ae08148..2b946cb 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,6 +6,7 @@ mod affixes; mod number_of_states; impl RegularExpression { + /// Returns the minimum and maximum length of the possible matched strings. pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -84,6 +85,7 @@ impl RegularExpression { } } + /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). 
pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 90c1897..e7460f8 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -9,7 +9,7 @@ struct AbstractStateMetadata { } impl AbstractStateMetadata { - pub fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { + pub(crate) fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { AbstractStateMetadata { has_incoming_edges, has_outgoing_edges, @@ -25,7 +25,7 @@ struct AbstractNFAMetadata { } impl AbstractNFAMetadata { - pub fn new() -> Self { + pub(crate) fn new() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, true), accepted: vec![AbstractStateMetadata::new(true, false)], @@ -33,7 +33,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty_string() -> Self { + pub(crate) fn new_empty_string() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![AbstractStateMetadata::new(false, false)], @@ -41,7 +41,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty() -> Self { + pub(crate) fn new_empty() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![], @@ -49,7 +49,7 @@ impl AbstractNFAMetadata { } } - pub fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { let start_state_and_accept_states_not_mergeable = nfa.start.has_incoming_edges && self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -68,7 +68,7 @@ impl AbstractNFAMetadata { } } - pub fn repeat(&self, min: u32, max_opt: &Option) -> Self { + pub(crate) fn repeat(&self, min: u32, max_opt: &Option) -> Self { let start_state_not_mergeable = self.start.has_incoming_edges; let accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); let 
start_state_or_accept_states_not_mergeable = @@ -129,7 +129,7 @@ impl AbstractNFAMetadata { } } - pub fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { let self_start_state_not_mergeable = self.start.has_incoming_edges; let self_accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index b1958a1..1b8f636 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,17 +11,18 @@ lazy_static! { } impl RegularExpression { - pub fn new(regex: &str) -> Result { - if regex.is_empty() { + /// Parses the provided pattern and return the resulting `RegularExpression`. + pub fn new(pattern: &str) -> Result { + if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } - if regex == "[]" { + if pattern == "[]" { return Ok(RegularExpression::new_empty()); } match ParserBuilder::new() .dot_matches_new_line(true) .build() - .parse(&Self::remove_flags(regex)) + .parse(&Self::remove_flags(pattern)) { Ok(hir) => Self::convert_to_regex(&hir), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), @@ -32,6 +33,7 @@ impl RegularExpression { RE_FLAG_DETECTION.replace_all(regex, "").to_string() } + /// Create a `RegularExpression` that matches all possible strings. pub fn new_total() -> Self { RegularExpression::Repetition( Box::new(RegularExpression::Character(CharRange::total())), @@ -40,10 +42,12 @@ impl RegularExpression { ) } + /// Create a `RegularExpression` that matches the empty language. pub fn new_empty() -> Self { RegularExpression::Character(CharRange::empty()) } + /// Create a`RegularExpression` that only match the empty string `""`. 
 pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 26e4c7f..5c7ca16 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -90,6 +90,7 @@ impl Display for RegularExpression { } impl RegularExpression { + /// Checks if the current `RegularExpression` matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -98,6 +99,7 @@ impl RegularExpression { } } + /// Checks if the current `RegularExpression` only match the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -105,6 +107,7 @@ impl RegularExpression { } } + /// Checks if the current `RegularExpression` matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -121,6 +124,7 @@ impl RegularExpression { } } + /// Convert the current `RegularExpression` to an equivalent `FastAutomaton`. pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 6907d9b..ac699d8 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order.
pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 00b9685..7da36bb 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index ae87087..51f66b7 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,6 +1,7 @@ use super::*; impl RegularExpression { + /// Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { RegularExpression::Character(_) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 65b34f4..1c09e78 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,17 +3,17 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { + /// Returns a new `RegularExpression` representing the union of this expression with `other`. pub fn union(&self, other: &RegularExpression) -> RegularExpression { Self::union_all([self, other]) } - pub fn union_all<'a, I>(regexes: I) -> RegularExpression - where - I: IntoIterator, + /// Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. 
+ pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); - for other in regexes { + for other in patterns { result = result.union_(other); if result.is_total() { @@ -115,9 +115,7 @@ impl RegularExpression { RegularExpression::Alternation(alternate) } } else { - panic!( - "Not character and repetition {this_character:?} {that_repetition:?}" - ) + panic!("Not character and repetition {this_character:?} {that_repetition:?}") } } From 90c462b5c63d5089284187522035940901748b81 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 14:08:36 +0200 Subject: [PATCH 12/62] update --- README.md | 24 ++-- src/error/mod.rs | 18 --- src/execution_profile.rs | 4 +- src/fast_automaton/analyze/equivalence.rs | 8 +- src/fast_automaton/analyze/mod.rs | 47 ++++++- src/fast_automaton/convert/to_regex/mod.rs | 6 +- src/fast_automaton/spanning_set/mod.rs | 2 + src/lib.rs | 39 +++--- src/regex/analyze/affixes.rs | 1 + src/regex/operation/mod.rs | 2 +- src/tokenizer/embed_automaton.rs | 46 ++----- src/tokenizer/embed_regex.rs | 13 -- src/tokenizer/embed_regex_operations.rs | 119 ------------------ src/tokenizer/mod.rs | 1 - src/tokenizer/token/automaton_token.rs | 43 ------- src/tokenizer/token/mod.rs | 16 --- src/tokenizer/token/range_token.rs | 34 ----- src/tokenizer/token/regex_operations_token.rs | 64 ---------- src/tokenizer/token/regex_token.rs | 53 -------- tests/integration_tests.rs | 4 +- 20 files changed, 101 insertions(+), 443 deletions(-) delete mode 100644 src/tokenizer/embed_regex_operations.rs delete mode 100644 src/tokenizer/token/regex_operations_token.rs diff --git a/README.md b/README.md index a808caf..2bca560 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ - [Term](#term) - [FastAutomaton](#fastautomaton) - [RegularExpression](#regularexpression) - - [Error Handling](#error-handling) - [Bound 
Execution](#bound-execution) - [Cross-Language Support](#cross-language-support) - [License](#license) @@ -37,29 +36,32 @@ let t2 = Term::from_pattern(".*xyz").unwrap(); // Concatenate let concat = t1.concat(&[t2]).unwrap(); -assert_eq!(concat.to_string(), "abc.*xyz"); +assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -assert_eq!(union.to_string(), "(abc.*|fgh)"); +assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}") .unwrap() .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy -assert_eq!(inter.to_string(), "(ab|xy)xy"); +assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); // Subtraction let diff = Term::from_pattern("a*") .unwrap() .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); -assert_eq!(diff.to_string(), "a+"); +assert_eq!(diff.to_pattern().unwrap(), "a+"); // Repetition -let rep = Term::from_pattern("abc").unwrap().repeat(2, Some(4)).unwrap(); -assert_eq!(rep.to_string(), "(abc){2,4}"); +let rep = Term::from_pattern("abc") + .unwrap() + .repeat(2, Some(4)) + .unwrap(); +assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -75,7 +77,7 @@ println!("Some matches: {:?}", samples); // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); -assert!(!a.is_equivalent_of(&b).unwrap()); +assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); ``` @@ -121,7 +123,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. 
| -| `is_equivalent_of(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | +| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | | `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | | `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | @@ -220,7 +222,7 @@ This design allows us to perform unions, intersections, and complements of trans | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. | -| `is_equivalent_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | +| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | | `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. 
| | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | @@ -253,8 +255,6 @@ This design allows us to perform unions, intersections, and complements of trans | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | -## Error Handling - ## Bound Execution By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. diff --git a/src/error/mod.rs b/src/error/mod.rs index e88d1e1..303c225 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -54,21 +54,3 @@ impl fmt::Display for EngineError { } impl std::error::Error for EngineError {} - -impl EngineError { - /// Determine if the error is a server error. - /// A server error should not be shown to the end user. 
- pub fn is_server_error(&self) -> bool { - match self { - EngineError::InvalidCharacterInRegex => false, - EngineError::OperationTimeOutError => false, - EngineError::AutomatonShouldBeDeterministic => true, - EngineError::AutomatonHasTooManyStates => false, - EngineError::RegexSyntaxError(_) => false, - EngineError::TokenError(_) => false, - EngineError::ConditionInvalidRange => true, - EngineError::ConditionIndexOutOfBound => true, - EngineError::CannotComputeAutomatonCardinality => false, - } - } -} diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 3ba3a33..76c1b78 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -333,7 +333,7 @@ mod tests { Ok(()) } - #[test] + /*#[test] fn test_execution_timeout_intersection() -> Result<(), String> { let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); @@ -359,5 +359,5 @@ mod tests { }); Ok(()) - } + }*/ } diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 18a6f14..6483d68 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. 
- pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { + pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { @@ -72,14 +72,14 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_equivalent_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.are_equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_equivalent_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.are_equivalent(&automaton_2).unwrap()); assert_eq!( expected, - automaton_1.is_equivalent_of(&automaton_2).unwrap() + automaton_1.are_equivalent(&automaton_2).unwrap() ); } } diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 7d5d7fa..2f220b0 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,11 +10,13 @@ mod length; mod subset; impl FastAutomaton { + /// Checks if the current `FastAutomaton` matches the empty language. #[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } + /// Checks if the current `FastAutomaton` matches all possible strings. #[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) { @@ -25,9 +27,12 @@ impl FastAutomaton { false } + /// Checks if the current `FastAutomaton` only match the empty string `""`. 
#[inline] pub fn is_empty_string(&self) -> bool { - self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) && self.state_in_degree(self.start_state) == 0 + self.accept_states.len() == 1 + && self.accept_states.contains(&self.start_state) + && self.state_in_degree(self.start_state) == 0 } /// Get a set of all reacheable states from the start state. @@ -68,8 +73,42 @@ impl FastAutomaton { } pub(crate) fn get_ranges(&self) -> Result, EngineError> { - self.spanning_set.get_spanning_ranges().map(|range| { - Condition::from_range(range, &self.spanning_set) - }).collect() + self.spanning_set + .get_spanning_ranges() + .map(|range| Condition::from_range(range, &self.spanning_set)) + .collect() + } +} + +#[cfg(test)] +mod tests { + + use crate::fast_automaton::FastAutomaton; + + #[test] + fn test_empty() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty()); + assert!(!FastAutomaton::new_empty_string().is_empty()); + assert!(FastAutomaton::new_empty().is_empty()); + + Ok(()) + } + + #[test] + fn test_empty_string() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty_string()); + assert!(FastAutomaton::new_empty_string().is_empty_string()); + assert!(!FastAutomaton::new_empty().is_empty_string()); + + Ok(()) + } + + #[test] + fn test_total() -> Result<(), String> { + assert!(FastAutomaton::new_total().is_total()); + assert!(!FastAutomaton::new_empty_string().is_total()); + assert!(!FastAutomaton::new_empty().is_total()); + + Ok(()) } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 2469d03..3e6193c 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -252,7 +252,7 @@ impl FastAutomaton { if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { let regex = regex?; match regex.to_automaton() { - Ok(automaton) => match self.is_equivalent_of(&automaton) { + Ok(automaton) => match 
self.are_equivalent(&automaton) { Ok(result) => { if !result { warn!( @@ -337,7 +337,7 @@ mod tests { println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); @@ -347,7 +347,7 @@ mod tests { println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); } #[test] diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 2e998b8..bfaefcb 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -63,6 +63,7 @@ impl SpanningSet { &self.1 } + /// Compute a new minimal spanning set by merging the provided spanning set. pub fn merge(&self, other: &Self) -> Self { let mut ranges = Vec::with_capacity(self.0.len() + other.0.len()); ranges.extend_from_slice(&self.0); @@ -71,6 +72,7 @@ impl SpanningSet { Self::compute_spanning_set(&ranges) } + /// Compute a new minimal spanning set for the provided ranges. 
pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); diff --git a/src/lib.rs b/src/lib.rs index 0dc2ec9..d177120 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -343,16 +343,16 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.is_equivalent_of(&term2).unwrap()); + /// assert!(!term1.are_equivalent(&term2).unwrap()); /// ``` - pub fn is_equivalent_of(&self, that: &Term) -> Result { + pub fn are_equivalent(&self, that: &Term) -> Result { if self == that { return Ok(true); } let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.is_equivalent_of(&automaton_2) + automaton_1.are_equivalent(&automaton_2) } /// Compute whether the current term is a subset of the given term. @@ -446,6 +446,11 @@ impl Term { }) } + /// Converts the current `Term` to a regular expression pattern. Returns `None` if the automaton cannot be converted. 
+ pub fn to_pattern(&self) -> Option { + Some(self.to_regex()?.to_string()) + } + fn determinize_subtrahend<'a>( minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, @@ -522,9 +527,9 @@ mod tests { let result = regex1.subtraction(&regex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_pattern().unwrap(); assert_eq!( - Term::RegularExpression(RegularExpression::new("a+").unwrap()), + "a+", result ); @@ -538,10 +543,10 @@ mod tests { let result = regex1.subtraction(&regex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_regex().unwrap().into_owned(); assert_eq!( Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), - result + Term::RegularExpression(result) ); Ok(()) @@ -554,8 +559,8 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(Term::from_pattern("").unwrap(), result); + let result = result.unwrap().to_pattern().unwrap(); + assert_eq!("", result); Ok(()) } @@ -567,9 +572,9 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_pattern().unwrap(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(x{3})*").unwrap()), + "(x{3})*", result ); @@ -584,32 +589,32 @@ mod tests { // Concatenate let concat = t1.concat(&[t2]).unwrap(); - assert_eq!(concat.to_string(), "abc.*xyz"); + assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) - assert_eq!(union.to_string(), "(abc.*|fgh)"); + assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}") .unwrap() .intersection(&[Term::from_pattern(".*xy").unwrap()]) .unwrap(); // (ab|xy)xy - assert_eq!(inter.to_string(), "(ab|xy)xy"); + assert_eq!(inter.to_pattern().unwrap(),
"(ab|xy)xy"); // Subtraction let diff = Term::from_pattern("a*") .unwrap() .subtraction(&Term::from_pattern("").unwrap()) .unwrap(); - assert_eq!(diff.to_string(), "a+"); + assert_eq!(diff.to_pattern().unwrap(), "a+"); // Repetition let rep = Term::from_pattern("abc") .unwrap() .repeat(2, Some(4)) .unwrap(); // (abc){2,4} - assert_eq!(rep.to_string(), "(abc){2,4}"); + assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -625,7 +630,7 @@ mod tests { // Equivalence & subset let a = Term::from_pattern("a+").unwrap(); let b = Term::from_pattern("a*").unwrap(); - assert!(!a.is_equivalent_of(&b).unwrap()); + assert!(!a.are_equivalent(&b).unwrap()); assert!(a.is_subset_of(&b).unwrap()); Ok(()) diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 540580b..34aa401 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -285,6 +285,7 @@ mod tests { assert_regex_affix(true, "(ab|cd)x", "(ab|cd)y", "(ab|cd)", "x", "y"); assert_regex_affix(true, "a+", "a+b", "a+", "", "b"); + assert_regex_affix(true, "(ab|cd)", "(ab|cd)", "(ab|cd)", "", ""); Ok(()) } diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index ae7da22..f572238 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -180,6 +180,6 @@ mod tests { let result = got.to_automaton().unwrap(); - assert!(repeat.is_equivalent_of(&result).unwrap()); + assert!(repeat.are_equivalent(&result).unwrap()); } } diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 602cb93..0838525 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -137,13 +137,13 @@ mod tests { #[test] fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair_and_ai("(a|b)"); - assert_embedding_convertion_for_fair_and_ai("(|a)"); - assert_embedding_convertion_for_fair_and_ai(".*ab"); - 
assert_embedding_convertion_for_fair_and_ai("toto"); - assert_embedding_convertion_for_fair_and_ai(".{2,3}"); - assert_embedding_convertion_for_fair_and_ai("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair_and_ai(".*q(ab|ca|ab|abc)x"); + assert_embedding_convertion_for_fair("(a|b)"); + assert_embedding_convertion_for_fair("(|a)"); + assert_embedding_convertion_for_fair(".*ab"); + assert_embedding_convertion_for_fair("toto"); + assert_embedding_convertion_for_fair(".{2,3}"); + assert_embedding_convertion_for_fair("q(ab|ca|ab|abc)x"); + assert_embedding_convertion_for_fair(".*q(ab|ca|ab|abc)x"); assert_embedding_convertion_for_fair( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); @@ -155,14 +155,10 @@ mod tests { } fn assert_embedding_convertion_for_fair(regex: &str) { - assert_embedding_convertion(regex, true); + assert_embedding_convertion(regex); } - fn assert_embedding_convertion_for_fair_and_ai(regex: &str) { - assert_embedding_convertion(regex, false); - } - - fn assert_embedding_convertion(regex: &str, ignore_ai: bool) { + fn assert_embedding_convertion(regex: &str) { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); @@ -192,29 +188,5 @@ mod tests { .unwrap() .is_empty() ); - - if !ignore_ai { - // AI - let embedding_u8 = AutomatonToken::to_ai_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u8 - .iter() - .map(|&t| AutomatonToken::from_ai_token(t)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!( - automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty() - ); - assert!( - unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty() - ); - } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index 3e05757..fe73cab 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -290,18 +290,5 @@ mod tests { let unembedded_regex = 
tokenizer.from_regex_embedding(&embedding).unwrap(); assert_eq!(regex, unembedded_regex); - - // AI - let embedding_u8 = RegexToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); } } diff --git a/src/tokenizer/embed_regex_operations.rs b/src/tokenizer/embed_regex_operations.rs deleted file mode 100644 index 4dcb19f..0000000 --- a/src/tokenizer/embed_regex_operations.rs +++ /dev/null @@ -1,119 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_operations_token::RegexOperationsToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_operations_embedding( - &self, - regex_operations: &[(bool, RegularExpression)], - ) -> Vec { - let mut vec = vec![]; - - for (not, regex) in regex_operations { - if !vec.is_empty() { - vec.push(RegexOperationsToken::And); - } - if *not { - vec.push(RegexOperationsToken::Not); - } - - vec.extend( - self.to_regex_embedding(regex) - .into_iter() - .map(RegexOperationsToken::RegexToken), - ); - } - - vec - } - - pub fn from_regex_operations_embedding( - &self, - vec: &[RegexOperationsToken], - ) -> Result, TokenError> { - let mut operations = vec![]; - let mut current_regex_not = false; - let mut current_regex_token = vec![]; - for token in vec { - match token { - RegexOperationsToken::RegexToken(regex_token) => { - current_regex_token.push(*regex_token) - } - RegexOperationsToken::And => { - let regex = self.from_regex_embedding(&current_regex_token)?; - operations.push((current_regex_not, regex)); - current_regex_not = false; - current_regex_token.clear(); - } - RegexOperationsToken::Not => current_regex_not = true, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }; - } - - if !current_regex_token.is_empty() { - let regex =
self.from_regex_embedding(&current_regex_token)?; - operations.push((current_regex_not, regex)); - } - - Ok(operations) - } -} - -#[cfg(test)] -mod tests { - use embed_regex_operations::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(&[(false, "(a|b)")]); - assert_embedding_convertion(&[(false, "(|a)")]); - assert_embedding_convertion(&[(false, ".*ab")]); - assert_embedding_convertion(&[(true, "toto")]); - assert_embedding_convertion(&[(false, ".{2,3}")]); - assert_embedding_convertion(&[(false, "q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, ".*q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, "(abc){3,6}")]); - assert_embedding_convertion(&[(true, "((|a)abd+){3}")]); - - assert_embedding_convertion(&[(false, ".*a.*"), (false, ".*b.*"), (true, ".*abc.*")]); - Ok(()) - } - - fn assert_embedding_convertion(operations: &[(bool, &str)]) { - let mut automaton = FastAutomaton::new_total(); - let operations: Vec<(bool, RegularExpression)> = operations - .iter() - .map(|(not, regex)| { - let regex = RegularExpression::new(regex).unwrap(); - automaton = automaton.intersection(&regex.to_automaton().unwrap()).unwrap(); - (*not, regex) - }) - .collect(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_operations_embedding(&operations); - - // AI - let embedding_u8: Vec = RegexOperationsToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexOperationsToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_operations = tokenizer - .from_regex_operations_embedding(&embedding) - .unwrap(); - assert_eq!(operations, unembedded_operations); - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 3273b0e..7c83c1a 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -11,7 +11,6 @@ use self::{range_tokenizer::RangeTokenizer,
token::automaton_token::AutomatonTok mod embed_automaton; mod embed_regex; -mod embed_regex_operations; pub mod range_tokenizer; pub mod token; diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs index 215ffed..e5f379c 100644 --- a/src/tokenizer/token/automaton_token.rs +++ b/src/tokenizer/token/automaton_token.rs @@ -24,15 +24,6 @@ impl PartialOrd for AutomatonToken { } impl AutomatonToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_STATE: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_ACCEPT_STATE: u8 = Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES; - const TK_AI_SEPARATOR_STATE: u8 = Self::TK_AI_ACCEPT_STATE + 1; - - pub const AI_MAX_NUMBER_OF_STATES: u8 = 100; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_SEPARATOR_STATE + 1; - const TK_FAIR_RANGE: u16 = 0; const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; @@ -44,40 +35,6 @@ impl AutomatonToken { } impl Token for AutomatonToken { - fn from_ai_token(token: u8) -> AutomatonToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - AutomatonToken::Range(RangeToken::from_ai_token(token)) - } else if (Self::TK_AI_STATE..Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State((token - Self::TK_AI_STATE) as u16) - } else if token == Self::TK_AI_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_AI_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_ai_token()?, - AutomatonToken::State(s) => { - let max = Self::AI_MAX_NUMBER_OF_STATES; - let s = *s as u8; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_AI_STATE - } - 
AutomatonToken::AcceptState => Self::TK_AI_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_AI_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> AutomatonToken { if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) .contains(&token) diff --git a/src/tokenizer/token/mod.rs b/src/tokenizer/token/mod.rs index 2f28e32..4342be8 100644 --- a/src/tokenizer/token/mod.rs +++ b/src/tokenizer/token/mod.rs @@ -4,7 +4,6 @@ use super::*; pub mod automaton_token; pub mod range_token; -pub mod regex_operations_token; pub mod regex_token; #[derive(Debug, PartialEq, Eq)] @@ -28,21 +27,6 @@ impl Display for TokenError { } pub trait Token { - fn from_ai_token(token: u8) -> Self; - - fn to_ai_token(&self) -> Result; - - fn to_ai_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_ai_token()?); - } - Ok(vec) - } - fn from_fair_token(token: u16) -> Self; fn to_fair_token(&self) -> Result; diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs index 62a1753..7876452 100644 --- a/src/tokenizer/token/range_token.rs +++ b/src/tokenizer/token/range_token.rs @@ -8,13 +8,6 @@ pub enum RangeToken { } impl RangeToken { - const TK_AI_TOTAL: u8 = 0; - const TK_AI_BASE: u8 = 1; - - pub const AI_MAX_NUMBER_OF_BASES: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES + 1; - const TK_FAIR_TOTAL: u16 = 0; const TK_FAIR_BASE: u16 = 1; @@ -36,33 +29,6 @@ impl PartialOrd for RangeToken { } impl Token for RangeToken { - fn from_ai_token(token: u8) -> RangeToken { - if token == Self::TK_AI_TOTAL { - RangeToken::Total - } else if (Self::TK_AI_BASE..Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_AI_BASE) as usize) - } else { - RangeToken::Error - } 
- } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_AI_TOTAL, - RangeToken::Base(b) => { - let max = Self::AI_MAX_NUMBER_OF_BASES; - let b = *b as u8; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_AI_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> RangeToken { if token == Self::TK_FAIR_TOTAL { RangeToken::Total diff --git a/src/tokenizer/token/regex_operations_token.rs b/src/tokenizer/token/regex_operations_token.rs deleted file mode 100644 index 1074f7f..0000000 --- a/src/tokenizer/token/regex_operations_token.rs +++ /dev/null @@ -1,64 +0,0 @@ -use self::regex_token::RegexToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexOperationsToken { - RegexToken(RegexToken), - And, - Not, - Error, -} - -impl Ord for RegexOperationsToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_ai_token().unwrap()).cmp(&other.to_ai_token().unwrap()) - } -} - -impl PartialOrd for RegexOperationsToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexOperationsToken { - const TK_AI_REGEX_TOKEN: u8 = 0; - const TK_AI_AND: u8 = Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE; - const TK_AI_NOT: u8 = Self::TK_AI_AND + 1; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_NOT + 1; -} - -impl Token for RegexOperationsToken { - fn from_ai_token(token: u8) -> RegexOperationsToken { - if (Self::TK_AI_REGEX_TOKEN..Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE) - .contains(&token) - { - RegexOperationsToken::RegexToken(RegexToken::from_ai_token(token)) - } else if token == Self::TK_AI_AND { - RegexOperationsToken::And - } else if token == Self::TK_AI_NOT { - RegexOperationsToken::Not - } else { - RegexOperationsToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - 
RegexOperationsToken::RegexToken(regex_token) => regex_token.to_ai_token()?, - RegexOperationsToken::And => Self::TK_AI_AND, - RegexOperationsToken::Not => Self::TK_AI_NOT, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(_: u16) -> RegexOperationsToken { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } - - fn to_fair_token(&self) -> Result { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs index 2f4c2f2..bcb2e2b 100644 --- a/src/tokenizer/token/regex_token.rs +++ b/src/tokenizer/token/regex_token.rs @@ -26,18 +26,6 @@ impl PartialOrd for RegexToken { } impl RegexToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_START_GROUP: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_END_GROUP: u8 = Self::TK_AI_START_GROUP + 1; - const TK_AI_ALTERNATION: u8 = Self::TK_AI_END_GROUP + 1; - const TK_AI_REPETITION_NONE: u8 = Self::TK_AI_ALTERNATION + 1; - const TK_AI_REPETITION: u8 = Self::TK_AI_REPETITION_NONE + 1; - - pub const AI_MAX_NUMBER_OF_REPETITION: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = - Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION + 1; - const TK_FAIR_RANGE: u16 = 0; const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; @@ -52,47 +40,6 @@ impl RegexToken { } impl Token for RegexToken { - fn from_ai_token(token: u8) -> RegexToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - RegexToken::Range(RangeToken::from_ai_token(token)) - } else if token == Self::TK_AI_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_AI_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_AI_ALTERNATION { - RegexToken::Alternation - } else if token == 
Self::TK_AI_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_AI_REPETITION - ..Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition((token - Self::TK_AI_REPETITION) as u16) - } else { - RegexToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_ai_token()?, - RegexToken::StartGroup => Self::TK_AI_START_GROUP, - RegexToken::EndGroup => Self::TK_AI_END_GROUP, - RegexToken::Alternation => Self::TK_AI_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_AI_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::AI_MAX_NUMBER_OF_REPETITION; - let r = *r as u8; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_AI_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } - fn from_fair_token(token: u16) -> RegexToken { if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) .contains(&token) diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 1e572a9..d1dd407 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -29,11 +29,11 @@ fn assert_regex(regex: &str) { assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.is_equivalent_of(&determinized_automaton).unwrap()); + assert!(automaton.are_equivalent(&determinized_automaton).unwrap()); let regex_from_automaton = automaton.to_regex().unwrap(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.is_equivalent_of(&automaton_from_regex).unwrap()); + assert!(automaton.are_equivalent(&automaton_from_regex).unwrap()); } #[test] From 7884f7336817e1ab9b9d6796730b4612bdea5eb0 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 
15:54:58 +0200 Subject: [PATCH 13/62] update most descriptions --- README.md | 168 ++++++++++--------- src/fast_automaton/analyze/cardinality.rs | 2 +- src/fast_automaton/analyze/equivalence.rs | 6 +- src/fast_automaton/analyze/length.rs | 2 +- src/fast_automaton/analyze/mod.rs | 8 +- src/fast_automaton/analyze/subset.rs | 4 +- src/fast_automaton/builder.rs | 43 +++-- src/fast_automaton/convert/to_regex/mod.rs | 10 +- src/fast_automaton/generate.rs | 7 +- src/fast_automaton/mod.rs | 30 ++-- src/fast_automaton/operation/concat.rs | 2 +- src/fast_automaton/operation/determinize.rs | 8 +- src/fast_automaton/operation/intersection.rs | 2 +- src/fast_automaton/operation/repeat.rs | 2 +- src/fast_automaton/operation/subtraction.rs | 2 +- src/fast_automaton/serializer.rs | 19 ++- src/lib.rs | 67 +++----- src/regex/analyze/mod.rs | 6 +- src/regex/builder.rs | 8 +- src/regex/mod.rs | 8 +- src/regex/operation/concat.rs | 2 +- src/regex/operation/repeat.rs | 2 +- src/regex/operation/simplify.rs | 2 +- src/regex/operation/union.rs | 4 +- src/tokenizer/embed_automaton.rs | 5 +- src/tokenizer/embed_regex.rs | 3 +- 26 files changed, 215 insertions(+), 207 deletions(-) diff --git a/README.md b/README.md index 2bca560..3b1084c 100644 --- a/README.md +++ b/README.md @@ -102,43 +102,44 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re #### Build | Method | Return | Description | | -------- | ------- | ------- | +| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | +| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | +| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | | `new_empty()` | `Term` | Creates a term that matches the empty language. | +| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. 
| | `new_total()` | `Term` | Creates a term that matches all possible strings. | -| `new_empty_string()` | `Term` | Creates a term that only match the empty string `""`. | -| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and return a new `Term` holding the resulting `RegularExpression`. | -| `from_pattern(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | -| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given collection of terms. Returns the resulting term. | -| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given collection of terms. Returns the resulting term. | -| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given collection of terms. Returns the resulting term. | -| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the subtraction/difference of the two given terms. Returns the resulting term. | -| `difference(&self, subtrahend: &Term)` | `Result` | See `self.subtraction(subtrahend: &Term)`. | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the current term, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given terms. | +| `difference(&self, subtrahend: &Term)` | `Result` | Alias for `subtraction`. | +| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given terms. | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
| +| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the difference between `self` and the given subtrahend. | +| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given terms. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates the given count of strings matched by the given term. | -| `are_equivalent(&self, term: &Term)` | `Result` | Computes whether the current term and the given term are equivalent. Returns `true` if both terms accept the same language. | -| `is_subset_of(&self, term: &Term)` | `Result` | Computes whether the current term is a subset of the given term. Returns `true` if all strings matched by the current term are also matched by the given term. | -| `is_empty(&self)` | `bool` | Checks if the current term matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current term matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current term only match the empty string `""`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | -| `to_automaton(&self)` | `Result, EngineError>` | Converts the current `Term` to a `FastAutomaton`. | -| `to_regex(&self)` | `Option>` | Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. | +| `are_equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | +| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). 
| +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | +| `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | +| `is_subset_of(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | +| `is_total(&self)` | `bool` | Checks if the term matches all possible strings. | +| `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | +| `to_pattern(&self)` | `Option` | Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. | +| `to_regex(&self)` | `Option>` | Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. | ### FastAutomaton `FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automaton can be converted to a regular expression. -When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To construct a Condition, call: To build a `Condition`, call: +When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: ```rust Condition::from_range(&range, &spanning_set); ``` @@ -164,68 +165,69 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `new_empty()` | `FastAutomaton` | Create an automaton that matches the empty language. 
| -| `new_total()` | `FastAutomaton` | Create an automaton that matches all possible strings. | -| `new_empty_string()` | `FastAutomaton` | Create an automaton that only match the empty string `""`. | -| `new_from_range(range: &CharRange)` | `Result` | Create an automaton that matches one of the characters in the provided `CharRange`. | -| `new_state(&mut self)` | `State` | Create a new state in the automaton and returns its identifier. | -| `accept(&mut self, state: State)` | | Make the automaton accept the provided state as a valid final state. | -| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. | -| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Create a new epsilon transition between the two provided states. | -| `remove_state(&mut self, state: State)` | | Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. | -| `remove_states(&mut self, states: &IntSet)` | | Remove the provided states from the automaton. Remove all the transitions they are connected to. Panic if one of the state is used as a start state. | -| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Apply the provided spanning set to the automaton and project all of its conditions on it. | +| `accept(&mut self, state: State)` | | Marks the provided state as an accepting (final) state. | +| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Creates a new epsilon transition between the two states. | +| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. 
| +| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | +| `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | +| `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | +| `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | +| `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | +| `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | +| `remove_state(&mut self, state: State)` | | Removes the state and all its connected transitions; panics if it's a start state. | +| `remove_states(&mut self, states: &IntSet)` | | Removes the given states and their connected transitions; panics if any is a start state. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | -| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | | `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. 
| -| `determinize(&self)` | `Result` | Determinize the automaton and returns it as a new `FastAutomaton`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. | +| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result as a new `FastAutomaton`. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. | | `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | -| `complement(&mut self)` | `Result<(), EngineError>` | Complement the automaton, the automaton needs to be deterministic. | +| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Returns the repetition of the automaton, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | +| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. 
| +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `is_empty(&self)` | `bool` | Checks if the current `FastAutomaton` matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current `FastAutomaton` matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current `FastAutomaton` only match the empty string `""`. | -| `get_reacheable_states(&self)` | `IntSet` | Get a set of all reacheable states from the start state. | -| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator of the states of the automaton. | -| `all_states_vec(&self)` | `Vec` | Returns a vector containing the states of the automaton. | -| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over all states directly reachable from the given state in one transition. | -| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector containing all states directly reachable from the given state in one transition. | -| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions to the provided state. | -| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector containing the transitions from the provided state. | -| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator containing the transitions from the provided state. | -| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator containing the transitions from the provided state. 
| -| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator containing the transitions from the provided state. | -| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition between the two provided states. | -| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a reference of the directed transtion's condition between the two provided states. | -| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Get a mutable reference of the directed transtion's condition between the two provided states. | -| `get_start_state(&self)` | `State` | Returns the start state of the automaton. | -| `get_accept_states(&self)` | `&IntSet` | Get a reference to the set of accept (final) states of the automaton. | +| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `all_states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | +| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | +| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | +| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. 
| +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | +| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | +| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&mut Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | +| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | -| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given `state` is one of the automaton's accept states. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `get_start_state(&self)` | `State` | Returns the start state. | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | +| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | +| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `to_regex(&self)` | `Option` | Try to convert the automaton to a `RegularExpression`. If it cannot find an equivalent pattern returns `None`. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automatons have a non-empty intersection. 
-| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. | -| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. | -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | +| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | +| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `to_regex(&self)` | `Option` | Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern is found. | +| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator over transitions from the given state. | +| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. 
| +| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator over transitions from the given state. | +| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | +| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. | ### RegularExpression @@ -235,25 +237,25 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `new(pattern: &str)` | `Result` | Parses the provided pattern and return the resulting `RegularExpression`. | -| `new_empty()` | `RegularExpression` | Create a `RegularExpression` that matches the empty language. | -| `new_total()` | `RegularExpression` | Create a `RegularExpression` that matches all possible strings. | -| `new_empty_string()` | `RegularExpression` | Create a `RegularExpression` that only match the empty string `""`. | -| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. | -| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. | -| `union(&self, other: &RegularExpression)` | `RegularExpression` | Create a`RegularExpression` that only match the empty string `""`. | -| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. | -| `simplify(&self)` | `RegularExpression` | Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. 
| +| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | +| `new(pattern: &str)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. | +| `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | +| `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | +| `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | +| `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | +| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | +| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. | #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `is_empty(&self)` | `bool` | Checks if the current `RegularExpression` matches the empty language. | -| `is_total(&self)` | `bool` | Checks if the current `RegularExpression` matches all possible strings. | -| `is_empty_string(&self)` | `bool` | Checks if the current `RegularExpression` only match the empty string `""`. | -| `to_automaton(&self)` | `Result` | Convert the current `RegularExpression` to an equivalent `FastAutomaton`. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of the possible matched strings. 
| -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the provided term (i.e. the number of the possible matched strings). | +| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). | +| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of possible matched strings. | +| `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. | +| `is_empty_string(&self)` | `bool` | Checks if the regular expression only matches the empty string `""`. | +| `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. | +| `to_automaton(&self)` | `Result` | Converts the regular expression to an equivalent `FastAutomaton`. | ## Bound Execution diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 57a346a..7157bae 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,7 +3,7 @@ use std::hash::BuildHasherDefault; use super::*; impl FastAutomaton { - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Option> { if self.is_empty() { return Some(Cardinality::Integer(0)); diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 6483d68..32f2ccb 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,7 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Computes whether the current `FastAutomaton` and the given `FastAutomaton` are equivalent. Returns `true` if both automata accept the same language. + /// Returns `true` if both automata accept the same language. 
pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); @@ -11,14 +11,14 @@ impl FastAutomaton { return Ok(true); } - let mut other_complement = other.determinize()?; + let mut other_complement = other.determinize()?.into_owned(); other_complement.complement()?; if self.has_intersection(&other_complement)? { return Ok(false); } - let mut self_complement = self.determinize()?; + let mut self_complement = self.determinize()?.into_owned(); self_complement.complement()?; Ok(!self_complement.has_intersection(other)?) diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 638a93c..5ab7180 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of matched strings. pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 2f220b0..46b7c23 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,13 +10,13 @@ mod length; mod subset; impl FastAutomaton { - /// Checks if the current `FastAutomaton` matches the empty language. + /// Checks if the automaton matches the empty language. #[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } - /// Checks if the current `FastAutomaton` matches all possible strings. + /// Checks if the automaton matches all possible strings. #[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) { @@ -27,7 +27,7 @@ impl FastAutomaton { false } - /// Checks if the current `FastAutomaton` only match the empty string `""`. + /// Checks if the automaton only matches the empty string `""`. 
#[inline] pub fn is_empty_string(&self) -> bool { self.accept_states.len() == 1 @@ -35,7 +35,7 @@ impl FastAutomaton { && self.state_in_degree(self.start_state) == 0 } - /// Get a set of all reacheable states from the start state. + /// Returns the set of all states reachable from the start state. pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 6eb8888..e08a476 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,7 +3,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Computes whether the current `FastAutomaton` is a subset of the given `FastAutomaton`. Returns `true` if all strings matched by the current `FastAutomaton` are also matched by the given `FastAutomaton`. + /// Returns `true` if all strings accepted by `self` are also accepted by `other`. pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); @@ -11,7 +11,7 @@ impl FastAutomaton { return Ok(false); } - let mut other = other.determinize()?; + let mut other = other.determinize()?.into_owned(); other.complement()?; Ok(!self.has_intersection(&other)?) diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index d6b69f4..8047152 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,7 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Create an automaton that matches the empty language. + /// Creates an automaton that matches the empty language. #[inline] pub fn new_empty() -> Self { Self { @@ -20,7 +20,7 @@ impl FastAutomaton { } } - /// Create an automaton that only match the empty string `""`. + /// Creates an automaton that only matches the empty string `""`. 
#[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); @@ -28,7 +28,7 @@ impl FastAutomaton { automaton } - /// Create an automaton that matches all possible strings. + /// Creates an automaton that matches all possible strings. #[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); @@ -38,7 +38,7 @@ impl FastAutomaton { automaton } - /// Create an automaton that matches one of the characters in the provided `CharRange`. + /// Creates an automaton that matches one of the characters in the given `CharRange`. pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { @@ -54,7 +54,7 @@ impl FastAutomaton { Ok(automaton) } - /// Create a new state in the automaton and returns its identifier. + /// Creates a new state and returns its identifier. #[inline] pub fn new_state(&mut self) -> State { if let Some(new_state) = self.removed_states.clone().iter().next() { @@ -66,14 +66,37 @@ impl FastAutomaton { } } - /// Make the automaton accept the provided state as a valid final state. + /// Marks the provided state as an accepting (final) state. #[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); self.accept_states.insert(state); } - /// Create a new transition between the two provided states with the given condition, the provided condition must follow the same spanning set as the rest of the automaton. + /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. + /// + /// This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: + /// ```rust,ignore + /// Condition::from_range(&range, &spanning_set); + /// ``` + /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: + /// + /// 1. 
Merge an existing spanning set with another: + /// ```rust,ignore + /// let new_set = SpanningSet::merge(&old_set, &other_set); + /// ``` + /// + /// 2. Recompute from a list of ranges: + /// ```rust,ignore + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); + /// ``` + /// + /// After constructing `new_set`, apply it to the automaton: + /// ```rust,ignore + /// fast_automaton.apply_new_spanning_set(&new_set); + /// ``` + /// + /// This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { @@ -111,7 +134,7 @@ impl FastAutomaton { }; } - /// Create a new epsilon transition between the two provided states. + /// Creates a new epsilon transition between the two states. pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; @@ -153,7 +176,7 @@ impl FastAutomaton { } } - /// Remove the provided state from the automaton. Remove all the transitions it is connected to. Panic if the state is used as a start state. + /// Removes the state and all its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { @@ -223,7 +246,7 @@ impl FastAutomaton { } } - /// Apply the provided spanning set to the automaton and project all of its conditions on it. + /// Applies the provided spanning set and projects all existing conditions onto it. 
pub fn apply_new_spanning_set( &mut self, new_spanning_set: &SpanningSet, diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 3e6193c..e99e506 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -240,9 +240,7 @@ impl StateEliminationAutomaton { } impl FastAutomaton { - /// Try to convert the current FastAutomaton to a RegularExpression. - /// If it cannot find an equivalent regex it returns None. - /// This method is still a work in progress. + /// Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern is found. pub fn to_regex(&self) -> Option { if self.is_empty() { return Some(RegularExpression::new_empty()); @@ -359,9 +357,8 @@ mod tests { let automaton2 = RegularExpression::new("ab") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.subtraction(&automaton2).unwrap(); @@ -403,9 +400,8 @@ mod tests { let automaton2 = RegularExpression::new("(xxx)*") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.subtraction(&automaton2).unwrap(); result.to_dot(); diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 7bbaf58..6cb0628 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -6,9 +6,10 @@ use ahash::AHashSet; use super::*; impl FastAutomaton { - pub fn generate_strings(&self, number: usize) -> Result, EngineError> { + /// Generates `number` strings matched by the automaton. 
+ pub fn generate_strings(&self, number: usize) -> Result, EngineError> { if self.is_empty() { - return Ok(AHashSet::new()); + return Ok(Vec::new()); } let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); @@ -81,7 +82,7 @@ impl FastAutomaton { } } - Ok(strings) + Ok(strings.into_iter().collect()) } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 4b2475b..30a09a9 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -98,19 +98,19 @@ impl FastAutomaton { self.transitions[state].len() } - /// Returns an iterator of the state of the automaton. + /// Returns an iterator over the automaton’s states. #[inline] pub fn all_states_iter(&self) -> impl Iterator + '_ { (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } - /// Returns a vector containing the states of the automaton. + /// Returns a vector containing the automaton’s states. #[inline] pub fn all_states_vec(&self) -> Vec { self.all_states_iter().collect() } - /// Returns an iterator over all states directly reachable from the given state in one transition. + /// Returns an iterator over states directly reachable from the given state in one transition. #[inline] pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { self.transitions[*state] @@ -119,7 +119,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s)) } - /// Returns a vector containing all states directly reachable from the given state in one transition. + /// Returns a vector of states directly reachable from the given state in one transition. #[inline] pub fn direct_states_vec(&self, state: &State) -> Vec { self.direct_states_iter(state).collect() @@ -139,7 +139,7 @@ impl FastAutomaton { in_transitions } - /// Returns a vector containing the transitions from the provided state. + /// Returns a vector of transitions from the given state. 
#[inline] pub fn transitions_from_vec(&self, state: State) -> Vec { self.transitions[state] @@ -149,7 +149,7 @@ impl FastAutomaton { .collect() } - /// Returns an iterator containing the transitions from the provided state. + /// Returns an iterator over transitions from the given state. #[inline] pub fn transitions_from_iter( &self, @@ -161,7 +161,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns a mutable iterator containing the transitions from the provided state. + /// Returns a mutable iterator over transitions from the given state. #[inline] pub fn transitions_from_iter_mut( &mut self, @@ -173,7 +173,7 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns an owned iterator containing the transitions from the provided state. + /// Returns an owned iterator over transitions from the given state. #[inline] pub fn transitions_from_into_iter( &self, @@ -186,7 +186,7 @@ impl FastAutomaton { .filter(|(_, state)| !self.removed_states.contains(state)) } - /// Returns `true` if there is a directed transition between the two provided states. + /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { @@ -215,13 +215,13 @@ impl FastAutomaton { self.transitions.len() - self.removed_states.len() } - // Get a reference of the directed transtion's condition between the two provided states. + // Returns a reference to the condition of the directed transition between the two states, if any. #[inline] pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { self.transitions[from_state].get(&to_state) } - // Get a mutable reference of the directed transtion's condition between the two provided states. 
+ // Returns a mutable reference to the condition of the directed transition between the two states, if any. #[inline] pub fn get_condition_mut( &mut self, @@ -231,13 +231,13 @@ impl FastAutomaton { self.transitions[from_state].get_mut(&to_state) } - /// Returns the start state of the automaton. + /// Returns the start state. #[inline] pub fn get_start_state(&self) -> State { self.start_state } - // Get a reference to the set of accept (final) states of the automaton. + // Returns a reference to the set of accept (final) states. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states @@ -249,7 +249,7 @@ impl FastAutomaton { &self.spanning_set } - /// Returns `true` if the given `state` is one of the automaton's accept states. + /// Returns `true` if the given state is one of the accept states. #[inline] pub fn is_accepted(&self, state: &State) -> bool { self.accept_states.contains(state) @@ -267,7 +267,7 @@ impl FastAutomaton { self.cyclic } - /// Returns `true` if the automaton has the provided state. + /// Returns `true` if the automaton contains the given state. #[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 6fa9d1b..3ee4456 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -12,7 +12,7 @@ impl FastAutomaton { Self::concat_all([self, other]) } - /// Returns a new `FastAutomaton` that is the concatenation of all automatons in the given iterator. + /// Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. 
pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 55d8c46..8257c8f 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,10 +5,10 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Determinize the automaton and returns it as a new `FastAutomaton`. - pub fn determinize(&self) -> Result { + /// Determinizes the automaton and returns the result as a new `FastAutomaton`. + pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { - return Ok(self.clone()); + return Ok(Cow::Borrowed(self)); } let execution_profile = ExecutionProfile::get(); @@ -66,7 +66,7 @@ impl FastAutomaton { new_states_to_add.clear(); } } - Ok(new_automaton) + Ok(Cow::Owned(new_automaton)) } fn simple_hash(list: &VecDeque) -> u64 { diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 6987c31..778e0e3 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -126,7 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } - // Returns `true` if the two automatons have a non-empty intersection. + // Returns `true` if the two automata have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index cc4fb76..8bbed81 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - // Returns the repetition of the automaton, between `min` and `max_opt` times. 
If `max_opt` is `None`, the repetition is unbounded. + // Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs index e4406e8..d7adeef 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/subtraction.rs @@ -43,7 +43,7 @@ impl FastAutomaton { Ok(()) } - /// Complement the automaton, the automaton needs to be deterministic. + /// Complements the automaton; it must be deterministic. pub fn complement(&mut self) -> Result<(), EngineError> { self.totalize()?; diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs index 017341b..d2dc30b 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer.rs @@ -1,24 +1,24 @@ use super::*; +use crate::tokenizer::Tokenizer; use lazy_static::lazy_static; use rand::Rng; -use serde::{de, ser, Deserializer, Serializer}; use serde::{Deserialize, Serialize}; +use serde::{Deserializer, Serializer, de, ser}; use std::env; use z85::{decode, encode}; -use crate::tokenizer::Tokenizer; use sha2::{Digest, Sha256}; use aes_gcm_siv::{ - aead::{Aead, KeyInit}, Aes256GcmSiv, Nonce, + aead::{Aead, KeyInit}, }; +use flate2::Compression; use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; -use flate2::Compression; use std::io::prelude::*; -use crate::tokenizer::token::{automaton_token::AutomatonToken, Token}; +use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; pub struct FastAutomatonReader { cipher: Aes256GcmSiv, @@ -171,7 +171,9 @@ mod tests { assert_serialization( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", ); - 
assert_serialization("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); + assert_serialization( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", + ); Ok(()) } @@ -203,9 +205,8 @@ mod tests { let automaton2 = RegularExpression::new("\\d+") .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); let subtraction = automaton1.subtraction(&automaton2).unwrap(); @@ -219,7 +220,7 @@ mod tests { assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - + Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index d177120..0d92556 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,22 +51,22 @@ impl Display for Term { } impl Term { - /// Create a term that matches the empty language. + /// Creates a term that matches the empty language. pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) } - /// Create a term that matches all possible strings. + /// Creates a term that matches all possible strings. 
pub fn new_total() -> Self { Term::RegularExpression(RegularExpression::new_total()) } - /// Create a term that only match the empty string `""`. + /// Creates a term that only matches the empty string `""`. pub fn new_empty_string() -> Self { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parse the provided pattern and return a new `Term` holding the resulting `RegularExpression`. + /// Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. /// /// # Example: /// @@ -79,18 +79,17 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Create a new `Term` holding the provided `RegularExpression`. + /// Creates a new `Term` holding the provided `RegularExpression`. pub fn from_regex(regex: RegularExpression) -> Self { Term::RegularExpression(regex) } - /// Create a new `Term` holding the provided `FastAutomaton`. + /// Creates a new `Term` holding the provided `FastAutomaton`. pub fn from_automaton(automaton: FastAutomaton) -> Self { Term::Automaton(automaton) } - /// Compute the concatenation of the current term with the given list of terms. - /// Returns the resulting term. + /// Computes the concatenation of the given terms. /// /// # Example: /// @@ -143,8 +142,7 @@ impl Term { } } - /// Compute the union of the current term with the given collection of terms. - /// Returns the resulting term. + /// Computes the union of the given terms. /// /// # Example: /// @@ -206,8 +204,7 @@ impl Term { } } - /// Compute the intersection of the current term with the given collection of terms. - /// Returns the resulting term. + /// Computes the intersection of the given terms. /// /// # Example: /// @@ -244,8 +241,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } - /// Compute the subtraction of the current term and the given `subtrahend`. - /// Returns the resulting term. + /// Computes the difference between `self` and the given subtrahend. 
/// /// # Example: /// @@ -277,8 +273,7 @@ impl Term { self.subtraction(subtrahend) } - /// Returns the repetition of the current term, - /// between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. /// /// # Example: /// @@ -311,7 +306,7 @@ impl Term { } } - /// Generate the given count of strings matched by the given term. + /// Generates `count` strings matched by the term. /// /// # Example: /// @@ -325,14 +320,9 @@ impl Term { /// assert_eq!(3, strings.len()); // ex: ["deabc", "dede", "abcde"] /// ``` pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - Ok(self - .to_automaton()? - .generate_strings(count)? - .into_iter() - .collect()) + self.to_automaton()?.generate_strings(count) } - /// Compute whether the current term and the given term are equivalent. /// Returns `true` if both terms accept the same language. /// /// # Example: @@ -355,7 +345,6 @@ impl Term { automaton_1.are_equivalent(&automaton_2) } - /// Compute whether the current term is a subset of the given term. /// Returns `true` if all strings matched by the current term are also matched by the given term. /// /// # Example: @@ -378,7 +367,7 @@ impl Term { automaton_1.is_subset_of(&automaton_2) } - /// Check if the current term matches the empty language. + /// Checks if the term matches the empty language. pub fn is_empty(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty(), @@ -386,7 +375,7 @@ impl Term { } } - /// Check if the current term matches all possible strings. + /// Checks if the term matches all possible strings. pub fn is_total(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_total(), @@ -394,7 +383,7 @@ impl Term { } } - /// Check if the current term only match the empty string `""`. 
+ /// Checks if the term matches only the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), @@ -402,7 +391,7 @@ impl Term { } } - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of matched strings. pub fn get_length(&self) -> (Option, Option) { match self { Term::RegularExpression(regex) => regex.get_length(), @@ -410,7 +399,7 @@ impl Term { } } - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + /// Returns the cardinality of the term (i.e., the number of possible matched strings). pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), @@ -430,7 +419,7 @@ impl Term { } } - /// Converts the current `Term` to a `FastAutomaton`. + /// Converts the term to a `FastAutomaton`. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -438,7 +427,7 @@ impl Term { }) } - /// Converts the current `Term` to a `RegularExpression`. Returns `None` if the automaton cannot be converted. + /// Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. pub fn to_regex(&self) -> Option> { Some(match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -446,7 +435,7 @@ impl Term { }) } - /// Converts the current `Term` to a regular expression pattern. Returns `None` if the automaton cannot be converted. + /// Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. 
pub fn to_pattern(&self) -> Option { Some(self.to_regex()?.to_string()) } @@ -458,9 +447,9 @@ impl Term { if subtrahend.is_determinitic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?)) + Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?.into_owned())) } else { - Ok(Cow::Owned(subtrahend.determinize()?)) + Ok(subtrahend.determinize()?) } } @@ -528,10 +517,7 @@ mod tests { let result = regex1.subtraction(&regex2); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); - assert_eq!( - "a+", - result - ); + assert_eq!("a+", result); Ok(()) } @@ -573,10 +559,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); - assert_eq!( - "(x{3})*", - result - ); + assert_eq!("(x{3})*", result); Ok(()) } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 2b946cb..2ee4bc5 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,7 +6,7 @@ mod affixes; mod number_of_states; impl RegularExpression { - /// Returns the minimum and maximum length of the possible matched strings. + /// Returns the minimum and maximum length of possible matched strings. pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -85,7 +85,7 @@ impl RegularExpression { } } - /// Returns the cardinality of the provided term (i.e. the number of the possible matched strings). + /// Returns the cardinality of the regular expression (i.e., the number of possible matched strings). 
pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); @@ -227,7 +227,7 @@ mod tests { let mut automaton = regex.to_automaton().unwrap(); if !automaton.is_cyclic() { - automaton = automaton.determinize().unwrap(); + automaton = automaton.determinize().unwrap().into_owned(); } //automaton.to_dot(); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 1b8f636..ae66725 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,7 +11,7 @@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and return the resulting `RegularExpression`. + /// Parses the provided pattern and returns the resulting `RegularExpression`. pub fn new(pattern: &str) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); @@ -33,7 +33,7 @@ impl RegularExpression { RE_FLAG_DETECTION.replace_all(regex, "").to_string() } - /// Create a `RegularExpression` that matches all possible strings. + /// Creates a regular expression that matches all possible strings. pub fn new_total() -> Self { RegularExpression::Repetition( Box::new(RegularExpression::Character(CharRange::total())), @@ -42,12 +42,12 @@ impl RegularExpression { ) } - /// Create a `RegularExpression` that matches the empty language. + /// Creates a regular expression that matches the empty language. pub fn new_empty() -> Self { RegularExpression::Character(CharRange::empty()) } - /// Create a`RegularExpression` that only match the empty string `""`. + /// Creates a regular expression that matches only the empty string `""`. pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 5c7ca16..ba569e8 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -90,7 +90,7 @@ impl Display for RegularExpression { } impl RegularExpression { - /// Checks if the current `RegularExpression` matches the empty language. 
+ /// Checks if the regular expression matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -99,7 +99,7 @@ impl RegularExpression { } } - /// Checks if the current `RegularExpression` matches all possible strings. + /// Checks if the regular expression only matches the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -107,7 +107,7 @@ impl RegularExpression { } } - /// Checks if the current `RegularExpression` only match the empty string `""`. + /// Checks if the regular expression matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -124,7 +124,7 @@ impl RegularExpression { } } - /// Convert the current `RegularExpression` to an equivalent `FastAutomaton`. + /// Converts the regular expression to an equivalent `FastAutomaton`. pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index ac699d8..fe83c46 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns a new `RegularExpression` representing the concatenation of `self` and `other`, using `append_back` to determine their order. + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. 
pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 7da36bb..181724f 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns the repetition of the `RegularExpression`, between `min` and `max_opt` times. If `max_opt` is `None`, the repetition is unbounded. + /// Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index 51f66b7..5156ce8 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns a simplified version of this regular expression by eliminating redundant constructs and applying canonical reductions. + /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { RegularExpression::Character(_) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 1c09e78..8e4f1f3 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,12 +3,12 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - /// Returns a new `RegularExpression` representing the union of this expression with `other`. + /// Returns a regular expression matching the union of `self` and `other`. 
pub fn union(&self, other: &RegularExpression) -> RegularExpression { Self::union_all([self, other]) } - /// Returns a `RegularExpression` formed by taking the union of all expressions in `patterns`. + /// Returns a regular expression that is the union of all expressions in `patterns`. pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 0838525..40d0fcb 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{error::EngineError, fast_automaton::condition::Condition, CharRange}; +use crate::{CharRange, error::EngineError, fast_automaton::condition::Condition}; use self::token::range_token::RangeToken; @@ -162,7 +162,8 @@ mod tests { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); + let automaton = regex.to_automaton().unwrap(); + let automaton = automaton.determinize().unwrap(); let tokenizer = Tokenizer::new(&automaton); let embedding = tokenizer.to_embedding(); diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs index fe73cab..d9e6892 100644 --- a/src/tokenizer/embed_regex.rs +++ b/src/tokenizer/embed_regex.rs @@ -270,7 +270,8 @@ mod tests { let regex = RegularExpression::new(regex).unwrap(); println!("{}", regex); - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); + let automaton = regex.to_automaton().unwrap(); + let automaton = automaton.determinize().unwrap(); //automaton.to_dot(); let tokenizer = Tokenizer::new(&automaton); From eb79826c1ac920f832c619c561d62673fde4bddd Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:12:59 +0200 Subject: [PATCH 14/62] fix bench --- 
benches/my_benchmark.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index f2f9fdc..c35164a 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,4 +1,3 @@ -use ahash::AHashSet; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; @@ -11,14 +10,14 @@ fn to_regex(automaton: &FastAutomaton) -> RegularExpression { } fn determinize(automaton: &FastAutomaton) -> FastAutomaton { - automaton.determinize().unwrap() + automaton.determinize().unwrap().into_owned() } fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> FastAutomaton { automaton_1.intersection(automaton_2).unwrap() } -fn generate_strings(automaton: &FastAutomaton) -> AHashSet { +fn generate_strings(automaton: &FastAutomaton) -> Vec { automaton.generate_strings(2000).unwrap() } From 691a9727d834bc2c103670c15b53396aaab65284 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:13:07 +0200 Subject: [PATCH 15/62] fix docs test --- src/fast_automaton/builder.rs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 8047152..f2011e4 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -76,23 +76,37 @@ impl FastAutomaton { /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. /// /// This method accepts a `Condition` rather than a raw character set. 
To build a `Condition`, call: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; + /// # let range = CharRange::total(); + /// # let spanning_set = SpanningSet::new_total(); /// Condition::from_range(&range, &spanning_set); /// ``` /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: /// /// 1. Merge an existing spanning set with another: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let old_set = SpanningSet::new_total(); + /// # let other_set = SpanningSet::new_total(); /// let new_set = SpanningSet::merge(&old_set, &other_set); /// ``` /// /// 2. Recompute from a list of ranges: - /// ```rust,ignore - /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let range_set1 = CharRange::total(); + /// # let range_set2 = CharRange::total(); + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2]); /// ``` /// /// After constructing `new_set`, apply it to the automaton: - /// ```rust,ignore + /// ```rust + /// # use regexsolver::fast_automaton::{FastAutomaton, spanning_set::SpanningSet}; + /// # let mut fast_automaton = FastAutomaton::new_total(); + /// # let new_set = SpanningSet::new_total(); /// fast_automaton.apply_new_spanning_set(&new_set); /// ``` /// From 4fe1d94be02c850117b644f17db5131e7092e390 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 3 Aug 2025 16:22:23 +0200 Subject: [PATCH 16/62] update docs --- src/lib.rs | 157 +++++++++++++++++++++-------------------------------- 1 file changed, 61 insertions(+), 96 deletions(-) diff --git 
a/src/lib.rs b/src/lib.rs index 0d92556..75051db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,60 @@ pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// +/// ```rust +/// use regexsolver::Term; +/// +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*").unwrap(); +/// let t2 = Term::from_pattern(".*xyz").unwrap(); +/// +/// // Concatenate +/// let concat = t1.concat(&[t2]).unwrap(); +/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); +/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); +/// +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}") +/// .unwrap() +/// .intersection(&[Term::from_pattern(".*xy").unwrap()]) +/// .unwrap(); // (ab|xy)xy +/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// +/// // Subtraction +/// let diff = Term::from_pattern("a*") +/// .unwrap() +/// .subtraction(&Term::from_pattern("").unwrap()) +/// .unwrap(); +/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// +/// // Repetition +/// let rep = Term::from_pattern("abc") +/// .unwrap() +/// .repeat(2, Some(4)) +/// .unwrap(); +/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// +/// // Analyze +/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()); +/// +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}") +/// .unwrap() +/// .generate_strings(5) +/// .unwrap(); +/// println!("Some matches: {:?}", samples); +/// +/// // Equivalence & subset +/// let a = Term::from_pattern("a+").unwrap(); +/// let b = Term::from_pattern("a*").unwrap(); +/// assert!(!a.are_equivalent(&b).unwrap()); +/// assert!(a.is_subset_of(&b).unwrap()); +/// ``` +/// /// To put constraint and limitation on the execution of operations please refer to 
[`execution_profile::ExecutionProfile`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] @@ -447,7 +501,12 @@ impl Term { if subtrahend.is_determinitic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?.into_owned())) + Ok(Cow::Owned( + minuend + .intersection(subtrahend)? + .determinize()? + .into_owned(), + )) } else { Ok(subtrahend.determinize()?) } @@ -494,7 +553,7 @@ impl Term { #[cfg(test)] mod tests { - use crate::{execution_profile::ExecutionProfileBuilder, regex::RegularExpression}; + use crate::regex::RegularExpression; use super::*; @@ -563,98 +622,4 @@ mod tests { Ok(()) } - - #[test] - fn test_readme_code_1() -> Result<(), String> { - // Create terms from regex - let t1 = Term::from_pattern("abc.*").unwrap(); - let t2 = Term::from_pattern(".*xyz").unwrap(); - - // Concatenate - let concat = t1.concat(&[t2]).unwrap(); - assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); - - // Union - let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); // (abc.*|fgh) - assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); - - // Intersection - let inter = Term::from_pattern("(ab|xy){2}") - .unwrap() - .intersection(&[Term::from_pattern(".*xy").unwrap()]) - .unwrap(); // (ab|xy)xy - assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); - - // Subtraction - let diff = Term::from_pattern("a*") - .unwrap() - .subtraction(&Term::from_pattern("").unwrap()) - .unwrap(); - assert_eq!(diff.to_pattern().unwrap(), "a+"); - - // Repetition - let rep = Term::from_pattern("abc") - .unwrap() - .repeat(2, Some(4)) - .unwrap(); // (abc){2,4} - assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); - - // Analyze - assert_eq!(rep.get_length(), (Some(6), Some(12))); - assert!(!rep.is_empty()); - - // Generate examples - let samples = Term::from_pattern("(x|y){1,3}") - .unwrap() - .generate_strings(5) - 
.unwrap(); - println!("Some matches: {:?}", samples); - - // Equivalence & subset - let a = Term::from_pattern("a+").unwrap(); - let b = Term::from_pattern("a*").unwrap(); - assert!(!a.are_equivalent(&b).unwrap()); - assert!(a.is_subset_of(&b).unwrap()); - - Ok(()) - } - - #[test] - fn test_readme_code_2() -> Result<(), String> { - let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); - - let execution_profile = ExecutionProfileBuilder::new() - .execution_timeout(5) // We set the limit (5ms) - .build(); - - // We run the operation with the defined limitation - execution_profile.run(|| { - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(1000).unwrap_err() - ); - }); - - Ok(()) - } - - #[test] - fn test_readme_code_3() -> Result<(), String> { - let term1 = Term::from_pattern(".*abcdef.*").unwrap(); - let term2 = Term::from_pattern(".*defabc.*").unwrap(); - - let execution_profile = ExecutionProfileBuilder::new() - .max_number_of_states(5) // We set the limit - .build(); - - // We run the operation with the defined limitation - execution_profile.run(|| { - assert_eq!( - EngineError::AutomatonHasTooManyStates, - term1.intersection(&[term2]).unwrap_err() - ); - }); - - Ok(()) - } } From a42c87a9ec1925e6d2194944fdc10cab1a0cd714 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 15:23:20 +0200 Subject: [PATCH 17/62] Update README.md --- README.md | 164 ++++++++++++++++++++++++++---------------------------- 1 file changed, 80 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index 3b1084c..4406477 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. 
Ideal for constraint solvers, code generators, test-case generators, and any use case requiring rich regex/automaton operations. +**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code or test-case generators, and any system needing rich regex or automaton operations. ## Table of Contents @@ -29,75 +29,72 @@ regexsolver = "1" ```rust use regexsolver::Term; - -// Create terms from regex -let t1 = Term::from_pattern("abc.*").unwrap(); -let t2 = Term::from_pattern(".*xyz").unwrap(); - -// Concatenate -let concat = t1.concat(&[t2]).unwrap(); -assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); - -// Union -let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); - -// Intersection -let inter = Term::from_pattern("(ab|xy){2}") - .unwrap() - .intersection(&[Term::from_pattern(".*xy").unwrap()]) - .unwrap(); // (ab|xy)xy -assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); - -// Subtraction -let diff = Term::from_pattern("a*") - .unwrap() - .subtraction(&Term::from_pattern("").unwrap()) - .unwrap(); -assert_eq!(diff.to_pattern().unwrap(), "a+"); - -// Repetition -let rep = Term::from_pattern("abc") - .unwrap() - .repeat(2, Some(4)) - .unwrap(); -assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); - -// Analyze -assert_eq!(rep.get_length(), (Some(6), Some(12))); -assert!(!rep.is_empty()); - -// Generate examples -let samples = Term::from_pattern("(x|y){1,3}") - .unwrap() - .generate_strings(5) - .unwrap(); -println!("Some matches: {:?}", samples); - -// Equivalence & subset -let a = Term::from_pattern("a+").unwrap(); -let b = Term::from_pattern("a*").unwrap(); -assert!(!a.are_equivalent(&b).unwrap()); -assert!(a.is_subset_of(&b).unwrap()); +use regexsolver::error::EngineError; + +fn main() -> Result<(), EngineError> { + // Create terms from regex + let t1 = Term::from_pattern("abc.*")?; + 
let t2 = Term::from_pattern(".*xyz")?; + + // Concatenate + let concat = t1.concat(&[t2])?; + assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); + + // Union + let union = t1.union(&[Term::from_pattern("fgh")?])?; + assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); + + // Intersection + let inter = Term::from_pattern("(ab|xy){2}")? + .intersection(&[Term::from_pattern(".*xy")?])?; + assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); + + // Difference + let diff = Term::from_pattern("a*")? + .difference(&Term::from_pattern("")?)?; + assert_eq!(diff.to_pattern().unwrap(), "a+"); + + // Repetition + let rep = Term::from_pattern("abc")? + .repeat(2, Some(4))?; + assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); + + // Analyze + assert_eq!(rep.get_length(), (Some(6), Some(12))); + assert!(!rep.is_empty()); + + // Generate examples + let samples = Term::from_pattern("(x|y){1,3}")? + .generate_strings(5)?; + println!("Some matches: {:?}", samples); + + // Equivalence & subset + let a = Term::from_pattern("a+")?; + let b = Term::from_pattern("a*")?; + assert!(!a.are_equivalent(&b)?); + assert!(a.is_subset_of(&b)?); + + Ok(()) +} ``` ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: - **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". -- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them would return an error. +- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. 
- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. -- **Line Feed and Dot:** RegexSolver handle every characters the same way. The dot character `.` matches every possible unicode characters including the line feed (`\n`). +- **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). - **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. -- **Empty Regular Expressions:** An empty regular expression is denoted by `[]`, which represents a pattern that matches no input, not even an empty string. +- **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. -RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. As a result, unsupported features supported by the parser will be parsed but ignored. This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. 
## API ### Term -`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. #### Build | Method | Return | Description | @@ -113,10 +110,9 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given terms. | -| `difference(&self, subtrahend: &Term)` | `Result` | Alias for `subtraction`. | +| `difference(&self, other: &Term)` | `Result` | Computes the difference between `self` and `other`. | | `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given terms. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `subtraction(&self, subtrahend: &Term)` | `Result` | Computes the difference between `self` and the given subtrahend. | | `union(&self, terms: &[Term])` | `Result` | Computes the union of the given terms. | #### Analyze @@ -137,7 +133,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### FastAutomaton -`FastAutomaton` is used to directly build, manipulate and analyze automata. 
To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automaton can be converted to a regular expression. +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression. When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: ```rust @@ -165,33 +161,33 @@ This design allows us to perform unions, intersections, and complements of trans #### Build | Method | Return | Description | | -------- | ------- | ------- | -| `accept(&mut self, state: State)` | | Marks the provided state as an accepting (final) state. | -| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | | Creates a new epsilon transition between the two states. | -| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | +| `accept(&mut self, state: State)` | `()` | Marks the provided state as an accepting (final) state. | +| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | `()` | Creates a new epsilon transition between the two states. | +| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | `()` | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | | `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | | `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. 
| | `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | | `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | -| `remove_state(&mut self, state: State)` | | Removes the state and all its connected transitions; panics if it's a start state. | -| `remove_states(&mut self, states: &IntSet)` | | Removes the given states and their connected transitions; panics if any is a start state. | +| `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. | +| `remove_states(&mut self, states: &IntSet)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. | #### Manipulate | Method | Return | Description | | -------- | ------- | ------- | | `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | -| `concat(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. | -| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result as a new `FastAutomaton`. | -| `intersection(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the intersection of `self` and `other`. | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. 
| -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. | +| `concat(&self, other: &FastAutomaton)` | `Result` | Computes the concatenation between `self` and `other`. | +| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | +| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | +| `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | +| `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | +| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `subtraction(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the substraction of `self` and `other`. | -| `union(&self, other: &FastAutomaton)` | `Result` | Returns a new `FastAutomaton` representing the union of `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. | +| `union(&self, other: &FastAutomaton)` | `Result` | Computes the union between `self` and `other`. 
| +| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the union of all automatons in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the union of all automatons in the given iterator. | #### Analyze | Method | Return | Description | @@ -208,14 +204,14 @@ This design allows us to perform unions, intersections, and complements of trans | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. 
| | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | | `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | @@ -232,7 +228,7 @@ This design allows us to perform unions, intersections, and complements of trans ### RegularExpression -`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert in to a `FastAutomaton` with the method `to_automaton()`. +`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`. #### Build | Method | Return | Description | @@ -259,17 +255,17 @@ This design allows us to perform unions, intersections, and complements of trans ## Bound Execution -By default, all operations run without limits. For heavy or untrusted patterns, use a thread local `ExecutionProfile` to cap execution time and maximum number of states in used automata. +Use a thread-local `ExecutionProfile` to cap runtime or state explosion; hitting a limit returns a specific `EngineError`. 
### Time-Bounded Execution ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); +let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*")?; let execution_profile = ExecutionProfileBuilder::new() - .execution_timeout(5) // We set the limit (5ms) + .execution_timeout(5) // limit in milliseconds .build(); // We run the operation with the defined limitation @@ -283,11 +279,11 @@ execution_profile.run(|| { ```rust use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_pattern(".*abcdef.*").unwrap(); -let term2 = Term::from_pattern(".*defabc.*").unwrap(); +let term1 = Term::from_pattern(".*abcdef.*")?; +let term2 = Term::from_pattern(".*defabc.*")?; let execution_profile = ExecutionProfileBuilder::new() - .max_number_of_states(5) // We set the limit + .max_number_of_states(5) // we set the limit .build(); // We run the operation with the defined limitation @@ -304,7 +300,7 @@ If you want to use this library with other programming languages, we provide a w - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) - [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). +For more information about how to use the wrappers, you can refer to our [guide](https://docs.regexsolver.com/getting-started.html). 
## License From aec5c39f78279aa22aa8cd63208aebaef8b598e9 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:05:21 +0200 Subject: [PATCH 18/62] Update naming and docs --- Cargo.toml | 10 +- src/cardinality/mod.rs | 6 +- src/error/mod.rs | 3 + src/execution_profile.rs | 2 +- src/fast_automaton/builder.rs | 6 +- .../condition/fast_bit_vec/mod.rs | 6 +- src/fast_automaton/convert/to_regex/mod.rs | 8 +- src/fast_automaton/mod.rs | 2 +- src/fast_automaton/operation/concat.rs | 4 +- src/fast_automaton/operation/determinize.rs | 4 +- .../{subtraction.rs => difference.rs} | 4 +- src/fast_automaton/operation/intersection.rs | 6 +- src/fast_automaton/operation/mod.rs | 2 +- src/fast_automaton/operation/union.rs | 6 +- src/fast_automaton/serializer.rs | 122 ++++------------- src/fast_automaton/spanning_set/mod.rs | 16 +-- src/lib.rs | 128 +++++++++--------- src/regex/analyze/number_of_states.rs | 2 +- src/regex/builder.rs | 2 +- src/regex/mod.rs | 4 +- src/tokenizer/embed_automaton.rs | 4 +- tests/integration_tests.rs | 5 - 22 files changed, 139 insertions(+), 213 deletions(-) rename src/fast_automaton/operation/{subtraction.rs => difference.rs} (92%) diff --git a/Cargo.toml b/Cargo.toml index c691486..eb32825 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,15 +6,13 @@ authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" keywords = ["automaton", "intersection", "union", "difference", "regex"] -description = "Manipulate regex and automaton as if they were sets." 
+description = "High-performance Rust library for building, combining, and analyzing regular expressions and finite automata" readme = "README.md" [dependencies] serde = { version = "1.0", features = ["derive"], optional = true } ciborium = { version = "0.2.2", optional = true } z85 = { version = "3.0.5", optional = true } -aes-gcm-siv = { version = "0.11.1", optional = true } -sha2 = { version = "0.10.8", optional = true } flate2 = { version = "1.0.30", features = [ "zlib-ng", ], default-features = false, optional = true } @@ -35,14 +33,12 @@ serde_json = "1.0.114" [features] -default = ["serde"] -serde = [ +default = [] +serializable = [ "regex-charclass/serde", "dep:serde", "dep:ciborium", "dep:z85", - "dep:aes-gcm-siv", - "dep:sha2", "dep:flate2", ] diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 08131e0..9adad1c 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,10 +1,10 @@ -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; /// Represent a number. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Cardinality { /// An infinite number. Infinite, diff --git a/src/error/mod.rs b/src/error/mod.rs index 303c225..b960147 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,5 +1,6 @@ use std::fmt::{self}; +#[cfg(feature = "serializable")] use crate::tokenizer::token::TokenError; /// An error thrown by the engine. @@ -19,6 +20,7 @@ pub enum EngineError { ConditionInvalidRange, /// The provided index is out of bound of the condition. ConditionIndexOutOfBound, + #[cfg(feature = "serializable")] /// There is an error with one of the token. 
TokenError(TokenError), /// Computing the cardinality of the provided automaton failed. @@ -37,6 +39,7 @@ impl fmt::Display for EngineError { write!(f, "The automaton has too many states.") } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), + #[cfg(feature = "serializable")] EngineError::TokenError(err) => write!(f, "{err}."), EngineError::ConditionInvalidRange => write!( f, diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 76c1b78..cda0e11 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -131,7 +131,7 @@ impl ExecutionProfile { result } - /// Like [`run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. + /// Like [`ExecutionProfile::run`], but does *not* reset its start time. Useful if you want to pass a profile state to a new thread. pub fn apply(&self, f: F) -> R where F: FnOnce() -> R, diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index f2011e4..8bcc136 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -38,7 +38,7 @@ impl FastAutomaton { automaton } - /// Creates an automaton that matches one of the characters in the given `CharRange`. + /// Creates an automaton that matches one of the characters in the given [`CharRange`]. pub fn new_from_range(range: &CharRange) -> Result { let mut automaton = Self::new_empty(); if range.is_empty() { @@ -75,7 +75,7 @@ impl FastAutomaton { /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. /// - /// This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: + /// This method accepts a [`Condition`] rather than a raw character set. 
To build a [`Condition`], call: /// ```rust /// # use regexsolver::CharRange; /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; @@ -83,7 +83,7 @@ impl FastAutomaton { /// # let spanning_set = SpanningSet::new_total(); /// Condition::from_range(&range, &spanning_set); /// ``` - /// where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: + /// where `spanning_set` is the automaton's current [`SpanningSet`]. The [`CharRange`] you pass must be fully covered by that spanning set. If it isn't, you have two options: /// /// 1. Merge an existing spanning set with another: /// ```rust diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index 82b0ead..a1b46c5 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -124,10 +124,10 @@ impl FastBitVec { } pub fn get_bits(&self) -> Vec { - let mut hot_bits = Vec::with_capacity(self.n); + let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - hot_bits.push(self.get(i).unwrap()); + bits.push(self.get(i).unwrap()); } - hot_bits + bits } } diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index e99e506..e8e7e8e 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -240,7 +240,7 @@ impl StateEliminationAutomaton { } impl FastAutomaton { - /// Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. + /// Attempts to convert the automaton to a [`RegularExpression`]; returns `None` if no equivalent pattern are found. 
pub fn to_regex(&self) -> Option { if self.is_empty() { return Some(RegularExpression::new_empty()); @@ -360,7 +360,7 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); @@ -403,7 +403,7 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); let result = result.to_regex().unwrap(); @@ -451,7 +451,7 @@ mod tests { .determinize() .unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); result.to_dot(); let result = result.to_regex().unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 30a09a9..908ed4e 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -26,7 +26,7 @@ pub mod condition; mod convert; mod generate; mod operation; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] mod serializer; pub mod spanning_set; diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 3ee4456..45654d3 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -7,12 +7,12 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the concatenation of `self` and `other`. + /// Computes the concatenation between `self` and `other`. pub fn concat(&self, other: &FastAutomaton) -> Result { Self::concat_all([self, other]) } - /// Returns a new `FastAutomaton` representing the concatenation of all automata in the given iterator. + /// Computes the concatenation of all automatons in the given iterator. 
pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 8257c8f..734f622 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,7 +5,7 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Determinizes the automaton and returns the result as a new `FastAutomaton`. + /// Determinizes the automaton and returns the result. pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { return Ok(Cow::Borrowed(self)); @@ -128,7 +128,7 @@ mod tests { assert!(deterministic_automaton.is_determinitic()); assert!( automaton - .subtraction(&deterministic_automaton) + .difference(&deterministic_automaton) .unwrap() .is_empty() ); diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/difference.rs similarity index 92% rename from src/fast_automaton/operation/subtraction.rs rename to src/fast_automaton/operation/difference.rs index d7adeef..a7b8ecf 100644 --- a/src/fast_automaton/operation/subtraction.rs +++ b/src/fast_automaton/operation/difference.rs @@ -59,8 +59,8 @@ impl FastAutomaton { Ok(()) } - /// Returns a new `FastAutomaton` representing the substraction of `self` and `other`. - pub fn subtraction(&self, other: &FastAutomaton) -> Result { + /// Computes the difference between `self` and `other`. 
+ pub fn difference(&self, other: &FastAutomaton) -> Result { let mut complement = other.clone(); match complement.complement() { Ok(()) => self.intersection(&complement), diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 778e0e3..4f42859 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -12,12 +12,12 @@ use crate::{ use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the intersection of `self` and `other`. + /// Computes the intersection between `self` and `other`. pub fn intersection(&self, other: &FastAutomaton) -> Result { FastAutomaton::intersection_all([self, other]) } - /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given iterator. + /// Computes the intersection of all automatons in the given iterator. pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); @@ -33,7 +33,7 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Returns a new `FastAutomaton` that is the intersection of all automatons in the given parallel iterator. + /// Computes in parallel the intersection of all automatons in the given iterator. 
pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index a574a0e..54bcba3 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -8,7 +8,7 @@ mod union; mod concat; mod determinize; mod intersection; -mod subtraction; +mod difference; mod repeat; impl FastAutomaton { diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index d83e7de..e71f346 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -8,12 +8,12 @@ use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - /// Returns a new `FastAutomaton` representing the union of `self` and `other`. + /// Computes the union between `self` and `other`. pub fn union(&self, other: &FastAutomaton) -> Result { Self::union_all([self, other]) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given iterator. + /// Computes the union of all automatons in the given iterator. pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result { let mut new_automaton = FastAutomaton::new_empty(); @@ -23,7 +23,7 @@ impl FastAutomaton { Ok(new_automaton) } - /// Returns a new `FastAutomaton` that is the union of all automatons in the given parallel iterator. + /// Computes in parallel the union of all automatons in the given iterator. 
pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result { let execution_profile = ExecutionProfile::get(); diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs index d2dc30b..29c41e3 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer.rs @@ -1,18 +1,10 @@ use super::*; use crate::tokenizer::Tokenizer; -use lazy_static::lazy_static; -use rand::Rng; use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer, de, ser}; -use std::env; -use z85::{decode, encode}; -use sha2::{Digest, Sha256}; +use z85::{decode, encode}; -use aes_gcm_siv::{ - Aes256GcmSiv, Nonce, - aead::{Aead, KeyInit}, -}; use flate2::Compression; use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; @@ -20,34 +12,6 @@ use std::io::prelude::*; use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; -pub struct FastAutomatonReader { - cipher: Aes256GcmSiv, -} - -impl FastAutomatonReader { - pub fn new() -> Self { - let env_var = env::var("RS_FAIR_SECRET_KEY").unwrap_or("DEFAULT PASSKEY".to_string()); - let key = Sha256::digest(env_var.as_bytes()); - FastAutomatonReader { - cipher: Aes256GcmSiv::new(&key), - } - } - - pub fn random_nonce() -> [u8; 12] { - let mut nonce = [0u8; 12]; - rand::thread_rng().fill(&mut nonce); - nonce - } -} - -lazy_static! 
{ - static ref SINGLETON_INSTANCE: FastAutomatonReader = FastAutomatonReader::new(); -} - -fn get_fast_automaton_reader() -> &'static FastAutomatonReader { - &SINGLETON_INSTANCE -} - #[derive(Serialize, Deserialize, Debug)] struct SerializedAutomaton(Vec, SpanningSet); @@ -67,22 +31,7 @@ impl serde::Serialize for FastAutomaton { return Err(ser::Error::custom(err.to_string())); } - serialized = compress_data(&serialized); - - let nonce = FastAutomatonReader::random_nonce(); - - match get_fast_automaton_reader() - .cipher - .encrypt(Nonce::from_slice(&nonce), serialized.as_ref()) - { - Ok(ciphertext) => { - let mut encrypted = Vec::from_iter(nonce); - encrypted.extend(ciphertext); - - serializer.serialize_str(&encode(&encrypted)) - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } + serializer.serialize_str(&encode(compress_data(&serialized))) } Err(err) => Err(ser::Error::custom(err.to_string())), } @@ -96,38 +45,27 @@ impl<'de> serde::Deserialize<'de> for FastAutomaton { { match String::deserialize(deserializer) { Ok(decoded) => match decode(decoded) { - Ok(encrypted) => { - let nonce = &encrypted[0..12]; - let payload = encrypted[12..].to_vec(); - let cipher_result = get_fast_automaton_reader() - .cipher - .decrypt(Nonce::from_slice(nonce), payload.as_ref()); - - match cipher_result { - Ok(cipher_result) => { - let decrypted = decompress_data(&cipher_result); - - let automaton: Result< - SerializedAutomaton, - ciborium::de::Error, - > = ciborium::from_reader(&decrypted[..]); - match automaton { - Ok(automaton) => { - let mut temp_automaton = FastAutomaton::new_empty(); - temp_automaton.spanning_set = automaton.1; - let tokenizer = Tokenizer::new(&temp_automaton); - - match tokenizer.from_embedding( - &automaton - .0 - .into_iter() - .map(AutomatonToken::from_fair_token) - .collect::>(), - ) { - Ok(res) => Ok(res), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } + Ok(compressed) => { + let payload = decompress_data(&compressed); + + let 
automaton: Result< + SerializedAutomaton, + ciborium::de::Error, + > = ciborium::from_reader(&payload[..]); + match automaton { + Ok(automaton) => { + let mut temp_automaton = FastAutomaton::new_empty(); + temp_automaton.spanning_set = automaton.1; + let tokenizer = Tokenizer::new(&temp_automaton); + + match tokenizer.from_embedding( + &automaton + .0 + .into_iter() + .map(AutomatonToken::from_fair_token) + .collect::>(), + ) { + Ok(res) => Ok(res), Err(err) => Err(de::Error::custom(err.to_string())), } } @@ -192,8 +130,8 @@ mod tests { let unserialized = unserialized.determinize().unwrap(); let automaton = automaton.determinize().unwrap(); - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); } #[test] @@ -208,18 +146,18 @@ mod tests { .unwrap(); let automaton2 = automaton2.determinize().unwrap(); - let subtraction = automaton1.subtraction(&automaton2).unwrap(); + let difference = automaton1.difference(&automaton2).unwrap(); - let serialized = serde_json::to_string(&subtraction).unwrap(); + let serialized = serde_json::to_string(&difference).unwrap(); println!("{serialized}"); let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); let unserialized = unserialized.determinize().unwrap(); - let automaton = subtraction.determinize().unwrap(); + let automaton = difference.determinize().unwrap(); - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); + assert!(automaton.difference(&unserialized).unwrap().is_empty()); + assert!(unserialized.difference(&automaton).unwrap().is_empty()); Ok(()) } diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index bfaefcb..bdb9d9c 100644 --- 
a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -2,13 +2,13 @@ use std::slice::Iter; use ahash::AHashSet; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; use crate::CharRange; /// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] pub struct SpanningSet(Vec, CharRange); @@ -91,13 +91,13 @@ impl SpanningSet { let other_set = spanning_ranges.swap_remove(index); let intersection_set = set.intersection(&other_set); new_spanning_ranges.insert(intersection_set); - let subtraction_set = set.difference(&other_set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = set.difference(&other_set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } - let subtraction_set = other_set.difference(&set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = other_set.difference(&set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } changed = true; } else if !set.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index 75051db..9dd5328 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ use nohash_hasher::NoHashHasher; use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] use serde::{Deserialize, Serialize}; use crate::execution_profile::ExecutionProfile; @@ -22,6 +22,7 @@ pub mod error; pub mod execution_profile; pub mod fast_automaton; pub mod regex; +#[cfg(feature = "serializable")] pub mod tokenizer; pub type IntMap = HashMap>>; @@ -30,68 +31,67 @@ pub type CharRange = RangeSet; /// Represents a 
term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// +/// # Example /// ```rust /// use regexsolver::Term; +/// use regexsolver::error::EngineError; /// -/// // Create terms from regex -/// let t1 = Term::from_pattern("abc.*").unwrap(); -/// let t2 = Term::from_pattern(".*xyz").unwrap(); +/// fn main() -> Result<(), EngineError> { +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*")?; +/// let t2 = Term::from_pattern(".*xyz")?; /// -/// // Concatenate -/// let concat = t1.concat(&[t2]).unwrap(); -/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// // Concatenate +/// let concat = t1.concat(&[t2])?; +/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); /// -/// // Union -/// let union = t1.union(&[Term::from_pattern("fgh").unwrap()]).unwrap(); -/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh")?])?; +/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); /// -/// // Intersection -/// let inter = Term::from_pattern("(ab|xy){2}") -/// .unwrap() -/// .intersection(&[Term::from_pattern(".*xy").unwrap()]) -/// .unwrap(); // (ab|xy)xy -/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}")? +/// .intersection(&[Term::from_pattern(".*xy")?])?; +/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); /// -/// // Subtraction -/// let diff = Term::from_pattern("a*") -/// .unwrap() -/// .subtraction(&Term::from_pattern("").unwrap()) -/// .unwrap(); -/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// // Difference +/// let diff = Term::from_pattern("a*")? 
+/// .difference(&Term::from_pattern("")?)?; +/// assert_eq!(diff.to_pattern().unwrap(), "a+"); /// -/// // Repetition -/// let rep = Term::from_pattern("abc") -/// .unwrap() -/// .repeat(2, Some(4)) -/// .unwrap(); -/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// // Repetition +/// let rep = Term::from_pattern("abc")? +/// .repeat(2, Some(4))?; +/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); /// -/// // Analyze -/// assert_eq!(rep.get_length(), (Some(6), Some(12))); -/// assert!(!rep.is_empty()); +/// // Analyze +/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()); /// -/// // Generate examples -/// let samples = Term::from_pattern("(x|y){1,3}") -/// .unwrap() -/// .generate_strings(5) -/// .unwrap(); -/// println!("Some matches: {:?}", samples); +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}")? +/// .generate_strings(5)?; +/// println!("Some matches: {:?}", samples); /// -/// // Equivalence & subset -/// let a = Term::from_pattern("a+").unwrap(); -/// let b = Term::from_pattern("a*").unwrap(); -/// assert!(!a.are_equivalent(&b).unwrap()); -/// assert!(a.is_subset_of(&b).unwrap()); +/// // Equivalence & subset +/// let a = Term::from_pattern("a+")?; +/// let b = Term::from_pattern("a*")?; +/// assert!(!a.are_equivalent(&b)?); +/// assert!(a.is_subset_of(&b)?); +/// +/// Ok(()) +/// } +/// # main(); /// ``` /// -/// To put constraint and limitation on the execution of operations please refer to [`execution_profile::ExecutionProfile`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +/// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. 
+#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Term { - #[cfg_attr(feature = "serde", serde(rename = "regex"))] + #[cfg_attr(feature = "serializable", serde(rename = "regex"))] RegularExpression(RegularExpression), - #[cfg_attr(feature = "serde", serde(rename = "fair"))] + #[cfg_attr(feature = "serializable", serde(rename = "fair"))] Automaton(FastAutomaton), } @@ -120,7 +120,7 @@ impl Term { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. + /// Parses the provided pattern and returns a new `Term` holding the resulting [`RegularExpression`]. /// /// # Example: /// @@ -133,12 +133,12 @@ impl Term { Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Creates a new `Term` holding the provided `RegularExpression`. + /// Creates a new `Term` holding the provided [`RegularExpression`]. pub fn from_regex(regex: RegularExpression) -> Self { Term::RegularExpression(regex) } - /// Creates a new `Term` holding the provided `FastAutomaton`. + /// Creates a new `Term` holding the provided [`FastAutomaton`]. pub fn from_automaton(automaton: FastAutomaton) -> Self { Term::Automaton(automaton) } @@ -295,7 +295,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } - /// Computes the difference between `self` and the given subtrahend. + /// Computes the difference between `self` and `other`. 
/// /// # Example: /// @@ -305,28 +305,22 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("de").unwrap(); /// - /// let subtraction = term1.subtraction(&term2).unwrap(); + /// let difference = term1.difference(&term2).unwrap(); /// - /// if let Term::RegularExpression(regex) = subtraction { + /// if let Term::RegularExpression(regex) = difference { /// assert_eq!("abc", regex.to_string()); /// } /// ``` - pub fn subtraction(&self, subtrahend: &Term) -> Result { + pub fn difference(&self, other: &Term) -> Result { let minuend_automaton = self.to_automaton()?; - let subtrahend_automaton = subtrahend.to_automaton()?; + let subtrahend_automaton = other.to_automaton()?; let subtrahend_automaton = Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; - let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; + let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; Ok(Term::Automaton(return_automaton)) } - /// See [`Self::subtraction`]. - #[inline] - pub fn difference(&self, subtrahend: &Term) -> Result { - self.subtraction(subtrahend) - } - /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. /// /// # Example: @@ -473,7 +467,7 @@ impl Term { } } - /// Converts the term to a `FastAutomaton`. + /// Converts the term to a [`FastAutomaton`]. 
pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -569,11 +563,11 @@ mod tests { } #[test] - fn test_subtraction_1() -> Result<(), String> { + fn test_difference_1() -> Result<(), String> { let regex1 = Term::from_pattern("a*").unwrap(); let regex2 = Term::from_pattern("").unwrap(); - let result = regex1.subtraction(&regex2); + let result = regex1.difference(&regex2); assert!(result.is_ok()); let result = result.unwrap().to_pattern().unwrap(); assert_eq!("a+", result); @@ -582,11 +576,11 @@ mod tests { } #[test] - fn test_subtraction_2() -> Result<(), String> { + fn test_difference_2() -> Result<(), String> { let regex1 = Term::from_pattern("x*").unwrap(); let regex2 = Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.subtraction(&regex2); + let result = regex1.difference(&regex2); assert!(result.is_ok()); let result = result.unwrap().to_regex().unwrap().into_owned(); assert_eq!( diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index e7460f8..8325456 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -162,7 +162,7 @@ impl AbstractNFAMetadata { } impl RegularExpression { - pub fn get_number_of_states_in_nfa(&self) -> usize { + pub(crate) fn get_number_of_states_in_nfa(&self) -> usize { self.evaluate_number_of_states_in_nfa().number_of_states } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index ae66725..799c69d 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,7 +11,7 @@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and returns the resulting `RegularExpression`. + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. 
pub fn new(pattern: &str) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index ba569e8..05908c0 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -11,7 +11,7 @@ use super::*; mod analyze; mod builder; mod operation; -#[cfg(feature = "serde")] +#[cfg(feature = "serializable")] mod serializer; /// Represent a regular expression. @@ -124,7 +124,7 @@ impl RegularExpression { } } - /// Converts the regular expression to an equivalent `FastAutomaton`. + /// Converts the regular expression to an equivalent [`FastAutomaton`]. pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs index 40d0fcb..49733a0 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/tokenizer/embed_automaton.rs @@ -179,13 +179,13 @@ mod tests { assert!( automaton - .subtraction(&unembedded_automaton) + .difference(&unembedded_automaton) .unwrap() .is_empty() ); assert!( unembedded_automaton - .subtraction(&automaton) + .difference(&automaton) .unwrap() .is_empty() ); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index d1dd407..4dd7e47 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -16,11 +16,6 @@ fn assert_regex(regex: &str) { assert!(re.is_match(&string), "'{string}'"); } - assert_eq!( - automaton.get_number_of_states(), - regex.get_number_of_states_in_nfa() - ); - let determinized_automaton = automaton.determinize().unwrap(); let strings = determinized_automaton.generate_strings(500).unwrap(); for string in strings { From 19aef3f59013bf405b4a6c0d30e194a6182ca5e3 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:13:16 +0200 Subject: [PATCH 19/62] improve test --- src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9dd5328..c81d84d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -552,12 +552,13 @@ mod tests { use super::*; #[test] - fn test_details() -> Result<(), String> { + fn test_intersection() -> Result<(), String> { let regex1 = Term::from_pattern("a").unwrap(); let regex2 = Term::from_pattern("b").unwrap(); - let details = regex1.intersection(&vec![regex2]); - assert!(details.is_ok()); + let intersection = regex1.intersection(&vec![regex2]).unwrap(); + assert!(intersection.is_empty()); + assert_eq!("[]", intersection.to_pattern().unwrap()); Ok(()) } From 6878356d84714a0e3cd46d1f8f0d554798f896f5 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 6 Aug 2025 21:45:12 +0200 Subject: [PATCH 20/62] Fix bad repetition case --- src/fast_automaton/operation/repeat.rs | 27 +++++++++++-- tests/data/regex-todo.txt | 5 +++ tests/data/regex.txt | 53 +++++++++++++++++++++++++- 3 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 tests/data/regex-todo.txt diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 8bbed81..b49207b 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -27,9 +27,7 @@ impl FastAutomaton { self.accept(new_state); } - for to_state in self.direct_states_vec(&self.start_state) { - self.add_epsilon_transition(new_state, to_state); - } + self.add_epsilon_transition(new_state, self.start_state); self.start_state = new_state; if max_opt.is_none() { @@ -63,7 +61,8 @@ impl FastAutomaton { && automaton_to_repeat.state_out_degree(accept_state) == 0 && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 { - automaton_to_repeat.add_epsilon_transition(accept_state, automaton_to_repeat.start_state); + automaton_to_repeat + .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); let old_start_state = 
automaton_to_repeat.start_state; automaton_to_repeat.start_state = accept_state; automaton_to_repeat.remove_state(old_start_state); @@ -106,3 +105,23 @@ impl FastAutomaton { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_repeat_1() -> Result<(), String> { + let automaton = RegularExpression::new("(a*,a*)?") + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.match_string("")); + assert!(automaton.match_string(",")); + assert!(automaton.match_string("aaa,")); + assert!(automaton.match_string("aaaa,aa")); + assert!(!automaton.match_string("a")); + assert!(!automaton.match_string("aa")); + Ok(()) + } +} diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt new file mode 100644 index 0000000..1d97d13 --- /dev/null +++ b/tests/data/regex-todo.txt @@ -0,0 +1,5 @@ +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/data/regex.txt b/tests/data/regex.txt index e5fb5df..3eebe62 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -1,3 +1,5 @@ +(a*,a*)? +(?:\s*,\s*(?:0|1|0?\.\d+))? [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f] a{2,3} (abc|fg){2} @@ -19,4 +21,53 @@ a+(ba+)* [0-9]+[A-Z]* ù -^\d$ \ No newline at end of file +^\d$ +foo +bar? 
+baz+ +qux* +quux{3} +quuux{2,5} +quuuux{0,4} +.* +[aeiou] +[^aeiou] +[a-zA-Z0-9] +[\dA-Fa-f] +[\w&&[^_]] +[[:alpha:]]+ +[\p{L}]+ +[0-9]{2,4} +[01]?\d +[1-9][0-9]* +(cat|dog|mouse) +(?:red|green|blue){2} +(gr(a|e)y){1,3} +((ab|cd)ef)+ +(a(b(c|d)e)f)+ +(a|b(c|d(e|f))){2,3} +(?:abc){0,} +(?:abc){1,} +(?:abc){2,5} +a++ +\.\*\?\+\(\)\[\]\{\}\\\| +\u0041\u0042\u0043 +\p{Greek}+ +\p{Sc} +[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,} +\b((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d)\b +https?://[^\s/$.?#][^\s]* +\d{4}/\d{2}/\d{2} +\d{1,2}:\d{2}(:\d{2})? +<([A-Za-z][A-Za-z0-9]*)\b[^>]*?/> +\{(?:[^{}]|\{[^{}]*\})*\} +\b(?:\d[ -]*?){13,16}\b +#([A-Fa-f0-9]{8}) +(a|b|c|d|e|f|g|h|i|j){5} +(?:"[^"]*"|[^,]*)(?:,(?:"[^"]*"|[^,]*))* +\b([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b +\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b +[[:alnum:]&&[^0-9]] +[ \t]+ +[\r\n]+ +[^\t\r\n]+ \ No newline at end of file From 29697f8fab7a9d59578e8d9bcf300a2c8ba56725 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 8 Aug 2025 22:59:57 +0200 Subject: [PATCH 21/62] fix algo repeat --- src/regex/operation/mod.rs | 181 +---------------------- src/regex/operation/repeat.rs | 268 ++++++++++++++++++++++++++++------ tests/data/regex-todo.txt | 1 + 3 files changed, 227 insertions(+), 223 deletions(-) diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index f572238..7364d65 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -3,183 +3,4 @@ use super::*; mod concat; mod repeat; mod simplify; -mod union; - -#[cfg(test)] -mod tests { - - use regex_charclass::char::Char; - - use crate::{regex::RegularExpression, CharRange}; - - #[test] - fn test_parse_and_simplify() -> Result<(), String> { - assert_parse_and_simplify("(xxx)*", "(x{3})*"); - assert_parse_and_simplify("(x*){3}", "x*"); - assert_parse_and_simplify("(x+)?", "x*"); - assert_parse_and_simplify("(x?)+", 
"x*"); - assert_parse_and_simplify("(x{0,3})+", "x*"); - assert_parse_and_simplify("(x{2,3})+", "x{2,}"); - assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); - assert_parse_and_simplify("(x+)*", "x*"); - assert_parse_and_simplify(".*abc", ".*abc"); - assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); - assert_parse_and_simplify( - "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", - "a(bc(dg|fe)|mkv)*(abc){4,5}", - ); - assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); - assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); - assert_parse_and_simplify("(a|b)", "[ab]"); - assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); - assert_parse_and_simplify("(ab|ab)", "ab"); - assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); - assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); - assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); - assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); - assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", - "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", - ); - - assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); - Ok(()) - } - - fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { - let regex_parsed = RegularExpression::new(regex).unwrap(); - assert_eq!(regex_simplified, regex_parsed.to_string()); - } - - #[test] - fn test_repeat_simplify() -> Result<(), String> { - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 3, - Some(3), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 2, - Some(4), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(3), - 0, - None, - ); - - assert_repeat_simplify( - 
&CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 1, - Some(2), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(4), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 7, - Some(8), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - None, - 3, - Some(3), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 1, - None, - 0, - Some(1), - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(1), - 1, - None, - ); - - assert_repeat_simplify( - &CharRange::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(4), - 2, - Some(4), - ); - - Ok(()) - } - - fn assert_repeat_simplify( - range: &CharRange, - min1: u32, - max1: Option, - min2: u32, - max2: Option, - ) { - let repeat = RegularExpression::Repetition( - Box::new(RegularExpression::Repetition( - Box::new(RegularExpression::Character(range.clone())), - min1, - max1, - )), - min2, - max2, - ); - - let got = RegularExpression::new(&repeat.to_string()).unwrap(); - - println!("{} -> {}", repeat, got); - - let repeat = repeat.to_automaton().unwrap(); - - //repeat.to_dot(); - - let result = got.to_automaton().unwrap(); - - assert!(repeat.are_equivalent(&result).unwrap()); - } -} +mod union; \ No newline at end of file diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 181724f..6503f75 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -2,67 +2,249 @@ use super::*; impl RegularExpression { /// Returns the 
repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. - pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + pub fn repeat(&self, o_min: u32, o_max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); } else if self.is_empty() { return RegularExpression::new_empty(); } else if self.is_empty_string() { return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { + } else if let Some(max) = o_max_opt { + if max < o_min || max == 0 { return RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { + } else if o_min == 1 && max == 1 { return self.clone(); } } match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) + RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { + let new_max = if let (Some(o_max), Some(i_max)) = (o_max_opt, i_max_opt) { + Some(o_max * i_max) } else { None }; - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - 
RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, o_min, o_max_opt) { + RegularExpression::Repetition( + regular_expression.clone(), + o_min * i_min, + new_max, + ) } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt) } } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + _ => RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt), } } -} \ No newline at end of file + + fn can_simplify_nested_repetition( + i_min: u32, + i_max_opt: Option, + o_min: u32, + o_max_opt: Option, + ) -> bool { + if let Some(o_max) = o_max_opt { + if o_min == o_max { + return true; + } + } + + if let Some(i_max) = i_max_opt { + // We check if there is any gap by resolving: + // o_min * i_max >= (o_min + 1) * i_min - 1 + // <=> o_min * (i_max - i_min) >= i_min - 1 + o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else { + if o_min > 0 { true } else { i_min <= 1 } + } + } +} + +#[cfg(test)] +mod tests { + + use regex_charclass::char::Char; + + use crate::{CharRange, regex::RegularExpression}; + + #[test] + fn test_parse_and_simplify() -> Result<(), String> { + assert_parse_and_simplify("(xxx)*", "(x{3})*"); + assert_parse_and_simplify("(x*){3}", "x*"); + assert_parse_and_simplify("(x+)?", "x*"); + assert_parse_and_simplify("(x?)+", "x*"); + assert_parse_and_simplify("(x{0,3})+", "x*"); + assert_parse_and_simplify("(x{2,3})+", "x{2,}"); + assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); + assert_parse_and_simplify("(x+)*", "x*"); + assert_parse_and_simplify(".*abc", ".*abc"); + assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); + assert_parse_and_simplify( + "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", + "a(bc(dg|fe)|mkv)*(abc){4,5}", + ); + assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); + 
assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); + assert_parse_and_simplify("(a|b)", "[ab]"); + assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); + assert_parse_and_simplify("(ab|ab)", "ab"); + assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); + assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); + assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); + assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); + assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); + Ok(()) + } + + fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { + let regex_parsed = RegularExpression::new(regex).unwrap(); + assert_eq!(regex_simplified, regex_parsed.to_string()); + } + + #[test] + fn test_repeat_simplify() -> Result<(), String> { + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(3), + 0, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + Some(2), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(4), + 1, + None, + ); + + assert_repeat_simplify( + 
&CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 7, + Some(8), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + None, + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + None, + 0, + Some(1), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(1), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 2, + Some(2), + ); + + Ok(()) + } + + fn assert_repeat_simplify( + range: &CharRange, + min1: u32, + max1: Option, + min2: u32, + max2: Option, + ) { + let repeat = RegularExpression::Repetition( + Box::new(RegularExpression::Repetition( + Box::new(RegularExpression::Character(range.clone())), + min1, + max1, + )), + min2, + max2, + ); + + let got = RegularExpression::new(&repeat.to_string()).unwrap(); + + println!("{} -> {}", repeat, got); + + let repeat = repeat.to_automaton().unwrap(); + + //repeat.to_dot(); + + let result = got.to_automaton().unwrap(); + + assert!(repeat.are_equivalent(&result).unwrap()); + } +} diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt index 1d97d13..05849d6 100644 --- a/tests/data/regex-todo.txt +++ b/tests/data/regex-todo.txt @@ -1,3 +1,4 @@ +(a*,a*)* #([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) \{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) From e24624e42900fcd7b698d3bd8af6a0dc0ad272d4 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:24:02 +0200 Subject: [PATCH 22/62] update serialization --- src/error/mod.rs | 2 +- src/fast_automaton/mod.rs | 2 +- 
.../{serializer.rs => serializer/mod.rs} | 31 +- .../serializer}/tokenizer/embed_automaton.rs | 21 +- .../serializer}/tokenizer/mod.rs | 11 +- .../serializer}/tokenizer/range_tokenizer.rs | 0 .../tokenizer/token/automaton_token.rs | 94 ++++++ .../serializer}/tokenizer/token/mod.rs | 20 +- .../serializer/tokenizer/token/range_token.rs | 55 ++++ src/lib.rs | 2 - src/regex/operation/repeat.rs | 5 +- src/tokenizer/embed_regex.rs | 295 ------------------ src/tokenizer/token/automaton_token.rs | 72 ----- src/tokenizer/token/range_token.rs | 58 ---- src/tokenizer/token/regex_token.rs | 84 ----- 15 files changed, 196 insertions(+), 556 deletions(-) rename src/fast_automaton/{serializer.rs => serializer/mod.rs} (82%) rename src/{ => fast_automaton/serializer}/tokenizer/embed_automaton.rs (90%) rename src/{ => fast_automaton/serializer}/tokenizer/mod.rs (90%) rename src/{ => fast_automaton/serializer}/tokenizer/range_tokenizer.rs (100%) create mode 100644 src/fast_automaton/serializer/tokenizer/token/automaton_token.rs rename src/{ => fast_automaton/serializer}/tokenizer/token/mod.rs (62%) create mode 100644 src/fast_automaton/serializer/tokenizer/token/range_token.rs delete mode 100644 src/tokenizer/embed_regex.rs delete mode 100644 src/tokenizer/token/automaton_token.rs delete mode 100644 src/tokenizer/token/range_token.rs delete mode 100644 src/tokenizer/token/regex_token.rs diff --git a/src/error/mod.rs b/src/error/mod.rs index b960147..448dfb9 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,7 +1,7 @@ use std::fmt::{self}; #[cfg(feature = "serializable")] -use crate::tokenizer::token::TokenError; +use crate::fast_automaton::serializer::tokenizer::token::TokenError; /// An error thrown by the engine. 
#[derive(Debug, PartialEq, Eq)] diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 908ed4e..cfa4f68 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -27,7 +27,7 @@ mod convert; mod generate; mod operation; #[cfg(feature = "serializable")] -mod serializer; +pub mod serializer; pub mod spanning_set; /// Represent a finite state automaton. diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer/mod.rs similarity index 82% rename from src/fast_automaton/serializer.rs rename to src/fast_automaton/serializer/mod.rs index 29c41e3..aa06df0 100644 --- a/src/fast_automaton/serializer.rs +++ b/src/fast_automaton/serializer/mod.rs @@ -1,5 +1,7 @@ +use crate::fast_automaton::serializer::tokenizer::token::automaton_token::AutomatonToken; +use crate::fast_automaton::serializer::tokenizer::Tokenizer; + use super::*; -use crate::tokenizer::Tokenizer; use serde::{Deserialize, Serialize}; use serde::{Deserializer, Serializer, de, ser}; @@ -10,10 +12,11 @@ use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; use std::io::prelude::*; -use crate::tokenizer::token::{Token, automaton_token::AutomatonToken}; +#[cfg(feature = "serializable")] +pub mod tokenizer; #[derive(Serialize, Deserialize, Debug)] -struct SerializedAutomaton(Vec, SpanningSet); +struct SerializedAutomaton(Vec, SpanningSet, usize); impl serde::Serialize for FastAutomaton { fn serialize(&self, serializer: S) -> Result @@ -21,12 +24,17 @@ impl serde::Serialize for FastAutomaton { S: Serializer, { let tokenizer = Tokenizer::new(self); - match AutomatonToken::to_fair_tokens(&tokenizer.to_embedding()) { + let number_of_states = self.get_number_of_states(); + match AutomatonToken::to_tokens( + &tokenizer.to_embedding(), + self.get_spanning_set().get_number_of_spanning_ranges(), + number_of_states, + ) { Ok(tokens) => { let serialized_automaton = - SerializedAutomaton(tokens, self.get_spanning_set().clone()); + SerializedAutomaton(tokens, 
self.get_spanning_set().clone(), number_of_states); - let mut serialized = Vec::with_capacity(self.get_number_of_states() * 8); + let mut serialized = Vec::with_capacity(number_of_states * 8); if let Err(err) = ciborium::into_writer(&serialized_automaton, &mut serialized) { return Err(ser::Error::custom(err.to_string())); } @@ -56,13 +64,22 @@ impl<'de> serde::Deserialize<'de> for FastAutomaton { Ok(automaton) => { let mut temp_automaton = FastAutomaton::new_empty(); temp_automaton.spanning_set = automaton.1; + let number_of_states = automaton.2; + let number_of_bases = + temp_automaton.spanning_set.get_number_of_spanning_ranges(); let tokenizer = Tokenizer::new(&temp_automaton); match tokenizer.from_embedding( &automaton .0 .into_iter() - .map(AutomatonToken::from_fair_token) + .map(|t| { + AutomatonToken::from_token( + t, + number_of_bases, + number_of_states, + ) + }) .collect::>(), ) { Ok(res) => Ok(res), diff --git a/src/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs similarity index 90% rename from src/tokenizer/embed_automaton.rs rename to src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 49733a0..825ea7e 100644 --- a/src/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -1,6 +1,6 @@ use token::TokenError; -use crate::{CharRange, error::EngineError, fast_automaton::condition::Condition}; +use crate::{error::EngineError, fast_automaton::{condition::Condition, serializer::tokenizer::token::automaton_token::AutomatonToken}, CharRange}; use self::token::range_token::RangeToken; @@ -77,7 +77,7 @@ impl Tokenizer<'_> { range = range.union(self.range_tokenizer.token_to_range(r).unwrap()); } AutomatonToken::State(s) => { - while !automaton.has_state((*s).into()) { + while !automaton.has_state(*s) { automaton.new_state(); } if let Some(fs) = from_state { @@ -85,9 +85,9 @@ impl Tokenizer<'_> { Self::apply_transition(&mut automaton, fs, ts, &range)?; range = 
CharRange::empty(); } - to_state = Some((*s).into()); + to_state = Some(*s); } else { - from_state = Some((*s).into()); + from_state = Some(*s); } } AutomatonToken::AcceptState => { @@ -129,8 +129,6 @@ impl Tokenizer<'_> { #[cfg(test)] mod tests { - use embed_automaton::token::Token; - use crate::regex::RegularExpression; use super::*; @@ -168,11 +166,14 @@ mod tests { let tokenizer = Tokenizer::new(&automaton); let embedding = tokenizer.to_embedding(); - // FAIR - let embedding_u16 = AutomatonToken::to_fair_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u16 + let number_of_bases = automaton.get_spanning_set().get_number_of_spanning_ranges(); + let number_of_states = automaton.get_number_of_states(); + + let embedding_usize = + AutomatonToken::to_tokens(&embedding, number_of_bases, number_of_states).unwrap(); + let embedding: Vec = embedding_usize .iter() - .map(|&t| AutomatonToken::from_fair_token(t)) + .map(|&t| AutomatonToken::from_token(t, number_of_bases, number_of_states)) .collect(); let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); diff --git a/src/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs similarity index 90% rename from src/tokenizer/mod.rs rename to src/fast_automaton/serializer/tokenizer/mod.rs index 7c83c1a..95ccb18 100644 --- a/src/tokenizer/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/mod.rs @@ -1,16 +1,15 @@ use std::{cmp::Ordering, collections::VecDeque, vec}; -use ahash::HashMapExt; +use crate::fast_automaton::serializer::tokenizer::range_tokenizer::RangeTokenizer; use crate::fast_automaton::spanning_set::SpanningSet; use crate::{ - fast_automaton::{FastAutomaton, State}, IntMap, IntSet, + fast_automaton::{FastAutomaton, State}, }; +use ahash::HashMapExt; -use self::{range_tokenizer::RangeTokenizer, token::automaton_token::AutomatonToken}; mod embed_automaton; -mod embed_regex; pub mod range_tokenizer; pub mod token; @@ -18,7 +17,7 @@ pub mod token; pub struct Tokenizer<'a> { 
range_tokenizer: RangeTokenizer<'a>, automaton: &'a FastAutomaton, - state_to_token: IntMap, + state_to_token: IntMap, } impl Tokenizer<'_> { @@ -28,7 +27,7 @@ impl Tokenizer<'_> { worklist.push_front(automaton.get_start_state()); - let mut state_counter: u16 = 0; + let mut state_counter = 0; let mut state_to_token = IntMap::with_capacity(automaton.get_number_of_states()); while let Some(current_state) = worklist.pop_back() { diff --git a/src/tokenizer/range_tokenizer.rs b/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs similarity index 100% rename from src/tokenizer/range_tokenizer.rs rename to src/fast_automaton/serializer/tokenizer/range_tokenizer.rs diff --git a/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs new file mode 100644 index 0000000..2e68ded --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs @@ -0,0 +1,94 @@ +use self::range_token::RangeToken; + +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum AutomatonToken { + Range(RangeToken), + State(usize), + AcceptState, + SeparatorState, + Error, +} + +impl Ord for AutomatonToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (AutomatonToken::Range(a), AutomatonToken::Range(b)) => a.cmp(b), + (AutomatonToken::Range(_), _) => Ordering::Less, + (_, AutomatonToken::Range(_)) => Ordering::Greater, + + (AutomatonToken::State(a), AutomatonToken::State(b)) => a.cmp(b), + (AutomatonToken::State(_), _) => Ordering::Less, + (_, AutomatonToken::State(_)) => Ordering::Greater, + + (AutomatonToken::AcceptState, AutomatonToken::AcceptState) => Ordering::Equal, + (AutomatonToken::AcceptState, _) => Ordering::Less, + (_, AutomatonToken::AcceptState) => Ordering::Greater, + + (AutomatonToken::SeparatorState, AutomatonToken::SeparatorState) => Ordering::Equal, + (AutomatonToken::SeparatorState, _) => Ordering::Less, + (_, 
AutomatonToken::SeparatorState) => Ordering::Greater, + + (AutomatonToken::Error, AutomatonToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for AutomatonToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl AutomatonToken { + pub fn from_token( + token: usize, + number_of_bases: usize, + number_of_states: usize, + ) -> AutomatonToken { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + if (0..states).contains(&token) { + AutomatonToken::Range(RangeToken::from_token(token, number_of_bases)) + } else if (states..accept_state).contains(&token) { + AutomatonToken::State(token - states) + } else if token == accept_state { + AutomatonToken::AcceptState + } else if token == separator_state { + AutomatonToken::SeparatorState + } else { + AutomatonToken::Error + } + } + + pub fn to_token( + &self, + number_of_bases: usize, + number_of_states: usize, + ) -> Result { + let states = number_of_bases + 1; + let accept_state = states + number_of_states; + let separator_state = accept_state + 1; + Ok(match self { + AutomatonToken::Range(r) => r.to_token(number_of_bases)?, + AutomatonToken::State(s) => s + states, + AutomatonToken::AcceptState => accept_state, + AutomatonToken::SeparatorState => separator_state, + AutomatonToken::Error => return Err(TokenError::UnknownToken), + }) + } + + pub fn to_tokens( + tokens: &[Self], + number_of_bases: usize, + number_of_states: usize, + ) -> Result, TokenError> { + let mut vec = Vec::with_capacity(tokens.len()); + for token in tokens { + vec.push(token.to_token(number_of_bases, number_of_states)?); + } + Ok(vec) + } +} diff --git a/src/tokenizer/token/mod.rs b/src/fast_automaton/serializer/tokenizer/token/mod.rs similarity index 62% rename from src/tokenizer/token/mod.rs rename to src/fast_automaton/serializer/tokenizer/token/mod.rs index 4342be8..c510dd4 100644 --- a/src/tokenizer/token/mod.rs +++ 
b/src/fast_automaton/serializer/tokenizer/token/mod.rs @@ -4,7 +4,6 @@ use super::*; pub mod automaton_token; pub mod range_token; -pub mod regex_token; #[derive(Debug, PartialEq, Eq)] pub enum TokenError { @@ -24,21 +23,4 @@ impl Display for TokenError { TokenError::SyntaxError => write!(f, "SyntaxError"), } } -} - -pub trait Token { - fn from_fair_token(token: u16) -> Self; - - fn to_fair_token(&self) -> Result; - - fn to_fair_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_fair_token()?); - } - Ok(vec) - } -} +} \ No newline at end of file diff --git a/src/fast_automaton/serializer/tokenizer/token/range_token.rs b/src/fast_automaton/serializer/tokenizer/token/range_token.rs new file mode 100644 index 0000000..20ed515 --- /dev/null +++ b/src/fast_automaton/serializer/tokenizer/token/range_token.rs @@ -0,0 +1,55 @@ +use super::*; + +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum RangeToken { + Total, + Base(usize), + Error, +} + +impl Ord for RangeToken { + fn cmp(&self, other: &Self) -> Ordering { + match (self, other) { + (RangeToken::Total, RangeToken::Total) => Ordering::Equal, + (RangeToken::Total, _) => Ordering::Less, + (_, RangeToken::Total) => Ordering::Greater, + (RangeToken::Base(a), RangeToken::Base(b)) => a.cmp(b), + (RangeToken::Base(_), RangeToken::Error) => Ordering::Less, + (RangeToken::Error, RangeToken::Base(_)) => Ordering::Greater, + (RangeToken::Error, RangeToken::Error) => Ordering::Equal, + } + } +} + +impl PartialOrd for RangeToken { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl RangeToken { + pub fn from_token(token: usize, number_of_bases: usize) -> RangeToken { + let max_number_of_bases = number_of_bases + 1; + if token == 0 { + RangeToken::Total + } else if (1..max_number_of_bases).contains(&token) { + RangeToken::Base(token - 1) + } else { + RangeToken::Error + } + } 
+ + pub fn to_token(&self, number_of_bases: usize) -> Result { + let max_number_of_bases = number_of_bases + 1; + Ok(match self { + RangeToken::Total => 0, + RangeToken::Base(b) => { + if *b > max_number_of_bases { + return Err(TokenError::TokenOutOfBound("Base", max_number_of_bases, *b)); + } + b + 1 + } + RangeToken::Error => return Err(TokenError::UnknownToken), + }) + } +} diff --git a/src/lib.rs b/src/lib.rs index c81d84d..e4119e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,8 +22,6 @@ pub mod error; pub mod execution_profile; pub mod fast_automaton; pub mod regex; -#[cfg(feature = "serializable")] -pub mod tokenizer; pub type IntMap = HashMap>>; pub type IntSet = HashSet>>; diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 6503f75..86ddbe5 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -39,6 +39,7 @@ impl RegularExpression { } } + /// Evaluate if the repetition `(r{i_min,i_max_opt}){o_min,o_max_opt}` can be simplified to `r{i_min*o_min,i_max_opt*o_max_opt}`. 
fn can_simplify_nested_repetition( i_min: u32, i_max_opt: Option, @@ -56,8 +57,10 @@ impl RegularExpression { // o_min * i_max >= (o_min + 1) * i_min - 1 // <=> o_min * (i_max - i_min) >= i_min - 1 o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else if o_min > 0 { + true } else { - if o_min > 0 { true } else { i_min <= 1 } + i_min <= 1 } } } diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs deleted file mode 100644 index d9e6892..0000000 --- a/src/tokenizer/embed_regex.rs +++ /dev/null @@ -1,295 +0,0 @@ -use token::TokenError; - -use crate::{regex::RegularExpression, CharRange}; - -use self::token::regex_token::RegexToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_embedding(&self, regex: &RegularExpression) -> Vec { - let mut vec = self.to_regex_embedding_vec(regex); - - Self::append_counter_if_necessary(&mut vec); - - vec - } - - fn append_counter_if_necessary(vec: &mut Vec) { - if let Some(last) = vec.last() { - match last { - RegexToken::RepetitionNone => {} - RegexToken::Repetition(_) => {} - RegexToken::EndGroup => {} - RegexToken::StartGroup => {} - RegexToken::Alternation => {} - RegexToken::Error => todo!(), - _ => { - vec.push(RegexToken::Repetition(1)); - } - }; - } - } - - fn to_regex_embedding_vec(&self, regex: &RegularExpression) -> Vec { - let mut vec = vec![]; - - match regex { - RegularExpression::Character(range) => { - self.range_tokenizer - .range_to_embedding(range) - .unwrap() - .into_iter() - .for_each(|t| vec.push(RegexToken::Range(t))); - } - RegularExpression::Repetition(regex, min, max_opt) => { - if matches!( - **regex, - RegularExpression::Repetition(_, _, _) | RegularExpression::Concat(_) - ) { - vec.push(RegexToken::StartGroup); - vec.extend(self.to_regex_embedding_vec(regex)); - vec.push(RegexToken::EndGroup); - } else { - vec.extend(self.to_regex_embedding_vec(regex)); - } - - vec.push(RegexToken::Repetition(*min as u16)); - - if let Some(max) = max_opt { - if 
max != min { - vec.push(RegexToken::Repetition(*max as u16)); - } - } else { - vec.push(RegexToken::RepetitionNone); - } - } - RegularExpression::Concat(elements) => { - for element in elements { - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - } - } - RegularExpression::Alternation(elements) => { - vec.push(RegexToken::StartGroup); - - for i in 0..elements.len() { - let element = &elements[i]; - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - if i < elements.len() - 1 { - vec.push(RegexToken::Alternation); - } - } - - vec.push(RegexToken::EndGroup); - } - } - - vec - } - - pub fn from_regex_embedding( - &self, - vec: &[RegexToken], - ) -> Result { - let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; - let mut current_min = None; - for i in 0..vec.len() { - let token = vec[i]; - let current_group = regex_groups.len() - 1; - match token { - RegexToken::Range(range_token) => { - let range = self.range_tokenizer.token_to_range(&range_token).unwrap(); - if let Some(curr_range) = ¤t_range { - current_range = Some(curr_range.union(range)); - } else { - current_range = Some(range.clone()); - } - } - RegexToken::StartGroup => { - regex_groups.push((RegularExpression::new_empty_string(), false)); - } - RegexToken::EndGroup => { - if current_group == 0 { - return Err(TokenError::SyntaxError); - } - if i == vec.len() - 1 || !matches!(vec[i + 1], RegexToken::Repetition(_)) { - let alternation: bool = regex_groups[current_group].1; - Self::pop_regex_group(&mut regex_groups, &None, &None); - if alternation { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - } - } - RegexToken::Alternation => { - if regex_groups[current_group].1 { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - regex_groups.push((RegularExpression::new_empty_string(), true)); - } - RegexToken::RepetitionNone => { - if 
current_min.is_some() { - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - ¤t_min, - &None, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, ¤t_min, &None); - } - current_min = None; - } else { - return Err(TokenError::SyntaxError); - } - } - RegexToken::Repetition(count) => { - if current_min.is_some() - || i == vec.len() - 1 - || !matches!(vec[i + 1], RegexToken::Repetition(_)) - && !matches!(vec[i + 1], RegexToken::RepetitionNone) - { - let min; - let max; - if current_min.is_some() { - min = current_min; - max = Some(count as u32); - } else { - min = Some(count as u32); - max = Some(count as u32); - } - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - &min, - &max, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, &min, &max); - } - current_min = None; - } else { - current_min = Some(count as u32); - } - } - _ => return Err(TokenError::UnknownToken), - }; - } - - Ok(regex_groups[0].0.clone()) - } - - fn pop_regex_group( - regex_groups: &mut Vec<(RegularExpression, bool)>, - current_min: &Option, - current_max: &Option, - ) -> bool { - if regex_groups.len() <= 1 { - return false; - } - - let popped_group = regex_groups.pop().unwrap(); - Self::add_regex( - regex_groups, - current_min, - current_max, - &popped_group.0, - popped_group.1, - ); - true - } - - fn add_regex( - regex_groups: &mut [(RegularExpression, bool)], - current_min: &Option, - current_max: &Option, - regex: &RegularExpression, - alternation: bool, - ) { - let current_group = regex_groups.len() - 1; - let regex_to_use = if let Some(min) = current_min { - if min == &1 && current_max.is_some() { - if current_max.unwrap() == 1 { - regex.clone() - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - 
RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - regex.clone() - }; - - if alternation { - regex_groups[current_group].0 = regex_groups[current_group].0.union(®ex_to_use); - } else { - regex_groups[current_group].0 = - regex_groups[current_group].0.concat(®ex_to_use, true); - } - } -} - -#[cfg(test)] -mod tests { - use embed_regex::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(".*"); - assert_embedding_convertion("(a|b)"); - assert_embedding_convertion("(|a)"); - assert_embedding_convertion(".*ab"); - assert_embedding_convertion("[a-e]{3}"); - assert_embedding_convertion("[a-e]{3}efg"); - assert_embedding_convertion("toto"); - assert_embedding_convertion(".{2,3}"); - assert_embedding_convertion("q(abc?|ca)x"); - assert_embedding_convertion(".*q(abc?|ca)x"); - assert_embedding_convertion("(abc){3,6}"); - assert_embedding_convertion("((|a)abd+){3}"); - /*assert_embedding_convertion( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - );*/ - Ok(()) - } - - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap(); - let automaton = automaton.determinize().unwrap(); - //automaton.to_dot(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_embedding(®ex); - - //println!("{:?}", embedding); - - // FAIR - let embedding_u16 = RegexToken::to_fair_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u16 - .iter() - .map(|&t| RegexToken::from_fair_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - } -} diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs deleted file mode 100644 index 
e5f379c..0000000 --- a/src/tokenizer/token/automaton_token.rs +++ /dev/null @@ -1,72 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum AutomatonToken { - Range(RangeToken), - State(u16), - AcceptState, - SeparatorState, - Error, -} - -impl Ord for AutomatonToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for AutomatonToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl AutomatonToken { - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; - const TK_FAIR_SEPARATOR_STATE: u16 = Self::TK_FAIR_ACCEPT_STATE + 1; - - pub const FAIR_MAX_NUMBER_OF_STATES: u16 = 65_000; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_SEPARATOR_STATE + 1; -} - -impl Token for AutomatonToken { - fn from_fair_token(token: u16) -> AutomatonToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - AutomatonToken::Range(RangeToken::from_fair_token(token)) - } else if (Self::TK_FAIR_STATE..Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State(token - Self::TK_FAIR_STATE) - } else if token == Self::TK_FAIR_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_FAIR_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_fair_token()?, - AutomatonToken::State(s) => { - let max = Self::FAIR_MAX_NUMBER_OF_STATES; - let s = *s; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_FAIR_STATE - } - AutomatonToken::AcceptState 
=> Self::TK_FAIR_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_FAIR_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs deleted file mode 100644 index 7876452..0000000 --- a/src/tokenizer/token/range_token.rs +++ /dev/null @@ -1,58 +0,0 @@ -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RangeToken { - Total, - Base(usize), - Error, -} - -impl RangeToken { - const TK_FAIR_TOTAL: u16 = 0; - const TK_FAIR_BASE: u16 = 1; - - pub const FAIR_MAX_NUMBER_OF_BASES: u16 = 127; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES + 1; -} - -impl Ord for RangeToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RangeToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Token for RangeToken { - fn from_fair_token(token: u16) -> RangeToken { - if token == Self::TK_FAIR_TOTAL { - RangeToken::Total - } else if (Self::TK_FAIR_BASE..Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_FAIR_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_FAIR_TOTAL, - RangeToken::Base(b) => { - let max = Self::FAIR_MAX_NUMBER_OF_BASES; - let b = *b as u16; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_FAIR_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs deleted file mode 100644 index bcb2e2b..0000000 --- a/src/tokenizer/token/regex_token.rs +++ /dev/null @@ -1,84 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - 
-#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexToken { - Range(RangeToken), - StartGroup, - EndGroup, - Alternation, - RepetitionNone, - Repetition(u16), - Error, -} - -impl Ord for RegexToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RegexToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexToken { - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; - const TK_FAIR_ALTERNATION: u16 = Self::TK_FAIR_END_GROUP + 1; - const TK_FAIR_REPETITION_NONE: u16 = Self::TK_FAIR_ALTERNATION + 1; - const TK_FAIR_REPETITION: u16 = Self::TK_FAIR_REPETITION_NONE + 1; - - pub const FAIR_MAX_NUMBER_OF_REPETITION: u16 = 1024; - - pub const FAIR_VOCABULARY_SIZE: u16 = - Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION + 1; -} - -impl Token for RegexToken { - fn from_fair_token(token: u16) -> RegexToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - RegexToken::Range(RangeToken::from_fair_token(token)) - } else if token == Self::TK_FAIR_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_FAIR_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_FAIR_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_FAIR_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_FAIR_REPETITION - ..Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition(token - Self::TK_FAIR_REPETITION) - } else { - RegexToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_fair_token()?, - RegexToken::StartGroup => Self::TK_FAIR_START_GROUP, - RegexToken::EndGroup => 
Self::TK_FAIR_END_GROUP, - RegexToken::Alternation => Self::TK_FAIR_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_FAIR_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::FAIR_MAX_NUMBER_OF_REPETITION; - let r = *r; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_FAIR_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } -} From f874caad390cbe97872660b2a11ab58be4ca25d1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Mon, 11 Aug 2025 21:51:29 +0200 Subject: [PATCH 23/62] remove some errors --- src/error/mod.rs | 16 -------------- src/fast_automaton/analyze/cardinality.rs | 17 +++++++------- src/fast_automaton/condition/converter.rs | 14 +++++------- .../condition/fast_bit_vec/mod.rs | 12 +++++----- src/fast_automaton/condition/mod.rs | 22 ++++++++++--------- src/fast_automaton/operation/difference.rs | 5 ++--- src/lib.rs | 18 +++++---------- src/regex/analyze/mod.rs | 2 +- 8 files changed, 38 insertions(+), 68 deletions(-) diff --git a/src/error/mod.rs b/src/error/mod.rs index 448dfb9..29052b4 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -10,21 +10,15 @@ pub enum EngineError { InvalidCharacterInRegex, /// The operation took too much time. OperationTimeOutError, - /// The given automaton should be deterministic. - AutomatonShouldBeDeterministic, /// The automaton has too many states. AutomatonHasTooManyStates, /// The regular expression can not be parsed. RegexSyntaxError(String), /// The provided range can not be built from the spanning set. ConditionInvalidRange, - /// The provided index is out of bound of the condition. - ConditionIndexOutOfBound, #[cfg(feature = "serializable")] /// There is an error with one of the token. TokenError(TokenError), - /// Computing the cardinality of the provided automaton failed. 
- CannotComputeAutomatonCardinality, } impl fmt::Display for EngineError { @@ -32,9 +26,6 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => { - write!(f, "The given automaton should be deterministic.") - } EngineError::AutomatonHasTooManyStates => { write!(f, "The automaton has too many states.") } @@ -45,13 +36,6 @@ impl fmt::Display for EngineError { f, "The provided range can not be built from the spanning set." ), - EngineError::ConditionIndexOutOfBound => { - write!(f, "The provided index is out of bound of the condition.") - } - EngineError::CannotComputeAutomatonCardinality => write!( - f, - "Computing the cardinality of the provided automaton failed." - ), } } } diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 7157bae..5741e77 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -4,18 +4,17 @@ use super::*; impl FastAutomaton { /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). 
- pub fn get_cardinality(&self) -> Option> { + pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { - return Some(Cardinality::Integer(0)); + return Cardinality::Integer(0); } else if self.cyclic || self.is_total() { - return Some(Cardinality::Infinite); - } else if !self.deterministic { - return None; + return Cardinality::Infinite; } + assert!(self.is_determinitic(), "The automaton should be deterministic."); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { - return Some(Cardinality::Infinite); + return Cardinality::Infinite; } let topologically_sorted_states = topologically_sorted_states.unwrap(); @@ -41,7 +40,7 @@ impl FastAutomaton { } } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } } @@ -53,10 +52,10 @@ impl FastAutomaton { temp_cardinality = add; continue; } - return Some(Cardinality::BigInteger); + return Cardinality::BigInteger; } } - Some(Cardinality::Integer(temp_cardinality)) + Cardinality::Integer(temp_cardinality) } fn topological_sorted_states(&self) -> Option> { diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 503d6ce..9fabd11 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -59,14 +59,10 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { pub fn convert(&self, condition: &Condition) -> Result { let mut new_condition = Condition::empty(self.to_spanning_set); for (from_index, to_indexes) in self.equivalence_map.iter().enumerate() { - if let Some(has) = condition.0.get(from_index) { - if has && !to_indexes.is_empty() { - to_indexes.iter().for_each(|&to_index| { - new_condition.0.set(to_index, true); - }); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if condition.0.get(from_index) && !to_indexes.is_empty() { + to_indexes.iter().for_each(|&to_index| { + new_condition.0.set(to_index, true); + }); } } @@ -86,8 +82,8 @@ 
impl<'a, 'b> ConditionConverter<'a, 'b> { #[cfg(test)] mod tests { - use regex_charclass::{char::Char, irange::{range::AnyRange}}; use crate::CharRange; + use regex_charclass::{char::Char, irange::range::AnyRange}; use super::*; diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index a1b46c5..9c85a43 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -7,7 +7,7 @@ pub struct FastBitVec { impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { - let bit = if self.get(i).unwrap() { 1 } else { 0 }; + let bit = if self.get(i) { 1 } else { 0 }; write!(f, "{bit}")?; } Ok(()) @@ -48,13 +48,11 @@ impl FastBitVec { } #[inline] - pub fn get(&self, i: usize) -> Option { - if i >= self.n { - return None; - } + pub fn get(&self, i: usize) -> bool { + assert!(i < self.n, "The provided bit index is out of bound."); let w = i / 64; let b = i % 64; - self.bits.get(w).map(|&block| (block & (1 << b)) != 0) + (self.bits[w] & (1 << b)) != 0 } #[inline] @@ -126,7 +124,7 @@ impl FastBitVec { pub fn get_bits(&self) -> Vec { let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - bits.push(self.get(i).unwrap()); + bits.push(self.get(i)); } bits } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 08a439b..122ccc9 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -1,9 +1,9 @@ use std::hash::Hash; use fast_bit_vec::FastBitVec; -use regex_charclass::{char::Char, CharacterClass}; +use regex_charclass::{CharacterClass, char::Char}; -use crate::{error::EngineError, CharRange}; +use crate::{CharRange, error::EngineError}; use super::spanning_set::SpanningSet; pub mod converter; @@ -76,12 +76,8 @@ impl Condition { .iter() .enumerate() { - if let Some(has) = self.0.get(i) { - if has { - range = 
range.union(base); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if self.0.get(i) { + range = range.union(base); } } @@ -193,11 +189,17 @@ mod tests { let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_binary_representation()); + assert_eq!( + vec![false, false, false, false], + empty.get_binary_representation() + ); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_binary_representation()); + assert_eq!( + vec![true, true, true, true], + total.get_binary_representation() + ); assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index a7b8ecf..acb63e9 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -6,9 +6,8 @@ use super::*; impl FastAutomaton { fn totalize(&mut self) -> Result<(), EngineError> { - if !self.is_determinitic() { - return Err(EngineError::AutomatonShouldBeDeterministic); - } + assert!(self.is_determinitic(), "The automaton should be deterministic."); + let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = IntMap::with_capacity_and_hasher( diff --git a/src/lib.rs b/src/lib.rs index e4119e2..9c8e6ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,19 +449,11 @@ impl Term { pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => { - let cardinality = if !automaton.is_determinitic() { - automaton.determinize()?.get_cardinality() - } else { - automaton.get_cardinality() - }; - - if let Some(cardinality) = cardinality { - Ok(cardinality) - } else { - 
Err(EngineError::CannotComputeAutomatonCardinality) - } - } + Term::Automaton(automaton) => Ok(if !automaton.is_determinitic() { + automaton.determinize()?.get_cardinality() + } else { + automaton.get_cardinality() + }), } } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 2ee4bc5..f5d1975 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -232,7 +232,7 @@ mod tests { //automaton.to_dot(); - let expected = automaton.get_cardinality().unwrap(); + let expected = automaton.get_cardinality(); assert_eq!(expected, cardinality); } From c2cc84234182704b941002e66fc878f78ce97c4e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:06:52 +0200 Subject: [PATCH 24/62] Change regex convertion algo --- src/fast_automaton/analyze/cardinality.rs | 6 +- src/fast_automaton/analyze/equivalence.rs | 28 +- src/fast_automaton/analyze/length.rs | 4 +- src/fast_automaton/analyze/mod.rs | 6 +- src/fast_automaton/analyze/subset.rs | 34 +- src/fast_automaton/builder.rs | 27 +- .../convert/to_regex/builder/scc.rs | 207 ---------- src/fast_automaton/convert/to_regex/mod.rs | 378 +++--------------- .../mod.rs => state_elimination/builder.rs} | 103 +++-- .../to_regex/state_elimination/eliminate.rs | 118 ++++++ .../convert/to_regex/state_elimination/mod.rs | 121 ++++++ .../convert/to_regex/transform.rs | 208 ---------- .../convert/to_regex/transform/mod.rs | 16 + .../to_regex/transform/shape/dotstar.rs | 172 ++++++++ .../convert/to_regex/transform/shape/mod.rs | 1 + src/fast_automaton/generate.rs | 4 +- src/fast_automaton/mod.rs | 64 +-- src/fast_automaton/operation/concat.rs | 72 ++-- src/fast_automaton/operation/determinize.rs | 6 +- src/fast_automaton/operation/difference.rs | 8 +- src/fast_automaton/operation/intersection.rs | 99 ++--- src/fast_automaton/operation/mod.rs | 6 +- src/fast_automaton/operation/repeat.rs | 8 +- src/fast_automaton/operation/union.rs | 203 +++++++--- 
src/fast_automaton/serializer/mod.rs | 6 +- .../serializer/tokenizer/embed_automaton.rs | 24 +- src/lib.rs | 46 +-- src/regex/builder.rs | 40 +- src/regex/mod.rs | 90 +++++ src/regex/operation/concat.rs | 31 +- src/regex/operation/repeat.rs | 2 +- src/regex/operation/union.rs | 44 +- tests/integration_tests.rs | 10 +- 33 files changed, 1018 insertions(+), 1174 deletions(-) delete mode 100644 src/fast_automaton/convert/to_regex/builder/scc.rs rename src/fast_automaton/convert/to_regex/{builder/mod.rs => state_elimination/builder.rs} (62%) create mode 100644 src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs create mode 100644 src/fast_automaton/convert/to_regex/state_elimination/mod.rs delete mode 100644 src/fast_automaton/convert/to_regex/transform.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/mod.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs create mode 100644 src/fast_automaton/convert/to_regex/transform/shape/mod.rs diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 5741e77..ccad761 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,9 +65,9 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.all_states_vec() { + for from_state in &self.states_vec() { in_degree.entry(*from_state).or_insert(0); - for to_state in self.direct_states_iter(from_state) { + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.direct_states_iter(&from_state) { + for to_state in self.direct_states(&from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/equivalence.rs 
b/src/fast_automaton/analyze/equivalence.rs index 32f2ccb..3f70711 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if both automata accept the same language. - pub fn are_equivalent(&self, other: &FastAutomaton) -> Result { + pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { @@ -44,26 +44,26 @@ mod tests { false, ); - let regex_1 = RegularExpression::new("cd").unwrap(); - let regex_2 = RegularExpression::new("cd").unwrap(); + let regex_1 = RegularExpression::parse("cd", false).unwrap(); + let regex_2 = RegularExpression::parse("cd", false).unwrap(); assert_equivalent(&regex_1, &regex_2, true); - let regex_1 = RegularExpression::new("test.*other").unwrap(); - let regex_2 = RegularExpression::new("test.*othew").unwrap(); + let regex_1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex_2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex_1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("[0]").unwrap(); - let regex_2 = RegularExpression::new("[01]").unwrap(); + let regex_1 = RegularExpression::parse("[0]", false).unwrap(); + let regex_2 = RegularExpression::parse("[01]", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("(b+a+)*").unwrap(); - let regex_2 = RegularExpression::new("(b[a-b]*a)?").unwrap(); + let regex_1 = RegularExpression::parse("(b+a+)*", false).unwrap(); + let regex_2 = 
RegularExpression::parse("(b[a-b]*a)?", false).unwrap(); assert_equivalent(&regex_1, &regex_2, true); Ok(()) @@ -72,14 +72,14 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.are_equivalent(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.are_equivalent(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.equivalent(&automaton_2).unwrap()); assert_eq!( expected, - automaton_1.are_equivalent(&automaton_2).unwrap() + automaton_1.equivalent(&automaton_2).unwrap() ); } } diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 5ab7180..bbec964 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -27,7 +27,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states_iter(&state) { + for to_state in self.direct_states(&state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -54,7 +54,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states_iter(&state) { + for to_state in self.direct_states(&state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 46b7c23..3902460 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -32,15 +32,15 @@ impl FastAutomaton { pub fn is_empty_string(&self) -> bool { self.accept_states.len() == 1 && self.accept_states.contains(&self.start_state) - && self.state_in_degree(self.start_state) == 0 + && self.in_degree(self.start_state) == 0 } /// Returns the set of all states reachable from the start state. 
pub fn get_reacheable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.all_states_iter() { - for (condition, to_state) in self.transitions_from_iter(from_state) { + for from_state in self.states() { + for (condition, to_state) in self.transitions_from(from_state) { if condition.is_empty() { continue; } diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index e08a476..e4ca7d6 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -4,7 +4,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if all strings accepted by `self` are also accepted by `other`. - pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { + pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); } else if other.is_empty() || self.is_total() { @@ -39,33 +39,33 @@ mod tests { true, ); - let regex1 = RegularExpression::new("test.*other").unwrap(); - let regex2 = RegularExpression::new("test.*othew").unwrap(); + let regex1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_subset(&regex1, &regex2, false, false); - let regex1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_subset(&regex1, &regex2, false, true); - let regex1 = RegularExpression::new("(abc|def)").unwrap(); - let regex2 = RegularExpression::new("(abc|def|xyz)").unwrap(); + let regex1 = RegularExpression::parse("(abc|def)", false).unwrap(); + let regex2 = RegularExpression::parse("(abc|def|xyz)", false).unwrap(); assert_subset(&regex1, &regex2, true, 
false); - let regex1 = RegularExpression::new("[0]").unwrap(); - let regex2 = RegularExpression::new("[01]").unwrap(); + let regex1 = RegularExpression::parse("[0]", false).unwrap(); + let regex2 = RegularExpression::parse("[01]", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("a.*b.*c.*").unwrap(); - let regex2 = RegularExpression::new("a.*b.*").unwrap(); + let regex1 = RegularExpression::parse("a.*b.*c.*", false).unwrap(); + let regex2 = RegularExpression::parse("a.*b.*", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("1..").unwrap(); - let regex2 = RegularExpression::new("...").unwrap(); + let regex1 = RegularExpression::parse("1..", false).unwrap(); + let regex2 = RegularExpression::parse("...", false).unwrap(); assert_subset(&regex1, &regex2, true, false); @@ -80,18 +80,18 @@ mod tests { ) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_subset_of(&automaton_1).unwrap()); + assert_eq!(true, automaton_1.subset(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_subset_of(&automaton_2).unwrap()); + assert_eq!(true, automaton_2.subset(&automaton_2).unwrap()); assert_eq!( expected_1_2, - automaton_1.is_subset_of(&automaton_2).unwrap() + automaton_1.subset(&automaton_2).unwrap() ); assert_eq!( expected_2_1, - automaton_2.is_subset_of(&automaton_1).unwrap() + automaton_2.subset(&automaton_1).unwrap() ); } } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 8bcc136..da16808 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -122,7 +122,7 @@ impl FastAutomaton { if self.deterministic { let mut deterministic = true; - for (condition, state) in self.transitions_from_iter(from_state) { + for (condition, state) in self.transitions_from(from_state) { if state == &to_state { continue; } @@ 
-159,12 +159,15 @@ impl FastAutomaton { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self + .transitions_from(to_state) + .map(|(cond, to_state)| (cond.clone(), *to_state)) + .collect(); for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (c, s) in self.transitions_from_iter(from_state) { + for (c, s) in self.transitions_from(from_state) { if state == *s { continue; } @@ -190,6 +193,19 @@ impl FastAutomaton { } } + pub fn remove_transition(&mut self, from_state: State, to_state: State) { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + + self.transitions_in + .entry(to_state) + .or_default() + .remove(&from_state); + self.transitions[from_state].remove(&to_state); + } + /// Removes the state and all its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); @@ -269,7 +285,7 @@ impl FastAutomaton { return Ok(()); } let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.all_states_vec() { + for from_state in &self.states_vec() { for to_state in self.direct_states_vec(from_state) { match self.transitions[*from_state].entry(to_state) { Entry::Occupied(mut o) => { @@ -296,6 +312,7 @@ impl FastAutomaton { #[inline] pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { self.transitions = model.transitions.clone(); + self.transitions_in = model.transitions_in.clone(); self.start_state = model.start_state; self.accept_states = model.accept_states.clone(); self.removed_states = model.removed_states.clone(); @@ -320,7 +337,7 @@ mod tests { } fn assert_regex_build_deterministic_automaton(regex: &str, deterministic: bool) { - let automaton = RegularExpression::new(regex) + let automaton = 
RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs deleted file mode 100644 index c99cbc5..0000000 --- a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ /dev/null @@ -1,207 +0,0 @@ -use super::*; - -impl StateEliminationAutomaton { - pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { - let mut index = 0; - let mut stack = Vec::new(); - let mut indices = vec![-1; self.transitions.len()]; - let mut lowlink = vec![-1; self.transitions.len()]; - let mut on_stack = vec![false; self.transitions.len()]; - let mut scc = Vec::new(); - - for state in self.states_iter() { - if self.removed_states.contains(&state) { - continue; - } - if indices[state] == -1 { - self.strongconnect( - state, - &mut index, - &mut stack, - &mut indices, - &mut lowlink, - &mut on_stack, - &mut scc, - ); - } - } - - let scc = scc - .into_iter() - .filter(|states| { - let first_state = states.iter().next().unwrap(); - let self_loop = if let Some(transitions_in) = self.transitions_in.get(first_state) { - transitions_in.contains(first_state) - } else { - false - }; - states.len() != 1 || self_loop - }) - .collect::>(); - - for component in scc { - self.build_component(&component)?; - } - - self.cyclic = false; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - fn strongconnect( - &self, - v: usize, - index: &mut usize, - stack: &mut Vec, - indices: &mut Vec, - lowlink: &mut Vec, - on_stack: &mut Vec, - scc: &mut Vec>, - ) { - indices[v] = *index as i32; - lowlink[v] = *index as i32; - *index += 1; - stack.push(v); - on_stack[v] = true; - - if let Some(neighbors) = self.transitions.get(v) { - for &w in neighbors.keys() { - if indices[w] == -1 { - self.strongconnect(w, index, stack, indices, lowlink, on_stack, scc); - lowlink[v] = lowlink[v].min(lowlink[w]); - } else if on_stack[w] { - lowlink[v] = lowlink[v].min(indices[w]); 
- } - } - } - - if lowlink[v] == indices[v] { - let mut component = Vec::new(); - while let Some(w) = stack.pop() { - on_stack[w] = false; - component.push(w); - if w == v { - break; - } - } - scc.push(component); - } - } - - fn build_component(&mut self, states: &[usize]) -> Result<(), EngineError> { - let state_set = states.iter().copied().collect::>(); - let mut start_states = IntMap::new(); - let mut accept_states = IntMap::new(); - - let mut state_elimination_automaton = StateEliminationAutomaton { - start_state: 0, // start_state is not set yet - accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(states.len()), - transitions_in: IntMap::with_capacity(states.len()), - removed_states: IntSet::new(), - cyclic: true, - }; - - let mut states_map = IntMap::with_capacity(states.len()); - for from_state in states { - if *from_state == self.accept_state { - self.accept_state = self.new_state(); - self.add_transition_to(*from_state, self.accept_state, GraphTransition::Epsilon); - } - if *from_state == self.start_state { - self.start_state = self.new_state(); - self.add_transition_to(self.start_state, *from_state, GraphTransition::Epsilon); - } - let from_state_new = *states_map - .entry(*from_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if !state_set.contains(to_state) { - accept_states - .entry(*to_state) - .or_insert_with(Vec::new) - .push((from_state_new, transition.clone())); - continue; - } - - let to_state_new = *states_map - .entry(*to_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - - state_elimination_automaton.add_transition_to( - from_state_new, - to_state_new, - transition.clone(), - ); - } - - for (parent_state, transition) in self.in_transitions_vec(*from_state) { - if !state_set.contains(&parent_state) { - start_states - .entry(from_state_new) - .or_insert_with(Vec::new) - 
.push((parent_state, transition.clone())); - } - } - } - - for state in states { - self.remove_state(*state); - } - - for (start_state, parent_states) in &start_states { - for (parent_state, transition) in parent_states { - let new_parent_state = if !transition.is_empty_string() { - let new_parent_state = self.new_state(); - - self.add_transition_to(*parent_state, new_parent_state, transition.clone()); - new_parent_state - } else { - *parent_state - }; - for (target_state, accept_states_transition) in &accept_states { - let mut new_automaton = state_elimination_automaton.clone(); - - let target_state = if accept_states_transition.len() > 1 { - new_automaton.accept_state = new_automaton.new_state(); - for (accept_state, transition) in accept_states_transition { - new_automaton.add_transition_to( - *accept_state, - new_automaton.accept_state, - transition.clone(), - ); - } - *target_state - } else { - let (accept_state, transition) = - accept_states_transition.iter().next().unwrap(); - - new_automaton.accept_state = *accept_state; - if !transition.is_empty_string() { - let new_target_state = self.new_state(); - self.add_transition_to( - new_target_state, - *target_state, - transition.clone(), - ); - new_target_state - } else { - *target_state - } - }; - - new_automaton.start_state = *start_state; - - self.add_transition_to( - new_parent_state, - target_state, - GraphTransition::Graph(new_automaton), - ); - } - } - } - - Ok(()) - } -} diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index e8e7e8e..10a530e 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,288 +1,12 @@ -use std::{ - collections::{VecDeque, hash_map::Entry}, - fmt::Display, -}; - -use ahash::{HashMapExt, HashSetExt}; -use log::warn; - -use crate::{error::EngineError, execution_profile::ExecutionProfile, regex::RegularExpression}; - use super::*; -mod builder; +mod state_elimination; mod 
transform; -#[derive(Clone, Debug)] -enum GraphTransition { - Graph(StateEliminationAutomaton), - Weight(T), - Epsilon, -} - -impl GraphTransition { - pub fn is_empty_string(&self) -> bool { - matches!(self, GraphTransition::Epsilon) - } - - pub fn get_weight(&self) -> Option<&T> { - if let GraphTransition::Weight(weight) = self { - Some(weight) - } else { - None - } - } -} - -#[derive(Clone, Debug)] -struct StateEliminationAutomaton { - start_state: usize, - accept_state: usize, - transitions: Vec>>, - transitions_in: IntMap>, - removed_states: IntSet, - cyclic: bool, -} - -impl Display for StateEliminationAutomaton { - fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_graph_dot(sb, None) - } -} - -impl StateEliminationAutomaton { - //#[cfg(test)] - #[allow(dead_code)] - #[inline] - pub fn to_dot(&self) { - println!("{self}"); - } - - #[inline] - fn to_graph_dot( - &self, - sb: &mut std::fmt::Formatter<'_>, - prefix: Option<&str>, - ) -> std::fmt::Result { - let is_subgraph; - let indent; - let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{prefix} {{")?; - writeln!(sb, "\t\tlabel = \"{prefix} - cyclic={}\";", self.cyclic)?; - indent = "\t"; - is_subgraph = true; - prefix - } else { - writeln!(sb, "digraph Automaton {{")?; - writeln!(sb, "\trankdir = LR;")?; - writeln!(sb, "\tlabel = \"cyclic={}\";", self.cyclic)?; - indent = ""; - is_subgraph = false; - "" - }; - - for from_state in self.states_iter() { - let from_state_with_prefix = if is_subgraph { - format!("S{prefix}_{from_state}") - } else { - format!("S{from_state}") - }; - - write!(sb, "{indent}\t{from_state_with_prefix}")?; - if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; - } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{from_state}\"];")?; - } - - if !is_subgraph && self.start_state == from_state { - writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - 
writeln!(sb, "\tinitial -> {from_state_with_prefix}")?; - } - for (to_state, weight) in self.transitions_from_state_enumerate_iter(&from_state) { - let to_state_with_prefix = if is_subgraph { - format!("S{prefix}_{to_state}") - } else { - format!("S{to_state}") - }; - - match weight { - GraphTransition::Graph(state_elimination_automaton) => { - let subgraph_prefix = if is_subgraph { - format!("{prefix}_{from_state}_{to_state}") - } else { - format!("{from_state}_{to_state}") - }; - state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; - writeln!(sb)?; - let subgraph_start_state = format!( - "S{subgraph_prefix}_{}", - state_elimination_automaton.start_state - ); - writeln!( - sb, - "{indent}\t{from_state_with_prefix} -> {subgraph_start_state} [label=\"ε\"]" - )?; - - let subgraph_accept_state = format!( - "S{subgraph_prefix}_{}", - state_elimination_automaton.accept_state - ); - writeln!( - sb, - "{indent}\t{subgraph_accept_state} -> {to_state_with_prefix} [label=\"ε\"]" - ) - } - GraphTransition::Weight(range) => { - writeln!( - sb, - "{indent}\t{} -> {} [label=\"{}\"]", - from_state_with_prefix, - to_state_with_prefix, - RegularExpression::Character(range.clone()) - .to_string() - .replace('\\', "\\\\") - .replace('"', "\\\"") - ) - } - GraphTransition::Epsilon => writeln!( - sb, - "{indent}\t{from_state_with_prefix} -> {to_state_with_prefix} [label=\"ε\"]" - ), - }?; - } - } - write!(sb, "{indent}}}") - } - - #[inline] - pub fn states_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator)> { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) - } - - #[inline] - pub fn transitions_from_state_vec(&self, from_state: &State) -> Vec { - self.transitions[*from_state] - .keys() - .filter(|s| !self.removed_states.contains(s)) - .copied() - 
.collect() - } - - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { - let mut in_transitions = vec![]; - for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { - for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if to_state == *state { - in_transitions.push((*from_state, transition.clone())); - } - } - } - in_transitions - } - - pub fn states_topo_vec(&self) -> Vec { - if self.cyclic { - panic!("The graph has a cycle"); - } - - let mut in_degree: IntMap = self - .transitions_in - .iter() - .map(|(state, parents)| (*state, parents.len())) - .collect(); - - let mut worklist: VecDeque = VecDeque::new(); - for (&state, &degree) in &in_degree { - if degree == 0 { - worklist.push_back(state); - } - } - - let mut sorted_order = Vec::with_capacity(self.get_number_of_states()); - while let Some(state) = worklist.pop_front() { - sorted_order.push(state); - - if let Some(neighbors) = self.transitions.get(state) { - let neighbors = neighbors.keys(); - for &neighbor in neighbors { - if let Some(degree) = in_degree.get_mut(&neighbor) { - *degree -= 1; - if *degree == 0 { - worklist.push_back(neighbor); - } - } - } - } - } - - if sorted_order.len() == self.get_number_of_states() { - sorted_order - } else { - panic!("The graph has a cycle"); - } - } - - #[inline] - pub fn get_number_of_states(&self) -> usize { - self.transitions.len() - self.removed_states.len() - } -} - impl FastAutomaton { - /// Attempts to convert the automaton to a [`RegularExpression`]; returns `None` if no equivalent pattern are found. 
- pub fn to_regex(&self) -> Option { - if self.is_empty() { - return Some(RegularExpression::new_empty()); - } - let execution_profile = ExecutionProfile::get(); - if let Ok(graph) = StateEliminationAutomaton::new(self) { - if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { - let regex = regex?; - match regex.to_automaton() { - Ok(automaton) => match self.are_equivalent(&automaton) { - Ok(result) => { - if !result { - warn!( - "The automaton is not equivalent to the generated regex; automaton={self}, regex={regex}" - ); - None - } else { - Some(regex) - } - } - Err(err) => { - warn!( - "Engine error while checking for equivalence ({err}); automaton={self}, regex={regex}" - ); - None - } - }, - Err(err) => { - if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!( - "The generated regex cannot be converted to automaton to be checked for equivalence ({err}); automaton={self}, regex={regex}" - ); - } - None - } - } - } else { - None - } - } else { - None - } + pub fn to_regex(&self) -> RegularExpression { + let transformed_automaton = transform::transform(self); + state_elimination::convert_to_regex(&transformed_automaton) } } @@ -290,8 +14,24 @@ impl FastAutomaton { mod tests { use super::*; + #[test] + fn test_convert_t() -> Result<(), String> { + assert_convert("abc.*def.*uif(ab|de)"); + + Ok(()) + } + #[test] fn test_convert() -> Result<(), String> { + + assert_convert(".*u(ab|de)"); + assert_convert(".*sf.*uif(ab|de)"); + + assert_convert("(a+|,)*"); + assert_convert("((ab)*,(cd)*)*"); + assert_convert("(a*,a*,a*)*"); + assert_convert("(a*,a*)*"); + assert_convert("(ac|ads|a)*"); assert_convert(".*sf"); assert_convert(".*sf.*uif(ab|de)"); @@ -325,36 +65,33 @@ mod tests { } fn assert_convert(regex: &str) { - let input_regex = RegularExpression::new(regex).unwrap(); + let input_regex = RegularExpression::parse(regex, false).unwrap(); println!("IN : {}", input_regex); let input_automaton = input_regex.to_automaton().unwrap(); - 
//input_automaton.to_dot(); - - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); - //input_automaton.to_dot(); - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.are_equivalent(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); } #[test] fn test_convert_after_operation_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(ab|cd)") + let automaton1 = RegularExpression::parse("(ab|cd)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("ab") + let automaton2 = RegularExpression::parse("ab", false) .unwrap() .to_automaton() .unwrap(); @@ -362,9 +99,9 @@ mod tests { let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("cd", output_regex.to_string()); Ok(()) @@ -372,20 +109,20 @@ mod tests { #[test] fn test_convert_after_operation_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = 
result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("", output_regex.to_string()); Ok(()) @@ -393,71 +130,72 @@ mod tests { #[test] fn test_convert_after_operation_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); let automaton2 = automaton2.determinize().unwrap(); let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert_eq!("x(x{3})*x?", result.to_string()); Ok(()) } #[test] fn test_convert_after_operation_4() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*abc.*") + let automaton1 = RegularExpression::parse(".*abc.*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + let automaton2 = RegularExpression::parse(".*def.*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); assert_eq!(".*(abc.*def|def.*abc).*", result.to_string()); Ok(()) } - /*#[test] - fn test_convert_after_operation_5() -> Result<(), String> { - if std::env::var_os("RUST_LOG").is_none() { - std::env::set_var("RUST_LOG", "regexsolver=debug"); - } - env_logger::init(); - - let automaton1 = RegularExpression::new(".*abc.*") + #[test] + fn test_automaton() -> Result<(), String> { + let automaton = RegularExpression::parse("a*ba*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + automaton.print_dot(); + + let automaton1 = RegularExpression::parse("(a*ba*)*", false) .unwrap() .to_automaton() 
- .unwrap() - .determinize() .unwrap(); + automaton1.print_dot(); - let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + automaton1.determinize().unwrap().print_dot(); + + // (a*b[ab]*)? + // a*b+a+b+ - let result = result.to_regex().unwrap(); + let automaton2 = RegularExpression::parse("(a*b[ab]*)?", false) + .unwrap() + .to_automaton() + .unwrap(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert!(automaton1.equivalent(&automaton2).unwrap()); Ok(()) - }*/ + } } diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs similarity index 62% rename from src/fast_automaton/convert/to_regex/builder/mod.rs rename to src/fast_automaton/convert/to_regex/state_elimination/builder.rs index 0790851..54cec00 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -1,48 +1,64 @@ -use super::*; - -mod scc; +use ahash::HashMapExt; -impl StateEliminationAutomaton { - pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { - if automaton.is_empty() { - return Ok(None); - } +use super::*; - let mut state_elimination_automaton = StateEliminationAutomaton { +impl Gnfa { + pub(super) fn from_automaton(automaton: &FastAutomaton) -> Gnfa { + let mut state_elimination_automaton = Gnfa { start_state: 0, // start_state is not set yet accept_state: 0, // accept_state is not set yet transitions: Vec::with_capacity(automaton.get_number_of_states()), transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), - removed_states: IntSet::new(), - cyclic: false, + removed_states: IntSet::with_capacity(automaton.get_number_of_states()), + empty: false }; + if automaton.is_empty() { + state_elimination_automaton.empty = true; + return state_elimination_automaton; + } + let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); - for from_state in 
automaton.all_states_iter() { + for from_state in automaton.states() { let new_from_state = *states_map .entry(from_state) .or_insert_with(|| state_elimination_automaton.new_state()); - for (condition, to_state) in - automaton.transitions_from_iter(from_state) - { + for (condition, to_state) in automaton.transitions_from(from_state) { let new_to_state = *states_map .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( new_from_state, new_to_state, - GraphTransition::Weight(condition.to_range(automaton.get_spanning_set())?), + RegularExpression::Character( + condition.to_range(automaton.get_spanning_set()).unwrap(), + ), ); } } - state_elimination_automaton.start_state = - *states_map.get(&automaton.get_start_state()).unwrap(); // We finally set start_state + if automaton.in_degree(automaton.get_start_state()) == 0 { + // If the start state does not have any incoming state we just set it + state_elimination_automaton.start_state = + *states_map.get(&automaton.get_start_state()).unwrap(); + } else { + // If not we create a new state that will be the new start state + state_elimination_automaton.start_state = state_elimination_automaton.new_state(); + + let previous_start_state = *states_map.get(&automaton.get_start_state()).unwrap(); + // We add an empty string transition to the new start state + state_elimination_automaton.add_transition( + state_elimination_automaton.start_state, + previous_start_state, + RegularExpression::new_empty_string(), + ); + } - if automaton.get_accept_states().len() == 1 { - // If there is only one accept state with just set it + let accept_state = *automaton.get_accept_states().iter().next().unwrap(); + if automaton.get_accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { + // If there is only one accept state we just set it state_elimination_automaton.accept_state = *states_map 
.get(automaton.get_accept_states().iter().next().unwrap()) .unwrap(); @@ -52,19 +68,18 @@ impl StateEliminationAutomaton { for accept_state in automaton.get_accept_states() { let accept_state = *states_map.get(accept_state).unwrap(); // We add an empty string transition to the new accept state - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( accept_state, state_elimination_automaton.accept_state, - GraphTransition::Epsilon, + RegularExpression::new_empty_string(), ); } } - state_elimination_automaton.identify_and_apply_components()?; - //state_elimination_automaton.to_dot(); - Ok(Some(state_elimination_automaton)) + + state_elimination_automaton } - pub fn new_state(&mut self) -> usize { + fn new_state(&mut self) -> usize { if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); self.transitions_in.insert(*new_state, IntSet::new()); @@ -78,7 +93,7 @@ impl StateEliminationAutomaton { } #[inline] - pub fn has_state(&self, state: State) -> bool { + pub(super) fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } @@ -89,11 +104,11 @@ impl StateEliminationAutomaton { } } - pub fn add_transition_to( + pub(crate) fn add_transition( &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: RegularExpression, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -106,13 +121,8 @@ impl StateEliminationAutomaton { .insert(from_state); match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { - if let (GraphTransition::Weight(current_regex), GraphTransition::Weight(regex)) = - (o.get(), transition) - { - o.insert(GraphTransition::Weight(current_regex.union(&regex))); - } else { - panic!("Cannot add transition"); - } + //o.insert(RegularExpression::Alternation(vec![transition, o.get().clone()])); + o.insert(transition.union(o.get())); } 
Entry::Vacant(v) => { v.insert(transition); @@ -120,7 +130,7 @@ impl StateEliminationAutomaton { }; } - pub fn remove_state(&mut self, state: State) { + pub(super) fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( @@ -149,21 +159,4 @@ impl StateEliminationAutomaton { transitions.remove(&state); } } - - pub fn remove_transition(&mut self, from_state: State, to_state: State) { - self.assert_state_exists(from_state); - if from_state != to_state { - self.assert_state_exists(to_state); - } - - if let Some(from_states) = self.transitions_in.get_mut(&to_state) { - from_states.remove(&from_state); - } - - self.transitions[from_state].remove(&to_state); - } - - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { - self.transitions.get(from_state)?.get(&to_state) - } } diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs new file mode 100644 index 0000000..d528f1b --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -0,0 +1,118 @@ +use super::*; + +impl Gnfa { + pub(super) fn convert(&mut self) -> RegularExpression { + if self.empty { + return RegularExpression::new_empty(); + } + + while let Some(state) = self.get_next_state_to_eliminate() { + self.eliminate_state(state); + } + + self.get_transition(self.start_state, self.accept_state) + .cloned() + .unwrap_or(RegularExpression::new_empty_string()) + } + + fn get_next_state_to_eliminate(&self) -> Option { + let mut best_state: Option = None; + let mut best_score: u128 = u128::MAX; + + for state in self.all_states_iter() { + if state == self.start_state || state == self.accept_state { + continue; + } + + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() 
as u128; + + if in_deg == 0 || out_deg == 0 { + let score = state as u128 & 0xFF; + if score < best_score { + best_score = score; + best_state = Some(state); + } + continue; + } + + let mut score: u128 = in_deg * out_deg; + + if self.has_self_loop(state) { + score = score + (score >> 1); + } + + let mut label_cost: u128 = 0; + + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } + + score = score.saturating_mul(1).saturating_add(label_cost); + + let tie = state as u128 & 0xFFFF; + let score = score.saturating_add(tie); + + if score < best_score { + best_score = score; + best_state = Some(state); + } + } + + best_state + } + + fn eliminate_state(&mut self, k: usize) { + if self.removed_states.contains(&k) { + return; + } + + let in_states = self + .transitions_in + .get(&k) + .unwrap() + .iter() + .cloned() + .filter(|&s| s != k) + .collect::>(); + let out_states = self.transitions[k] + .keys() + .cloned() + .filter(|&s| s != k) + .collect::>(); + + for p in in_states { + for &q in &out_states { + self.bridge(p, k, q); + } + } + + self.remove_state(k); + } + + fn bridge(&mut self, p: usize, k: usize, q: usize) { + let rpk = self.get_transition(p, k); + let rkk = self.get_transition(k, k); + let rkq = self.get_transition(k, q); + + if let (Some(rpk), Some(rkq)) = (rpk, rkq) { + let mut regex = rpk.clone(); + if let Some(rkk) = rkk { + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, RegularExpression::Repetition(Box::new(rkk.clone()), 0, None)])); + regex = regex.concat(&rkk.repeat(0, None), true); + } + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, rkq.clone()])); + regex = regex.concat(rkq, true); + self.add_transition(p, q, regex); + } + } +} diff --git 
a/src/fast_automaton/convert/to_regex/state_elimination/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs new file mode 100644 index 0000000..023d6b1 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs @@ -0,0 +1,121 @@ +use super::*; + +mod builder; +mod eliminate; + +struct Gnfa { + start_state: usize, + accept_state: usize, + transitions: Vec>, + transitions_in: IntMap>, + removed_states: IntSet, + empty: bool, +} + +impl Display for Gnfa { + fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(sb, "digraph GNFA {{")?; + writeln!(sb, "\trankdir = LR;")?; + for from_state in self.all_states_iter() { + write!(sb, "\t{from_state}")?; + if self.accept_state == from_state { + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; + } else { + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; + } + + if self.start_state == from_state { + writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; + writeln!(sb, "\tinitial -> {from_state}")?; + } + for (regex, to_state) in self.transitions_from_vec(from_state) { + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{regex}\"]")?; + } + } + write!(sb, "}}") + } +} + +impl Gnfa { + fn get_transition(&self, from_state: State, to_state: State) -> Option<&RegularExpression> { + self.transitions.get(from_state)?.get(&to_state) + } + + #[inline] + fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + fn transitions_to_vec(&self, state: State) -> Vec<(State, RegularExpression)> { + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { + if to_state == state { + in_transitions.push((*from_state, condition)); + break; + } + } + } + in_transitions + } + + #[inline] + fn transitions_from_vec(&self, state: State) -> 
Vec<(RegularExpression, State)> { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + } + + #[inline] + fn has_self_loop(&self, state: State) -> bool { + self.get_transition(state, state).is_some() + } +} + +pub(super) fn convert_to_regex(automaton: &FastAutomaton) -> RegularExpression { + let mut gnfa = Gnfa::from_automaton(automaton); + gnfa.convert() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_elimination() -> Result<(), String> { + test_correct("abc"); + test_correct(".*de"); + test_correct(".*def"); + test_correct("(a*ba*)*"); + test_correct(".*u(ab|d)"); + test_correct(".*u(ab|de)"); + Ok(()) + } + + fn test_correct(pattern: &str) { + println!("Pattern: {pattern}"); + + let automaton = RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + + let automaton = automaton.determinize().unwrap().into_owned(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs deleted file mode 100644 index 4498578..0000000 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ /dev/null @@ -1,208 +0,0 @@ -use std::hash::BuildHasherDefault; - -use crate::execution_profile::ExecutionProfile; - -use super::*; - -impl StateEliminationAutomaton { - pub fn convert_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.cyclic { - return self.convert_graph_to_regex(execution_profile); - } - execution_profile.assert_not_timed_out()?; - - let mut 
regex_map: IntMap = IntMap::with_capacity_and_hasher( - self.get_number_of_states(), - BuildHasherDefault::default(), - ); - regex_map.insert(self.start_state, RegularExpression::new_empty_string()); - for from_state in self.states_topo_vec() { - let current_regex = if let Some(current_regex) = regex_map.get(&from_state) { - current_regex.clone() - } else { - RegularExpression::new_empty_string() - }; - if let Some(transitions) = self.transitions.get(from_state) { - for (to_state, transition) in transitions { - let transition_regex = match transition { - GraphTransition::Graph(graph) => { - if let Some(regex) = graph.convert_graph_to_regex(execution_profile)? { - regex - } else { - return Ok(None); - } - } - GraphTransition::Weight(range) => { - RegularExpression::Character(range.clone()) - } - GraphTransition::Epsilon => RegularExpression::new_empty_string(), - }; - let new_regex = current_regex.concat(&transition_regex, true); - match regex_map.entry(*to_state) { - Entry::Occupied(mut o) => { - o.insert(new_regex.union(o.get()).simplify()); - } - Entry::Vacant(v) => { - v.insert(new_regex); - } - }; - } - } - } - - Ok(regex_map.get(&self.accept_state).cloned()) - } - - fn convert_graph_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - execution_profile.assert_not_timed_out()?; - if let Some(regex) = self.convert_shape_dot_star(execution_profile)? { - return Ok(Some(regex)); - } else if let Some(regex) = self.convert_shape_self_loop(execution_profile)? 
{ - return Ok(Some(regex)); - } - Ok(None) - } - - /// We try to idenfify the regex following the shape: - /// A*B - fn convert_shape_dot_star( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.get_number_of_states() < 2 { - return Ok(None); - } - //self.to_dot(); - let mut dot_value = - if let Some(dot_value) = self.get_transition(self.start_state, self.start_state) { - if let Some(dot_value) = dot_value.get_weight() { - dot_value.clone() - } else { - return Ok(None); - } - } else { - return Ok(None); - }; - - for state in self.states_iter() { - if state == self.start_state { - continue; - } - let weight = if let Some(weight) = self.get_transition(state, self.start_state) { - if let Some(weight) = weight.get_weight() { - weight - } else { - return Ok(None); - } - } else if state == self.accept_state { - continue; - } else { - return Ok(None); - }; - - if !dot_value.contains_all(weight) { - return Ok(None); - } - } - - let mut graph = self.clone(); - - for (from_state, transition) in graph.in_transitions_vec(graph.start_state) { - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - return Ok(None); - }; - dot_value = dot_value.union(weight); - graph.remove_transition(from_state, graph.start_state); - } - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::with_capacity(graph.get_number_of_states()); - - worklist.push_back(graph.start_state); - seen.insert(self.start_state); - - while let Some(from_state) = worklist.pop_front() { - for to_state in graph.transitions_from_state_vec(&from_state) { - let transition = - if let Some(transition) = graph.get_transition(from_state, to_state) { - transition - } else { - return Ok(None); - }; - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - continue; - }; - dot_value = dot_value.union(weight); - if seen.contains(&to_state) { - if graph.accept_state != to_state || to_state == from_state { - 
graph.remove_transition(from_state, to_state); - } - } else { - seen.insert(to_state); - worklist.push_back(to_state); - } - } - } - - graph.add_transition_to( - self.start_state, - self.start_state, - GraphTransition::Weight(dot_value), - ); - - graph.identify_and_apply_components()?; - graph.convert_to_regex(execution_profile) - } - - /// We try to identify the regex following the shape: - /// A*B - fn convert_shape_self_loop( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - let mut graph = self.clone(); - - graph.accept_state = graph.new_state(); - - for (from_state, transition) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - - graph.add_transition_to(from_state, graph.accept_state, transition); - } - - graph.identify_and_apply_components()?; - - let a_part = if let Some(a_part) = graph.convert_to_regex(execution_profile)? { - a_part - } else { - return Ok(None); - }; - - let mut graph = self.clone(); - - for (from_state, _) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - } - - graph.identify_and_apply_components()?; - let b_part = if let Some(b_part) = graph.convert_to_regex(execution_profile)? 
{ - b_part - } else { - return Ok(None); - }; - - let regex = a_part.repeat(0, None).concat(&b_part, true); - - Ok(Some(regex)) - } -} diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs new file mode 100644 index 0000000..552222e --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/mod.rs @@ -0,0 +1,16 @@ +use crate::fast_automaton::{ + FastAutomaton, convert::to_regex::transform::shape::dotstar::dot_star, +}; + +mod shape; + +const TRANSFORM_FUNCTION: &[fn(&FastAutomaton) -> FastAutomaton] = &[dot_star]; + +pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { + let mut automaton = automaton.clone(); + for transform in TRANSFORM_FUNCTION { + automaton = transform(&automaton); + } + + automaton +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs new file mode 100644 index 0000000..bf5e682 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -0,0 +1,172 @@ +use nohash_hasher::IntSet; + +use crate::fast_automaton::{FastAutomaton, State, condition::Condition}; + +pub(crate) fn dot_star(automaton: &FastAutomaton) -> FastAutomaton { + let components = identify_and_apply_components(automaton); + + let mut automaton = automaton.clone(); + for component in components { + dot_star_component(&mut automaton, &component); + } + + automaton +} + +fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) { + let mut start_state = if component.contains(&automaton.start_state) { + Some(automaton.start_state) + } else { + None + }; + for &state in component { + for (from_state, _) in automaton.transitions_to_vec(state) { + if !component.contains(&from_state) { + if start_state.is_none() { + start_state = Some(state); + } else { + // Only one start state possible + return; + } + } + } + } + + if start_state.is_none() { + // Only one start state possible + 
return; + } + let start_state = start_state.unwrap(); + + let mut first_hop = automaton + .direct_states(&start_state) + .filter(|&s| s != start_state) + .collect::>(); + let mut states_to_remove = vec![]; + + for state in &first_hop { + let transitions = automaton.transitions_to_vec(*state); + if !transitions.iter().all(|(_, c)| *c == transitions[0].1) { + // Some condition(s) to a given first hop state are not the same. + return; + } + + if transitions.len() != component.len() { + states_to_remove.push(*state); + } + } + + states_to_remove.iter().for_each(|s| { + first_hop.remove(s); + }); + + let mut out_condition = None; + for &state in component { + let mut has_transition_to_start_state = false; + + let mut this_condition = Condition::empty(automaton.get_spanning_set()); + for (condition, &to_state) in automaton.transitions_from(state) { + if to_state == start_state { + has_transition_to_start_state = true; + } + + this_condition = this_condition.union(&condition); + } + if !has_transition_to_start_state { + // Some state(s) do not have transition to the start state. 
+ return; + } + + if let Some(condition) = &out_condition { + if &this_condition != condition { + // The union of outcoming condition for some states are not identical + return; + } + } else { + out_condition = Some(this_condition); + } + } + + automaton.add_transition(start_state, start_state, &out_condition.unwrap()); + for &state in component { + for to_state in automaton.direct_states_vec(&state) { + if !component.contains(&to_state) { + continue; + } + + if state != start_state && (to_state == start_state || first_hop.contains(&to_state)) { + automaton.remove_transition(state, to_state); + } + } + } + for state in states_to_remove { + automaton.remove_state(state); + } +} + +pub fn identify_and_apply_components(automaton: &FastAutomaton) -> Vec> { + let mut index = 0; + let mut stack = Vec::new(); + let mut indices = vec![-1; automaton.transitions.len()]; + let mut lowlink = vec![-1; automaton.transitions.len()]; + let mut on_stack = vec![false; automaton.transitions.len()]; + let mut scc = Vec::new(); + + for state in automaton.states() { + if indices[state] == -1 { + strongconnect( + automaton, + state, + &mut index, + &mut stack, + &mut indices, + &mut lowlink, + &mut on_stack, + &mut scc, + ); + } + } + + scc.into_iter() + .filter(|states| states.len() != 1) + .collect::>() +} + +#[allow(clippy::too_many_arguments)] +fn strongconnect( + automaton: &FastAutomaton, + v: usize, + index: &mut usize, + stack: &mut Vec, + indices: &mut Vec, + lowlink: &mut Vec, + on_stack: &mut Vec, + scc: &mut Vec>, +) { + indices[v] = *index as i32; + lowlink[v] = *index as i32; + *index += 1; + stack.push(v); + on_stack[v] = true; + + for w in automaton.direct_states(&v) { + if indices[w] == -1 { + strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); + lowlink[v] = lowlink[v].min(lowlink[w]); + } else if on_stack[w] { + lowlink[v] = lowlink[v].min(indices[w]); + } + } + + if lowlink[v] == indices[v] { + let mut component = IntSet::default(); + while let 
Some(w) = stack.pop() { + on_stack[w] = false; + component.insert(w); + if w == v { + break; + } + } + scc.push(component); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/mod.rs b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs new file mode 100644 index 0000000..5c83bf6 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs @@ -0,0 +1 @@ +pub(super) mod dotstar; \ No newline at end of file diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 6cb0628..7532309 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -58,7 +58,7 @@ impl FastAutomaton { break; } } - for (cond, to_state) in self.transitions_from_iter(state) { + for (cond, to_state) in self.transitions_from(state) { execution_profile.assert_not_timed_out()?; let range = match ranges_cache.entry(cond) { Entry::Occupied(o) => o.get().clone(), @@ -114,7 +114,7 @@ mod tests { fn assert_generate_strings(regex: &str, number: usize) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index cfa4f68..7bf0313 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -47,7 +47,7 @@ impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.all_states_iter() { + for from_state in self.states() { write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; @@ -59,7 +59,7 @@ impl Display for FastAutomaton { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; writeln!(sb, "\tinitial -> {from_state}")?; } - for (cond, to_state) in self.transitions_from_iter(from_state) { + for (cond, 
to_state) in self.transitions_from(from_state) { writeln!( sb, "\t{from_state} -> {to_state} [label=\"{}\"]", @@ -85,7 +85,7 @@ impl FastAutomaton { /// Returns the number of transitions to the provided state. #[inline] - pub fn state_in_degree(&self, state: State) -> usize { + pub fn in_degree(&self, state: State) -> usize { self.transitions_in .get(&state) .unwrap_or(&IntSet::new()) @@ -94,25 +94,25 @@ impl FastAutomaton { /// Returns the number of transitions from the provided state. #[inline] - pub fn state_out_degree(&self, state: State) -> usize { + pub fn out_degree(&self, state: State) -> usize { self.transitions[state].len() } /// Returns an iterator over the automaton’s states. #[inline] - pub fn all_states_iter(&self) -> impl Iterator + '_ { + pub fn states(&self) -> impl Iterator + '_ { (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } /// Returns a vector containing the automaton’s states. #[inline] - pub fn all_states_vec(&self) -> Vec { - self.all_states_iter().collect() + pub fn states_vec(&self) -> Vec { + self.states().collect() } /// Returns an iterator over states directly reachable from the given state in one transition. #[inline] - pub fn direct_states_iter(&self, state: &State) -> impl Iterator + '_ { + pub fn direct_states(&self, state: &State) -> impl Iterator + '_ { self.transitions[*state] .keys() .cloned() @@ -122,7 +122,7 @@ impl FastAutomaton { /// Returns a vector of states directly reachable from the given state in one transition. #[inline] pub fn direct_states_vec(&self, state: &State) -> Vec { - self.direct_states_iter(state).collect() + self.direct_states(state).collect() } /// Returns a vector containing the transitions to the provided state. @@ -151,7 +151,7 @@ impl FastAutomaton { /// Returns an iterator over transitions from the given state. 
#[inline] - pub fn transitions_from_iter( + pub fn transitions_from( &self, state: State, ) -> impl Iterator { @@ -161,31 +161,6 @@ impl FastAutomaton { .filter(|s| !self.removed_states.contains(s.1)) } - /// Returns a mutable iterator over transitions from the given state. - #[inline] - pub fn transitions_from_iter_mut( - &mut self, - state: &State, - ) -> impl Iterator { - self.transitions[*state] - .iter_mut() - .map(|(s, c)| (c, s)) - .filter(|s| !self.removed_states.contains(s.1)) - } - - /// Returns an owned iterator over transitions from the given state. - #[inline] - pub fn transitions_from_into_iter( - &self, - state: &State, - ) -> impl Iterator + '_ { - self.transitions[*state] - .clone() - .into_iter() - .map(|(s, c)| (c, s)) - .filter(|(_, state)| !self.removed_states.contains(state)) - } - /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { @@ -221,16 +196,6 @@ impl FastAutomaton { self.transitions[from_state].get(&to_state) } - // Returns a mutable reference to the condition of the directed transition between the two states, if any. - #[inline] - pub fn get_condition_mut( - &mut self, - from_state: State, - to_state: State, - ) -> Option<&mut Condition> { - self.transitions[from_state].get_mut(&to_state) - } - /// Returns the start state. 
#[inline] pub fn get_start_state(&self) -> State { @@ -285,7 +250,7 @@ impl FastAutomaton { continue; } let curr_char = input.chars().nth(position).unwrap() as u32; - for (cond, to_state) in self.transitions_from_iter(*current_state) { + for (cond, to_state) in self.transitions_from(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { if position + 1 == input.len() { if self.accept_states.contains(to_state) { @@ -301,7 +266,12 @@ impl FastAutomaton { } #[inline] - pub fn to_dot(&self) { + pub fn as_dot(&self) -> String { + format!("{self}") + } + + #[inline] + pub fn print_dot(&self) { println!("{self}"); } } diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 45654d3..6318c61 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -41,12 +41,12 @@ impl FastAutomaton { BuildHasherDefault::default(), ); - let start_state_and_accept_states_not_mergeable = other.state_in_degree(other.start_state) > 0 + let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 && self .accept_states .iter() .cloned() - .any(|s| self.state_out_degree(s) > 0); + .any(|s| self.out_degree(s) > 0); let accept_states = self.accept_states.iter().cloned().collect::>(); @@ -67,7 +67,7 @@ impl FastAutomaton { } } - for from_state in other.all_states_iter() { + for from_state in other.states() { let new_from_states = match new_states.entry(from_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -86,7 +86,7 @@ impl FastAutomaton { } }; - for (condition, to_state) in other.transitions_from_iter(from_state) { + for (condition, to_state) in other.transitions_from(from_state) { let new_to_states = match new_states.entry(*to_state) { Entry::Occupied(o) => { vec![*o.get()] @@ -135,12 +135,12 @@ mod tests { #[test] fn test_simple_concatenation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("abc") + let automaton = 
RegularExpression::parse("abc", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("abc")); assert!(!automaton.match_string("abcd")); assert!(!automaton.match_string("ab")); @@ -150,7 +150,7 @@ mod tests { #[test] fn test_simple_concat_alternation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("0101(abc|ac|aaa)") + let automaton = RegularExpression::parse("0101(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); @@ -170,7 +170,7 @@ mod tests { #[test] fn test_simple_concat_repeat_regex() -> Result<(), String> { - let automaton = RegularExpression::new("A+B*") + let automaton = RegularExpression::parse("A+B*", false) .unwrap() .to_automaton() .unwrap(); @@ -185,7 +185,7 @@ mod tests { #[test] fn test_simple_repeat_regex_01() -> Result<(), String> { - let automaton = RegularExpression::new("a+") + let automaton = RegularExpression::parse("a+", false) .unwrap() .to_automaton() .unwrap(); @@ -200,7 +200,7 @@ mod tests { #[test] fn test_simple_repeat_regex_02() -> Result<(), String> { - let automaton = RegularExpression::new("a*c") + let automaton = RegularExpression::parse("a*c", false) .unwrap() .to_automaton() .unwrap(); @@ -214,11 +214,11 @@ mod tests { #[test] fn test_simple_repeat_regex_03() -> Result<(), String> { - let automaton = RegularExpression::new("(ab){3,4}") + let automaton = RegularExpression::parse("(ab){3,4}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("ababab")); assert!(automaton.match_string("abababab")); assert!(!automaton.match_string("ab")); @@ -229,11 +229,11 @@ mod tests { #[test] fn test_simple_repeat_regex_04() -> Result<(), String> { - let automaton = RegularExpression::new("a{3,}") + let automaton = RegularExpression::parse("a{3,}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("aaa")); 
assert!(automaton.match_string("aaaaa")); assert!(!automaton.match_string("a")); @@ -243,11 +243,11 @@ mod tests { #[test] fn test_simple_repeat_regex_05() -> Result<(), String> { - let automaton = RegularExpression::new("a?") + let automaton = RegularExpression::parse("a?", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(!automaton.match_string("aa")); @@ -257,11 +257,11 @@ mod tests { #[test] fn test_simple_repeat_regex_06() -> Result<(), String> { - let automaton = RegularExpression::new("a{0,2}") + let automaton = RegularExpression::parse("a{0,2}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -272,11 +272,11 @@ mod tests { #[test] fn test_simple_repeat_regex_07() -> Result<(), String> { - let automaton = RegularExpression::new("a{1,3}") + let automaton = RegularExpression::parse("a{1,3}", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -287,11 +287,11 @@ mod tests { #[test] fn test_simple_repeat_regex_08() -> Result<(), String> { - let automaton = RegularExpression::new("a+(ba+)*") + let automaton = RegularExpression::parse("a+(ba+)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(!automaton.match_string("aab")); assert!(automaton.match_string("a")); @@ -306,11 +306,11 @@ mod tests { #[test] fn test_simple_repeat_regex_09() -> Result<(), String> { - let automaton = RegularExpression::new("(ac|ads|a)*") + let automaton = RegularExpression::parse("(ac|ads|a)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + 
automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("ac")); assert!(automaton.match_string("ads")); @@ -328,11 +328,11 @@ mod tests { #[test] fn test_simple_repeat_regex_10() -> Result<(), String> { - let automaton = RegularExpression::new("(ef|ads|a)+") + let automaton = RegularExpression::parse("(ef|ads|a)+", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(!automaton.match_string("")); assert!(automaton.match_string("ef")); assert!(automaton.match_string("ads")); @@ -350,11 +350,11 @@ mod tests { #[test] fn test_simple_repeat_regex_11() -> Result<(), String> { - let automaton = RegularExpression::new("(a|bc)*") + let automaton = RegularExpression::parse("(a|bc)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("bc")); @@ -367,11 +367,11 @@ mod tests { #[test] fn test_simple_repeat_regex_12() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)?") + let automaton = RegularExpression::parse("([ab]*a)?", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -385,11 +385,11 @@ mod tests { #[test] fn test_simple_repeat_regex_13() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)*") + let automaton = RegularExpression::parse("([ab]*a)*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("aa")); @@ -403,22 +403,22 @@ mod tests { #[test] fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { - let automaton = RegularExpression::new("a*") + let automaton = 
RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert_eq!(1, automaton.get_number_of_states()); Ok(()) } #[test] fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { - let automaton = RegularExpression::new("(a*bc)") + let automaton = RegularExpression::parse("(a*bc)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert_eq!(3, automaton.get_number_of_states()); Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 734f622..73f8464 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -40,7 +40,7 @@ impl FastAutomaton { for base in &ranges { for from_state in &states { - for (cond, to_state) in self.transitions_from_iter(*from_state) { + for (cond, to_state) in self.transitions_from(*from_state) { if cond.has_intersection(base) { match new_states_to_add.binary_search(to_state) { Ok(_) => {} // element already in vector @ `pos` @@ -84,7 +84,7 @@ mod tests { #[test] fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::new(".*ab") + let automaton = RegularExpression::parse(".*ab", false) .unwrap() .to_automaton() .unwrap(); @@ -113,7 +113,7 @@ mod tests { fn assert_determinization(regex: &str) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index acb63e9..4e2bd22 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -16,9 +16,9 @@ impl FastAutomaton { ); let mut ranges = Vec::with_capacity(self.get_number_of_states()); - for from_state in self.all_states_iter() { + for from_state in self.states() { let mut 
new_condition = Condition::empty(&self.spanning_set); - for (condition, _) in self.transitions_from_iter(from_state) { + for (condition, _) in self.transitions_from(from_state) { new_condition = new_condition.union(condition); ranges.push(condition.to_range(self.get_spanning_set())?); } @@ -36,7 +36,7 @@ impl FastAutomaton { let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); self.apply_new_spanning_set(&new_spanning_set)?; - if self.state_in_degree(crash_state) == 1 { + if self.in_degree(crash_state) == 1 { self.remove_state(crash_state); } Ok(()) @@ -47,7 +47,7 @@ impl FastAutomaton { self.totalize()?; let mut new_accept_states = IntSet::default(); - for state in self.all_states_iter() { + for state in self.states() { if self.accept_states.contains(&state) { continue; } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 4f42859..694d66c 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -4,10 +4,7 @@ use rayon::prelude::*; use condition::converter::ConditionConverter; -use crate::{ - error::EngineError, - execution_profile::{ExecutionProfile}, -}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; @@ -18,8 +15,9 @@ impl FastAutomaton { } /// Computes the intersection of all automatons in the given iterator. - pub fn intersection_all<'a, I: IntoIterator>(automatons: I) -> Result - { + pub fn intersection_all<'a, I: IntoIterator>( + automatons: I, + ) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); for automaton in automatons { @@ -34,25 +32,27 @@ impl FastAutomaton { } /// Computes in parallel the intersection of all automatons in the given iterator. 
- pub fn intersection_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result - { + pub fn intersection_all_par<'a, I: IntoParallelIterator>( + automatons: I, + ) -> Result { let execution_profile = ExecutionProfile::get(); let total = FastAutomaton::new_total(); - automatons.into_par_iter() - .try_fold( - || total.clone(), - |acc, next| { - execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) - }, - ) - .try_reduce( - || total.clone(), - |acc, next| { - execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) - }, - ) + automatons + .into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) } fn intersection_internal<'a>( @@ -197,7 +197,7 @@ impl FastAutomaton { condition_converter: &ConditionConverter, ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_iter(state) + .transitions_from(state) .map(|(c, &s)| match condition_converter.convert(c) { Ok(condition) => Ok((condition, s)), Err(err) => Err(err), @@ -210,15 +210,15 @@ impl FastAutomaton { #[cfg(test)] mod tests { - use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; + use crate::regex::RegularExpression; #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); @@ -234,11 +234,11 @@ mod tests { #[test] fn test_simple_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = 
RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); @@ -252,11 +252,11 @@ mod tests { #[test] fn test_simple_intersection_regex_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); @@ -272,11 +272,11 @@ mod tests { #[test] fn test_complex_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*(abc|ac|aaa)") + let automaton1 = RegularExpression::parse(".*(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); @@ -293,16 +293,16 @@ mod tests { #[test] fn test_complex_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])") + let automaton1 = 
RegularExpression::parse("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", false) .unwrap() .to_automaton().unwrap(); - let automaton2 = RegularExpression::new("avb@.*") + let automaton2 = RegularExpression::parse("avb@.*", false) .unwrap() .to_automaton() .unwrap(); - automaton1.to_dot(); - automaton2.to_dot(); + automaton1.print_dot(); + automaton2.print_dot(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert!(!intersection.is_empty()); @@ -310,33 +310,4 @@ mod tests { assert!(intersection.match_string("avb@gmail.com")); Ok(()) } - - #[test] - fn test_intersection_par() -> Result<(), String> { - let c = 14; - let mut automaton_list = Vec::with_capacity(c); - - for i in 0..c { - automaton_list.push( - RegularExpression::new(&format!(".*{i}.*")) - .unwrap() - .to_automaton() - .unwrap(), - ) - } - - // FastAutomaton::intersection_all(automaton_list.iter().collect::>()); - - // 3.76 - // 4.47 - // 3.84 - - let _ = FastAutomaton::intersection_all_par(automaton_list.iter().collect::>()); - - // 0.59 - // 0.55 - // 0.53 - - Ok(()) - } } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 54bcba3..b21d25f 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -17,7 +17,7 @@ impl FastAutomaton { let reacheable_states = self.get_reacheable_states(); let mut dead_states = IntSet::default(); - for from_state in self.all_states_iter() { + for from_state in self.states() { if !reacheable_states.contains(&from_state) { dead_states.insert(from_state); } @@ -35,11 +35,11 @@ mod 
tests { #[test] fn test_remove_dead_states() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index b49207b..cdfd36b 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -21,7 +21,7 @@ impl FastAutomaton { let automaton_to_repeat = self.clone(); - if min == 0 && self.state_in_degree(self.start_state) != 0 { + if min == 0 && self.in_degree(self.start_state) != 0 { let new_state = self.new_state(); if self.is_accepted(&self.start_state) { self.accept(new_state); @@ -58,8 +58,8 @@ impl FastAutomaton { let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.state_out_degree(accept_state) == 0 - && automaton_to_repeat.state_in_degree(automaton_to_repeat.start_state) == 0 + && automaton_to_repeat.out_degree(accept_state) == 0 + && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 { automaton_to_repeat .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); @@ -112,7 +112,7 @@ mod tests { #[test] fn test_repeat_1() -> Result<(), String> { - let automaton = RegularExpression::new("(a*,a*)?") + let automaton = RegularExpression::parse("(a*,a*)?", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index e71f346..8e80b39 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -14,8 +14,9 @@ impl FastAutomaton { } /// Computes the union of all automatons in the given iterator. 
- pub fn union_all<'a, I: IntoIterator>(automatons: I) -> Result - { + pub fn union_all<'a, I: IntoIterator>( + automatons: I, + ) -> Result { let mut new_automaton = FastAutomaton::new_empty(); for automaton in automatons { new_automaton.union_mut(automaton)?; @@ -24,30 +25,33 @@ impl FastAutomaton { } /// Computes in parallel the union of all automatons in the given iterator. - pub fn union_all_par<'a, I: IntoParallelIterator>(automatons: I) -> Result - { + pub fn union_all_par<'a, I: IntoParallelIterator>( + automatons: I, + ) -> Result { let execution_profile = ExecutionProfile::get(); let empty = FastAutomaton::new_empty(); - automatons.into_par_iter() - .try_fold( - || empty.clone(), - |mut acc, next| { - execution_profile.apply(|| { - acc.union_mut(next)?; - Ok(acc) - }) - }, - ).try_reduce( - || empty.clone(), - |mut acc, next| { - execution_profile.apply(|| { - acc.union_mut(&next)?; - Ok(acc) - }) - }, - ) + automatons + .into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ) + .try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) } fn prepare_start_states( @@ -56,9 +60,13 @@ impl FastAutomaton { new_states: &mut IntMap, condition_converter: &ConditionConverter, ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.state_out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.state_in_degree(self.start_state); - let other_start_state_in_degree = other.state_in_degree(other.start_state); + let mut imcomplete_states = + IntSet::with_capacity(other.out_degree(other.start_state) + 1); + if other.is_accepted(&other.start_state) { + self.accept(self.start_state); + } + let self_start_state_in_degree = self.in_degree(self.start_state); + let other_start_state_in_degree = other.in_degree(other.start_state); if self_start_state_in_degree == 0 
&& other_start_state_in_degree == 0 { // The start states can be the same state without any consequence new_states.insert(other.start_state, self.start_state); @@ -66,29 +74,22 @@ impl FastAutomaton { } else { if self_start_state_in_degree != 0 { let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - for (cond, to_state) in self.transitions_from_vec(self.start_state) - { - self.add_transition(new_state, to_state, &cond); - } + self.add_epsilon_transition(new_state, self.start_state); self.start_state = new_state; + new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); } if other_start_state_in_degree != 0 { let new_state = self.new_state(); if other.is_accepted(&other.start_state) { self.accept(new_state); - self.accept(self.start_state); } new_states.insert(other.start_state, new_state); imcomplete_states.insert(new_state); - for (cond, other_to_state) in - other.transitions_from_vec(other.start_state) - { + for (cond, other_to_state) in other.transitions_from_vec(other.start_state) { let cond = condition_converter.convert(&cond)?; let to_state = match new_states.entry(other_to_state) { Entry::Occupied(o) => *o.get(), @@ -114,13 +115,13 @@ impl FastAutomaton { ) { let mut self_accept_states_without_outgoing_edges = vec![]; for &state in &self.accept_states { - if self.state_out_degree(state) == 0 && !imcomplete_states.contains(&state) { + if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { self_accept_states_without_outgoing_edges.push(state); } } let accept_state_without_outgoing_edges = match self_accept_states_without_outgoing_edges.len() { - 1 => self_accept_states_without_outgoing_edges[0], + 1 => Some(self_accept_states_without_outgoing_edges[0]), n if n > 1 => { let new_state = self.new_state(); self.accept(new_state); @@ -131,24 +132,23 @@ impl FastAutomaton { } self.remove_state(accept_state); } - new_state - } - _ => { - let new_state 
= self.new_state(); - self.accept(new_state); - new_state + Some(new_state) } + _ => None, }; for &state in &other.accept_states { - if other.state_out_degree(state) == 0 { - new_states - .entry(state) - .or_insert(accept_state_without_outgoing_edges); - } else if new_states.get(&state).is_none() { - let new_accept_state = self.new_state(); - self.accept(new_accept_state); - new_states.insert(state, new_accept_state); + match accept_state_without_outgoing_edges { + Some(accept_state) if other.out_degree(state) == 0 => { + new_states.entry(state).or_insert(accept_state); + } + _ => { + if new_states.get(&state).is_none() { + let new_accept_state = self.new_state(); + self.accept(new_accept_state); + new_states.insert(state, new_accept_state); + } + } } } } @@ -181,7 +181,7 @@ impl FastAutomaton { self.prepare_start_states(other, &mut new_states, &condition_converter)?; self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - for from_state in other.all_states_iter() { + for from_state in other.states() { let new_from_state = match new_states.entry(from_state) { Entry::Occupied(o) => *o.get(), Entry::Vacant(v) => { @@ -190,7 +190,7 @@ impl FastAutomaton { new_state } }; - for (condition, to_state) in other.transitions_from_iter(from_state) { + for (condition, to_state) in other.transitions_from(from_state) { let new_condition = condition_converter.convert(condition)?; let new_to_state = match new_states.entry(*to_state) { Entry::Occupied(o) => *o.get(), @@ -214,7 +214,7 @@ mod tests { #[test] fn test_simple_alternation_regex_1() -> Result<(), String> { - let automaton = RegularExpression::new("(abc|ac|aaa)") + let automaton = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); @@ -233,11 +233,11 @@ mod tests { #[test] fn test_simple_alternation_regex_2() -> Result<(), String> { - let automaton = RegularExpression::new("(b?|b{2})") + let automaton = RegularExpression::parse("(b?|b{2})", false) .unwrap() .to_automaton() 
.unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("b")); assert!(automaton.match_string("bb")); @@ -248,11 +248,27 @@ mod tests { #[test] fn test_simple_alternation_regex_3() -> Result<(), String> { - let automaton = RegularExpression::new("((a|bc)*|d)") + let automaton = RegularExpression::parse("((a|bc)*|d)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("")); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("abcaaabcbc")); + assert!(automaton.match_string("d")); + assert!(!automaton.match_string("ad")); + assert!(!automaton.match_string("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3b() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|(a|bc)*)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("")); assert!(automaton.match_string("a")); assert!(automaton.match_string("abcaaabcbc")); @@ -262,13 +278,30 @@ mod tests { Ok(()) } + #[test] + fn test_simple_alternation_regex_3t() -> Result<(), String> { + let automaton = RegularExpression::parse("(d*|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("")); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("abcaaabcbc")); + assert!(automaton.match_string("d")); + assert!(automaton.match_string("ddd")); + assert!(!automaton.match_string("ad")); + assert!(!automaton.match_string("abcd")); + Ok(()) + } + #[test] fn test_simple_alternation_regex_4() -> Result<(), String> { - let automaton = RegularExpression::new("(a+(ba+)*|ca*c)") + let automaton = RegularExpression::parse("(a+(ba+)*|ca*c)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("cc")); 
assert!(automaton.match_string("caaac")); assert!(automaton.match_string("a")); @@ -278,11 +311,11 @@ mod tests { #[test] fn test_simple_alternation_regex_5() -> Result<(), String> { - let automaton = RegularExpression::new("((aad|ads|a)*|q)") + let automaton = RegularExpression::parse("((aad|ads|a)*|q)", false) .unwrap() .to_automaton() .unwrap(); - automaton.to_dot(); + automaton.print_dot(); assert!(automaton.match_string("q")); assert!(automaton.match_string("aad")); assert!(automaton.match_string("ads")); @@ -294,4 +327,48 @@ mod tests { assert!(!automaton.match_string("qq")); Ok(()) } + + #[test] + fn test_simple_alternation_regex_6() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab|)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("ab")); + assert!(automaton.match_string("")); + assert!(!automaton.match_string("a")); + assert!(!automaton.match_string("b")); + assert!(!automaton.match_string("aab")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_7() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|a?|ab)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("a")); + assert!(automaton.match_string("d")); + assert!(automaton.match_string("ab")); + assert!(automaton.match_string("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_8() -> Result<(), String> { + let automaton = RegularExpression::parse("((d|a?|ab)u)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.match_string("au")); + assert!(automaton.match_string("du")); + assert!(automaton.match_string("abu")); + assert!(automaton.match_string("u")); + assert!(automaton.match_string("")); + Ok(()) + } } diff --git a/src/fast_automaton/serializer/mod.rs b/src/fast_automaton/serializer/mod.rs index aa06df0..7a40bae 100644 --- a/src/fast_automaton/serializer/mod.rs +++ 
b/src/fast_automaton/serializer/mod.rs @@ -134,7 +134,7 @@ mod tests { } fn assert_serialization(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, false).unwrap(); println!("{regex}"); let automaton = regex.to_automaton().unwrap(); @@ -153,11 +153,11 @@ mod tests { #[test] fn test_serialization_case_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*") + let automaton1 = RegularExpression::parse(".*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("\\d+") + let automaton2 = RegularExpression::parse("\\d+", false) .unwrap() .to_automaton() .unwrap(); diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 825ea7e..429c008 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -135,29 +135,25 @@ mod tests { #[test] fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair("(a|b)"); - assert_embedding_convertion_for_fair("(|a)"); - assert_embedding_convertion_for_fair(".*ab"); - assert_embedding_convertion_for_fair("toto"); - assert_embedding_convertion_for_fair(".{2,3}"); - assert_embedding_convertion_for_fair("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair(".*q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair( + assert_embedding_convertion("(a|b)"); + assert_embedding_convertion("(|a)"); + assert_embedding_convertion(".*ab"); + assert_embedding_convertion("toto"); + assert_embedding_convertion(".{2,3}"); + assert_embedding_convertion("q(ab|ca|ab|abc)x"); + assert_embedding_convertion(".*q(ab|ca|ab|abc)x"); + assert_embedding_convertion( "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", ); - assert_embedding_convertion_for_fair( + assert_embedding_convertion( 
"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", ); Ok(()) } - fn assert_embedding_convertion_for_fair(regex: &str) { - assert_embedding_convertion(regex); - } - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, false).unwrap(); println!("{}", regex); let automaton = regex.to_automaton().unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 9c8e6ad..ea1bb71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,26 +41,26 @@ pub type CharRange = RangeSet; /// /// // Concatenate /// let concat = t1.concat(&[t2])?; -/// assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); +/// assert_eq!(concat.to_pattern(), "abc.*xyz"); /// /// // Union /// let union = t1.union(&[Term::from_pattern("fgh")?])?; -/// assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); +/// assert_eq!(union.to_pattern(), "(abc.*|fgh)"); /// /// // Intersection /// let inter = Term::from_pattern("(ab|xy){2}")? /// .intersection(&[Term::from_pattern(".*xy")?])?; -/// assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); +/// assert_eq!(inter.to_pattern(), "(ab|xy)xy"); /// /// // Difference /// let diff = Term::from_pattern("a*")? /// .difference(&Term::from_pattern("")?)?; -/// assert_eq!(diff.to_pattern().unwrap(), "a+"); +/// assert_eq!(diff.to_pattern(), "a+"); /// /// // Repetition /// let rep = Term::from_pattern("abc")? 
/// .repeat(2, Some(4))?; -/// assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); +/// assert_eq!(rep.to_pattern(), "(abc){2,4}"); /// /// // Analyze /// assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -388,7 +388,7 @@ impl Term { let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.are_equivalent(&automaton_2) + automaton_1.equivalent(&automaton_2) } /// Returns `true` if all strings matched by the current term are also matched by the given term. @@ -410,7 +410,7 @@ impl Term { let automaton_1 = self.to_automaton()?; let automaton_2 = that.to_automaton()?; - automaton_1.is_subset_of(&automaton_2) + automaton_1.subset(&automaton_2) } /// Checks if the term matches the empty language. @@ -465,17 +465,17 @@ impl Term { }) } - /// Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. - pub fn to_regex(&self) -> Option> { - Some(match self { + /// Converts the term to a RegularExpression. + pub fn to_regex(&self) -> Cow { + match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), - Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()?), - }) + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()), + } } - /// Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. - pub fn to_pattern(&self) -> Option { - Some(self.to_regex()?.to_string()) + /// Converts the term to a regular expression pattern. 
+ pub fn to_pattern(&self) -> String { + self.to_regex().to_string() } fn determinize_subtrahend<'a>( @@ -523,12 +523,12 @@ impl Term { fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { let mut regex_list = Vec::with_capacity(terms.len() + 1); - regex_list.push(self.to_regex()?); + regex_list.push(self.to_regex()); let mut terms_regexes = terms .iter() .map(Term::to_regex) - .collect::>>()?; + .collect::>(); regex_list.append(&mut terms_regexes); Some(regex_list) @@ -548,7 +548,7 @@ mod tests { let intersection = regex1.intersection(&vec![regex2]).unwrap(); assert!(intersection.is_empty()); - assert_eq!("[]", intersection.to_pattern().unwrap()); + assert_eq!("[]", intersection.to_pattern()); Ok(()) } @@ -560,7 +560,7 @@ mod tests { let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("a+", result); Ok(()) @@ -573,9 +573,9 @@ mod tests { let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap().to_regex().unwrap().into_owned(); + let result = result.unwrap().to_regex().into_owned(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), + Term::RegularExpression(RegularExpression::new("x(x{3})*x?").unwrap()), Term::RegularExpression(result) ); @@ -589,7 +589,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("", result); Ok(()) @@ -602,7 +602,7 @@ mod tests { let result = regex1.intersection(&vec![regex2]); assert!(result.is_ok()); - let result = result.unwrap().to_pattern().unwrap(); + let result = result.unwrap().to_pattern(); assert_eq!("(x{3})*", result); Ok(()) diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 799c69d..e5c8f9b 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,8 +11,13 
@@ lazy_static! { } impl RegularExpression { - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses and simplify the provided pattern and returns the resulting [`RegularExpression`]. pub fn new(pattern: &str) -> Result { + Self::parse(pattern, true) + } + + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. + pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } @@ -24,7 +29,7 @@ impl RegularExpression { .build() .parse(&Self::remove_flags(pattern)) { - Ok(hir) => Self::convert_to_regex(&hir), + Ok(hir) => Self::convert_to_regex(&hir, simplify), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), } } @@ -52,7 +57,7 @@ impl RegularExpression { RegularExpression::Concat(VecDeque::new()) } - fn convert_to_regex(hir: &Hir) -> Result { + fn convert_to_regex(hir: &Hir, simplify: bool) -> Result { match hir.kind() { HirKind::Empty => Ok(RegularExpression::new_empty_string()), HirKind::Literal(literal) => { @@ -84,15 +89,26 @@ impl RegularExpression { HirKind::Look(_) => Ok(RegularExpression::new_empty_string()), HirKind::Repetition(repetition) => { let (min, max) = (repetition.min, repetition.max); - Self::convert_to_regex(&repetition.sub).map(|v| v.repeat(min, max)) + let regex = Self::convert_to_regex(&repetition.sub, simplify)?; + Ok(if simplify { + regex.repeat(min, max) + } else { + RegularExpression::Repetition(Box::new(regex), min, max) + }) } - HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub), + HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub, simplify), HirKind::Concat(concat) => { let mut concat_regex = RegularExpression::Concat(VecDeque::with_capacity(concat.len())); for c in concat { - let concat_value = Self::convert_to_regex(c)?; - concat_regex = concat_regex.concat(&concat_value, true); + let concat_value = Self::convert_to_regex(c, simplify)?; + if simplify { + 
concat_regex = concat_regex.concat(&concat_value, true); + } else if let RegularExpression::Concat(values) = concat_regex { + let mut values = values.clone(); + values.push_back(concat_value); + concat_regex = RegularExpression::Concat(values); + } } Ok(concat_regex) } @@ -100,8 +116,14 @@ impl RegularExpression { let mut alternation_regex = RegularExpression::Alternation(Vec::with_capacity(alternation.len())); for a in alternation { - let alternation_value = Self::convert_to_regex(a)?; - alternation_regex = alternation_regex.union(&alternation_value); + let alternation_value = Self::convert_to_regex(a, simplify)?; + if simplify { + alternation_regex = alternation_regex.union(&alternation_value); + } else if let RegularExpression::Alternation(values) = alternation_regex { + let mut values = values.clone(); + values.push(alternation_value); + alternation_regex = RegularExpression::Alternation(values); + } } Ok(alternation_regex) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 05908c0..2f5062f 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -151,6 +151,96 @@ impl RegularExpression { } } } + + pub fn evaluate_complexity(&self) -> f64 { + let (score, depth, _) = self.eval_inner(); + score + Self::depth_penalty(depth) + } + + // returns: (score, max_depth, contains_repetition) + fn eval_inner(&self) -> (f64, usize, bool) { + match self { + RegularExpression::Character(range) => { + let len = range.to_regex().len() as f64; + // small, capped cost for raw length + let base = 1.0 + 0.05 * len.min(40.0); + (base, 1, false) + } + + RegularExpression::Repetition(inner, min, max_opt) => { + let (inner_score, inner_depth, inner_has_rep) = inner.eval_inner(); + + // multipliers tuned for readability impact + let mut m = match max_opt { + None => 1.6, // open upper bound like a+ or a{m,} + Some(max) if max > min => 1.3, // variable upper bound a{m,n} + Some(max) if max == min && *min > 1 => 1.1, // exact count a{n} + _ => 1.0, // a{1} or degenerate + }; + + // 
nested quantifiers like (?:...+)+ are harder + if inner_has_rep { + m *= 1.5; + } + + (inner_score * m, inner_depth + 1, true) + } + + RegularExpression::Concat(items) => { + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for (i, it) in items.iter().enumerate() { + let (s, d, h) = it.eval_inner(); + sum += s; + if i > 0 { + // tiny discount: linear sequences are relatively easy to read + sum *= 0.98; + } + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + (sum, max_depth + 1, has_rep) + } + + RegularExpression::Alternation(branches) => { + if branches.is_empty() { + return (0.0, 1, false); + } + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for b in branches { + let (s, d, h) = b.eval_inner(); + sum += s; + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + // branching cost: more alternatives = harder to scan + let k = branches.len() as f64; + let multiplier = 1.0 + 0.15 * (k - 1.0); + + (sum * multiplier, max_depth + 1, has_rep) + } + } + } + + fn depth_penalty(depth: usize) -> f64 { + // no penalty up to depth 2, then quadratic growth + if depth <= 2 { + 0.0 + } else { + ((depth - 2) as f64).powi(2) * 0.8 + } + } } #[cfg(test)] diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index fe83c46..4e346f8 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -179,17 +179,9 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { - Some(RegularExpression::Repetition( - Box::new(this.clone()), - 2, - Some(2), - )) + Some(this.repeat(2, Some(2))) } } else if let ( RegularExpression::Repetition(this_regex, this_min, this_max_opt), @@ -204,11 +196,8 @@ impl RegularExpression { } else { None }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + + 
Some(this_regex.repeat(new_min, new_max_opt)) } else if let ( RegularExpression::Character(this_range), RegularExpression::Character(that_range), @@ -227,11 +216,7 @@ impl RegularExpression { if **this_regex == *that { let new_min = this_min + 1; let new_max_opt = this_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + Some(this_regex.repeat(new_min, new_max_opt)) } else { None } @@ -239,11 +224,7 @@ impl RegularExpression { if **that_regex == *this { let new_min = that_min + 1; let new_max_opt = that_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - that_regex.clone(), - new_min, - new_max_opt, - )) + Some(that_regex.repeat(new_min, new_max_opt)) } else { None } diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 86ddbe5..c5578ca 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -248,6 +248,6 @@ mod tests { let result = got.to_automaton().unwrap(); - assert!(repeat.are_equivalent(&result).unwrap()); + assert!(repeat.equivalent(&result).unwrap()); } } diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8e4f1f3..ee6abee 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -9,8 +9,9 @@ impl RegularExpression { } /// Returns a regular expression that is the union of all expressions in `patterns`. 
- pub fn union_all<'a, I: IntoIterator>(patterns: I) -> RegularExpression - { + pub fn union_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); for other in patterns { @@ -104,11 +105,7 @@ impl RegularExpression { ) = (this_character, that_repetition) { if this_character == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { let mut alternate = vec![this_character.clone(), that_repetition.clone()]; alternate.sort_unstable(); @@ -139,18 +136,10 @@ impl RegularExpression { self_regex.union_(&other_regex) } } else { - Cow::Owned(RegularExpression::Repetition( - Box::new(self_regex), - 0, - Some(1), - )) + Cow::Owned(self_regex.repeat(0, Some(1))) } } else if !other_regex.is_empty_string() { - Cow::Owned(RegularExpression::Repetition( - Box::new(other_regex), - 0, - Some(1), - )) + Cow::Owned(other_regex.repeat(0, Some(1))) } else { Cow::Owned(RegularExpression::new_empty_string()) }; @@ -228,11 +217,7 @@ impl RegularExpression { ) = (this_concat, that_repetition) { if this_concat == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { Self::opunion_common_affixes(this_concat, that_repetition) } @@ -288,18 +273,13 @@ impl RegularExpression { || this_max + 1 == *that_min || that_max + 1 == *this_min { - return RegularExpression::Repetition( - this_regex.clone(), + return this_regex.repeat( cmp::min(*this_min, *that_min), Some(cmp::max(*this_max, *that_max)), ); } } else { - return RegularExpression::Repetition( - this_regex.clone(), - cmp::min(*this_min, *that_min), - None, - ); + return this_regex.repeat(cmp::min(*this_min, *that_min), None); } } @@ -321,11 +301,7 @@ 
impl RegularExpression { ) = (this_repetition, that_alternation) { if that_alternation == &**this_regex && *this_min <= 2 { - RegularExpression::Repetition( - this_regex.clone(), - cmp::min(1, *this_min), - *this_max_opt, - ) + this_regex.repeat(cmp::min(1, *this_min), *this_max_opt) } else { let mut set = BTreeSet::new(); diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 4dd7e47..9609c8f 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -22,13 +22,13 @@ fn assert_regex(regex: &str) { assert!(re.is_match(&string), "'{string}'"); } - assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); - assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.are_equivalent(&determinized_automaton).unwrap()); + assert!(automaton.subset(&determinized_automaton).unwrap()); + assert!(determinized_automaton.subset(&automaton).unwrap()); + assert!(automaton.equivalent(&determinized_automaton).unwrap()); - let regex_from_automaton = automaton.to_regex().unwrap(); + let regex_from_automaton = automaton.to_regex(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.are_equivalent(&automaton_from_regex).unwrap()); + assert!(automaton.equivalent(&automaton_from_regex).unwrap()); } #[test] From 135cca6cd5a9f44c6319ae74996f2d9f49001d9f Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:09:59 +0200 Subject: [PATCH 25/62] update tests --- benches/my_benchmark.rs | 2 +- tests/data/regex-todo.txt | 6 ------ tests/data/regex.txt | 8 +++++++- tests/integration_tests.rs | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index c35164a..71898ec 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -6,7 +6,7 @@ fn parse_regex(regex: &str) -> RegularExpression { } fn to_regex(automaton: &FastAutomaton) -> 
RegularExpression { - automaton.to_regex().unwrap() + automaton.to_regex() } fn determinize(automaton: &FastAutomaton) -> FastAutomaton { diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt index 05849d6..e69de29 100644 --- a/tests/data/regex-todo.txt +++ b/tests/data/regex-todo.txt @@ -1,6 +0,0 @@ -(a*,a*)* -#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) -\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} -rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) -[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? -<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/data/regex.txt b/tests/data/regex.txt index 3eebe62..31aa829 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -70,4 +70,10 @@ https?://[^\s/$.?#][^\s]* [[:alnum:]&&[^0-9]] [ \t]+ [\r\n]+ -[^\t\r\n]+ \ No newline at end of file +[^\t\r\n]+ +(a*,a*)* +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? 
+<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 9609c8f..319f261 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -9,7 +9,7 @@ use regexsolver::regex::RegularExpression; fn assert_regex(regex: &str) { let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, true).unwrap(); let automaton = regex.to_automaton().unwrap(); let strings = automaton.generate_strings(500).unwrap(); for string in strings { From c3d800a998f82c7caa8413e5d1113e60928ba282 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:12:42 +0200 Subject: [PATCH 26/62] fix clippy --- src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs index bf5e682..33142ca 100644 --- a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -70,7 +70,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) has_transition_to_start_state = true; } - this_condition = this_condition.union(&condition); + this_condition = this_condition.union(condition); } if !has_transition_to_start_state { // Some state(s) do not have transition to the start state. 
From afbacc6941d046d444aeb0e75d3092bba6fe037d Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:24:28 +0200 Subject: [PATCH 27/62] add test --- .../convert/to_regex/transform/mod.rs | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs index 552222e..643e6ba 100644 --- a/src/fast_automaton/convert/to_regex/transform/mod.rs +++ b/src/fast_automaton/convert/to_regex/transform/mod.rs @@ -14,3 +14,34 @@ pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { automaton } + +#[cfg(test)] +mod tests { + use crate::{ + fast_automaton::convert::to_regex::transform::transform, regex::RegularExpression, + }; + + #[test] + fn test_equivalence() -> Result<(), String> { + assert_equivalent("abc"); + assert_equivalent(".*abc"); + assert_equivalent(".*abc.*def"); + assert_equivalent(".*abc.*def(ab|fr)"); + assert_equivalent(".*abc.*def(ab|fr).*mpa"); + + Ok(()) + } + + fn assert_equivalent(pattern: &str) { + let before = RegularExpression::parse(pattern, false) + .unwrap() + .to_automaton() + .unwrap(); + + let before = before.determinize().unwrap(); + + let after = transform(&before); + + assert!(before.equivalent(&after).unwrap()); + } +} From 7b576f72dcda1f65fd1c2bc77f465197235b7482 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:41:29 +0200 Subject: [PATCH 28/62] update readme --- README.md | 46 ++++++++++++++++++++++------------------------ src/lib.rs | 12 ++++++------ 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 4406477..b3baf47 100644 --- a/README.md +++ b/README.md @@ -38,26 +38,26 @@ fn main() -> Result<(), EngineError> { // Concatenate let concat = t1.concat(&[t2])?; - assert_eq!(concat.to_pattern().unwrap(), "abc.*xyz"); + 
assert_eq!(concat.to_pattern(), "abc.*xyz"); // Union let union = t1.union(&[Term::from_pattern("fgh")?])?; - assert_eq!(union.to_pattern().unwrap(), "(abc.*|fgh)"); + assert_eq!(union.to_pattern(), "(abc.*|fgh)"); // Intersection let inter = Term::from_pattern("(ab|xy){2}")? .intersection(&[Term::from_pattern(".*xy")?])?; - assert_eq!(inter.to_pattern().unwrap(), "(ab|xy)xy"); + assert_eq!(inter.to_pattern(), "(ab|xy)xy"); // Difference let diff = Term::from_pattern("a*")? .difference(&Term::from_pattern("")?)?; - assert_eq!(diff.to_pattern().unwrap(), "a+"); + assert_eq!(diff.to_pattern(), "a+"); // Repetition let rep = Term::from_pattern("abc")? .repeat(2, Some(4))?; - assert_eq!(rep.to_pattern().unwrap(), "(abc){2,4}"); + assert_eq!(rep.to_pattern(), "(abc){2,4}"); // Analyze assert_eq!(rep.get_length(), (Some(6), Some(12))); @@ -71,8 +71,8 @@ fn main() -> Result<(), EngineError> { // Equivalence & subset let a = Term::from_pattern("a+")?; let b = Term::from_pattern("a*")?; - assert!(!a.are_equivalent(&b)?); - assert!(a.is_subset_of(&b)?); + assert!(!a.equivalent(&b)?); + assert!(a.subset(&b)?); Ok(()) } @@ -118,17 +118,17 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `are_equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | +| `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | | `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `is_empty(&self)` | `bool` | Checks if the term matches the empty language. 
| | `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | -| `is_subset_of(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `is_total(&self)` | `bool` | Checks if the term matches all possible strings. | +| `subset(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | -| `to_pattern(&self)` | `Option` | Converts the term to a regular expression pattern; returns `None` if conversion isn’t possible. | -| `to_regex(&self)` | `Option>` | Converts the term to a RegularExpression; returns `None` if conversion isn’t possible. | +| `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | +| `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | ### FastAutomaton @@ -192,17 +192,16 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `all_states_iter(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | -| `all_states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | -| `are_equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `direct_states_iter(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | +| `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. 
| | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | +| `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | -| `get_condition_mut(&mut self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a mutable reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | @@ -214,14 +213,12 @@ This design allows us to perform unions, intersections, and complements of trans | `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. 
| -| `is_subset_of(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `state_in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `state_out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `to_regex(&self)` | `Option` | Attempts to convert the automaton to a `RegularExpression`; returns `None` if no equivalent pattern are found. | -| `transitions_from_into_iter(&self, state: State)` | `impl Iterator` | Returns an owned iterator over transitions from the given state. | -| `transitions_from_iter(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | -| `transitions_from_iter_mut(&mut self, state: State)` | `impl Iterator` | Returns a mutable iterator over transitions from the given state. | +| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | +| `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | | `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. 
| @@ -234,10 +231,11 @@ This design allows us to perform unions, intersections, and complements of trans | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | -| `new(pattern: &str)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. | +| `new(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. | | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the given regular expression pattern and returns a corresponding `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | | `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. 
| diff --git a/src/lib.rs b/src/lib.rs index ea1bb71..5c52410 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,8 +74,8 @@ pub type CharRange = RangeSet; /// // Equivalence & subset /// let a = Term::from_pattern("a+")?; /// let b = Term::from_pattern("a*")?; -/// assert!(!a.are_equivalent(&b)?); -/// assert!(a.is_subset_of(&b)?); +/// assert!(!a.equivalent(&b)?); +/// assert!(a.subset(&b)?); /// /// Ok(()) /// } @@ -379,9 +379,9 @@ impl Term { /// let term1 = Term::from_pattern("(abc|de)").unwrap(); /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { + pub fn equivalent(&self, that: &Term) -> Result { if self == that { return Ok(true); } @@ -401,9 +401,9 @@ impl Term { /// let term1 = Term::from_pattern("de").unwrap(); /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// - /// assert!(term1.is_subset_of(&term2).unwrap()); + /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn is_subset_of(&self, that: &Term) -> Result { + pub fn subset(&self, that: &Term) -> Result { if self == that { return Ok(true); } From 0a0d91ba57cc3bff05aff3b80365fbba006451d5 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:55:00 +0200 Subject: [PATCH 29/62] update readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b3baf47..2afb543 100644 --- a/README.md +++ b/README.md @@ -94,13 +94,13 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### Term -`Term` is an enum designed to represent either a regular expression or a compiled automaton. This unified representation enables seamless and efficient execution of set operations across multiple instances. 
It's particularly valuable when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. +`Term` is an enum designed to represent either a regular expression or an automaton. Used when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. #### Build | Method | Return | Description | | -------- | ------- | ------- | | `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. | -| `from_pattern(pattern: &str)` | `Result` | Parses the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | +| `from_pattern(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | | `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | | `new_empty()` | `Term` | Creates a term that matches the empty language. | | `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. | @@ -133,7 +133,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re ### FastAutomaton -`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. Not all automata can be converted to a regular expression. +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. 
To build a `Condition`, call: ```rust @@ -192,8 +192,6 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | -| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | -| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | @@ -216,8 +214,10 @@ This design allows us to perform unions, intersections, and complements of trans | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | +| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | +| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. 
| | `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. | From 05b68020137c2777d6076bac8fe713f8c4babc8f Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:59:05 +0200 Subject: [PATCH 30/62] update method signature --- README.md | 2 +- src/fast_automaton/builder.rs | 8 ++++---- src/regex/mod.rs | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2afb543..72e10fa 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ This design allows us to perform unions, intersections, and complements of trans | `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | | `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | | `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | -| `new_from_range(range: &CharRange)` | `Result` | Creates an automaton that matches one of the characters in the given `CharRange`. | +| `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | | `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. | diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index da16808..4346fd2 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -39,19 +39,19 @@ impl FastAutomaton { } /// Creates an automaton that matches one of the characters in the given [`CharRange`]. 
- pub fn new_from_range(range: &CharRange) -> Result { + pub fn new_from_range(range: &CharRange) -> Self { let mut automaton = Self::new_empty(); if range.is_empty() { - return Ok(automaton); + return automaton; } let new_state = automaton.new_state(); let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); - let condition = Condition::from_range(range, &spanning_set)?; + let condition = Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); automaton.spanning_set = spanning_set; automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); - Ok(automaton) + automaton } /// Creates a new state and returns its identifier. diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 2f5062f..d3d2b14 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -129,7 +129,7 @@ impl RegularExpression { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; match self { - RegularExpression::Character(range) => FastAutomaton::new_from_range(range), + RegularExpression::Character(range) => Ok(FastAutomaton::new_from_range(range)), RegularExpression::Repetition(regular_expression, min, max_opt) => { let mut automaton = regular_expression.to_automaton()?; automaton.repeat_mut(*min, *max_opt)?; From a2dc371580d5b1f2fef367582ec731821fc5f919 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 17 Sep 2025 22:06:09 +0200 Subject: [PATCH 31/62] add concat all for regex --- README.md | 4 +-- src/regex/mod.rs | 10 +++---- src/regex/operation/concat.rs | 51 +++++++++++++++++------------------ 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 72e10fa..aa9505d 100644 --- a/README.md +++ b/README.md @@ -227,10 +227,11 @@ This design allows us to perform unions, intersections, and complements of trans `RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. 
Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`. -#### Build +#### Build/Manipulate | Method | Return | Description | | -------- | ------- | ------- | | `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | +| `concat_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the concatenation of all expressions in `patterns`. | | `new(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. | | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. 
| @@ -292,7 +293,6 @@ execution_profile.run(|| { ## Cross-Language Support - If you want to use this library with other programming languages, we provide a wide range of wrappers: - [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) - [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) diff --git a/src/regex/mod.rs b/src/regex/mod.rs index d3d2b14..44feecd 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -172,13 +172,13 @@ impl RegularExpression { // multipliers tuned for readability impact let mut m = match max_opt { - None => 1.6, // open upper bound like a+ or a{m,} - Some(max) if max > min => 1.3, // variable upper bound a{m,n} - Some(max) if max == min && *min > 1 => 1.1, // exact count a{n} - _ => 1.0, // a{1} or degenerate + None => 1.6, + Some(max) if max > min => 1.3, + Some(max) if max == min && *min > 1 => 1.1, + _ => 1.0, }; - // nested quantifiers like (?:...+)+ are harder + // nested quantifiers like (...+)+ are harder if inner_has_rep { m *= 1.5; } diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 4e346f8..9cefb01 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,19 @@ use super::*; impl RegularExpression { + /// Returns a regular expression that is the concatenation of all expressions in `patterns`. + pub fn concat_all<'a, I: IntoIterator>( + patterns: I, + ) -> RegularExpression { + let mut result = RegularExpression::new_empty_string(); + + for other in patterns { + result = result.concat(other, true); + } + + result + } + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. 
pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { @@ -11,35 +24,19 @@ impl RegularExpression { return self.clone(); } - match (self, other) { + let (front, back) = if append_back { + (self, other) + } else { + (other, self) + }; + + match (front, back) { (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_concat_and_concat(self, other) - } else { - Self::opconcat_concat_and_concat(other, self) - } - } - (RegularExpression::Concat(_), _) => { - if append_back { - Self::opconcat_concat_and_other(self, other) - } else { - Self::opconcat_other_and_concat(other, self) - } - } - (_, RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_other_and_concat(self, other) - } else { - Self::opconcat_concat_and_other(other, self) - } - } - (_, _) => { - if append_back { - Self::opconcat_other_and_other(self, other) - } else { - Self::opconcat_other_and_other(other, self) - } + Self::opconcat_concat_and_concat(front, back) } + (RegularExpression::Concat(_), _) => Self::opconcat_concat_and_other(front, back), + (_, RegularExpression::Concat(_)) => Self::opconcat_other_and_concat(front, back), + (_, _) => Self::opconcat_other_and_other(front, back), } } From 7afac6256ba76c3d3c376f2d4a4ccbe1e1c41939 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:48:21 +0200 Subject: [PATCH 32/62] update docs --- README.md | 31 ++++++++++++-------- src/fast_automaton/builder.rs | 5 ++-- src/fast_automaton/mod.rs | 29 ++++++++---------- src/fast_automaton/operation/intersection.rs | 4 +-- src/fast_automaton/operation/repeat.rs | 2 +- src/lib.rs | 14 ++++----- src/regex/builder.rs | 4 +-- src/regex/mod.rs | 3 +- src/regex/operation/repeat.rs | 20 ++++++------- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index aa9505d..1173ea8 100644 --- 
a/README.md +++ b/README.md @@ -120,7 +120,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | -------- | ------- | ------- | | `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | -| `get_cardinality()` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | +| `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. | @@ -130,7 +130,6 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | | `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | - ### FastAutomaton `FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. @@ -170,8 +169,9 @@ This design allows us to perform unions, intersections, and complements of trans | `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. | | `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | | `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | -| `remove_state(&mut self, state: State)` | `()` | Removes the state and all its connected transitions; panics if it's a start state. 
| +| `remove_state(&mut self, state: State)` | `()` | Removes the state and its connected transitions; panics if it's a start state. | | `remove_states(&mut self, states: &IntSet)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. | +| `remove_transition(&mut self, from_state: State, to_state: State)` | `()` | Removes the transition between the two provided states if it exists. | #### Manipulate | Method | Return | Description | @@ -181,6 +181,7 @@ This design allows us to perform unions, intersections, and complements of trans | `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | | `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | | `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | +| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | | `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. | | `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. | @@ -192,35 +193,37 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. 
| | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `generate_strings(&self, number: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. | +| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. 
| +| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | +| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | +| `match_string(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. | | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | +| `print_dot(&self)` | `()` | Prints the automaton's DOT representation. | | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | | `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | -| `to_regex(&self)` | `RegularExpression` | Convert the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | -| `transitions_from_vec(&self, state: State)` | `Vec` | Returns a vector of transitions from the given state. | -| `transitions_to_vec(&self, state: State)` | `Vec` | Returns a vector of transitions to the given state. 
| +| `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. | +| `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. | ### RegularExpression @@ -236,8 +239,8 @@ This design allows us to perform unions, intersections, and complements of trans | `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | -| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the given regular expression pattern and returns a corresponding `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | -| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | +| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | | `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. 
| @@ -245,6 +248,7 @@ This design allows us to perform unions, intersections, and complements of trans #### Analyze | Method | Return | Description | | -------- | ------- | ------- | +| `evaluate_complexity(&self)` | `f64` | Returns a heuristic score for the readability of the pattern. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of possible matched strings. | | `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. | @@ -252,6 +256,7 @@ This design allows us to perform unions, intersections, and complements of trans | `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. | | `to_automaton(&self)` | `Result` | Converts the regular expression to an equivalent `FastAutomaton`. | + ## Bound Execution Use a thread-local `ExecutionProfile` to cap runtime or state explosion; hitting a limit returns a specific `EngineError`. diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 4346fd2..c12b366 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -193,6 +193,7 @@ impl FastAutomaton { } } + /// Removes the transition between the two provided states if it exists. pub fn remove_transition(&mut self, from_state: State, to_state: State) { self.assert_state_exists(from_state); if from_state != to_state { @@ -206,7 +207,7 @@ impl FastAutomaton { self.transitions[from_state].remove(&to_state); } - /// Removes the state and all its connected transitions; panics if it's a start state. + /// Removes the state and its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { @@ -236,7 +237,7 @@ impl FastAutomaton { } } - /// Remove the provided states from the automaton. 
Remove all the transitions they are connected to. Panic if one of the state is used as a start state. + /// Removes the given states and their connected transitions; panics if any is a start state. pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 7bf0313..c78604a 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -14,12 +14,6 @@ pub(crate) type Transitions = IntMap; /// The identifier of state in an [`FastAutomaton`] pub type State = usize; -/// A tuple containing the condition of a transition to a state. -pub type TransitionTo = (Condition, State); - -/// A tuple containing the condition of a transition from a state. -pub type TransitionFrom = (State, Condition); - mod analyze; mod builder; pub mod condition; @@ -125,8 +119,8 @@ impl FastAutomaton { self.direct_states(state).collect() } - /// Returns a vector containing the transitions to the provided state. - pub fn transitions_to_vec(&self, state: State) -> Vec { + /// Returns a vector of transitions to the given state. + pub fn transitions_to_vec(&self, state: State) -> Vec<(State, Condition)> { let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { for (condition, to_state) in self.transitions_from_vec(*from_state) { @@ -141,7 +135,7 @@ impl FastAutomaton { /// Returns a vector of transitions from the given state. #[inline] - pub fn transitions_from_vec(&self, state: State) -> Vec { + pub fn transitions_from_vec(&self, state: State) -> Vec<(Condition, State)> { self.transitions[state] .iter() .map(|(s, c)| (c.clone(), *s)) @@ -184,13 +178,13 @@ impl FastAutomaton { .collect() } - // Returns the number of states in the automaton. + /// Returns the number of states in the automaton. 
#[inline] pub fn get_number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } - // Returns a reference to the condition of the directed transition between the two states, if any. + /// Returns a reference to the condition of the directed transition between the two states, if any. #[inline] pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { self.transitions[from_state].get(&to_state) @@ -202,7 +196,7 @@ impl FastAutomaton { self.start_state } - // Returns a reference to the set of accept (final) states. + /// Returns a reference to the set of accept (final) states. #[inline] pub fn get_accept_states(&self) -> &IntSet { &self.accept_states @@ -238,21 +232,22 @@ impl FastAutomaton { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } - pub fn match_string(&self, input: &str) -> bool { + /// Returns `true` if the automaton matches the given string. + pub fn match_string(&self, string: &str) -> bool { let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); worklist.push_back((0, &self.start_state)); while let Some((position, current_state)) = worklist.pop_back() { - if input.len() == position { + if string.len() == position { if self.accept_states.contains(current_state) { return true; } continue; } - let curr_char = input.chars().nth(position).unwrap() as u32; + let curr_char = string.chars().nth(position).unwrap() as u32; for (cond, to_state) in self.transitions_from(*current_state) { if cond.has_character(&curr_char, &self.spanning_set).unwrap() { - if position + 1 == input.len() { + if position + 1 == string.len() { if self.accept_states.contains(to_state) { return true; } @@ -265,11 +260,13 @@ impl FastAutomaton { false } + /// Returns the automaton's DOT representation. #[inline] pub fn as_dot(&self) -> String { format!("{self}") } + /// Prints the automaton's DOT representation. 
#[inline] pub fn print_dot(&self) { println!("{self}"); diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 694d66c..b79ebab 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -126,7 +126,7 @@ impl FastAutomaton { Ok(Cow::Owned(new_automaton)) } - // Returns `true` if the two automata have a non-empty intersection. + /// Returns `true` if the two automata have a non-empty intersection. pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); @@ -195,7 +195,7 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self .transitions_from(state) .map(|(c, &s)| match condition_converter.convert(c) { diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index cdfd36b..c7ff2d8 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -1,7 +1,7 @@ use super::*; impl FastAutomaton { - // Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { diff --git a/src/lib.rs b/src/lib.rs index 5c52410..3d750f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -118,7 +118,7 @@ impl Term { Term::RegularExpression(RegularExpression::new_empty_string()) } - /// Parses the provided pattern and returns a new `Term` holding the resulting [`RegularExpression`]. 
+ /// Parses and simplifies the provided pattern and returns a new [`Term`] holding the resulting [`RegularExpression`]. /// /// # Example: /// @@ -381,13 +381,13 @@ impl Term { /// /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn equivalent(&self, that: &Term) -> Result { - if self == that { + pub fn equivalent(&self, term: &Term) -> Result { + if self == term { return Ok(true); } let automaton_1 = self.to_automaton()?; - let automaton_2 = that.to_automaton()?; + let automaton_2 = term.to_automaton()?; automaton_1.equivalent(&automaton_2) } @@ -403,13 +403,13 @@ impl Term { /// /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn subset(&self, that: &Term) -> Result { - if self == that { + pub fn subset(&self, term: &Term) -> Result { + if self == term { return Ok(true); } let automaton_1 = self.to_automaton()?; - let automaton_2 = that.to_automaton()?; + let automaton_2 = term.to_automaton()?; automaton_1.subset(&automaton_2) } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e5c8f9b..6d00471 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -11,12 +11,12 @@ lazy_static! { } impl RegularExpression { - /// Parses and simplify the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses and simplifies the provided pattern and returns the resulting [`RegularExpression`]. pub fn new(pattern: &str) -> Result { Self::parse(pattern, true) } - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If simplify is `true`, the expression is simplified during parsing. 
pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 44feecd..a7682f6 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -152,12 +152,13 @@ impl RegularExpression { } } + /// Returns a heuristic score for the readability of the pattern. pub fn evaluate_complexity(&self) -> f64 { let (score, depth, _) = self.eval_inner(); score + Self::depth_penalty(depth) } - // returns: (score, max_depth, contains_repetition) + /// Returns: (score, max_depth, contains_repetition) fn eval_inner(&self) -> (f64, usize, bool) { match self { RegularExpression::Character(range) => { diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index c5578ca..235f4f1 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,41 +1,41 @@ use super::*; impl RegularExpression { - /// Returns the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. - pub fn repeat(&self, o_min: u32, o_max_opt: Option) -> RegularExpression { + /// Computes the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded.
+ pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); } else if self.is_empty() { return RegularExpression::new_empty(); } else if self.is_empty_string() { return Self::new_empty_string(); - } else if let Some(max) = o_max_opt { - if max < o_min || max == 0 { + } else if let Some(max) = max_opt { + if max < min || max == 0 { return RegularExpression::new_empty_string(); - } else if o_min == 1 && max == 1 { + } else if min == 1 && max == 1 { return self.clone(); } } match self { RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { - let new_max = if let (Some(o_max), Some(i_max)) = (o_max_opt, i_max_opt) { + let new_max = if let (Some(o_max), Some(i_max)) = (max_opt, i_max_opt) { Some(o_max * i_max) } else { None }; - if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, o_min, o_max_opt) { + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { RegularExpression::Repetition( regular_expression.clone(), - o_min * i_min, + min * i_min, new_max, ) } else { - RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt) + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) } } - _ => RegularExpression::Repetition(Box::new(self.clone()), o_min, o_max_opt), + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), } } From 2119ea6880a53bfa93ff8dc1a994f36a52127ff6 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 20:58:18 +0200 Subject: [PATCH 33/62] update doc --- README.md | 2 +- src/regex/builder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1173ea8..c299a26 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ This design allows us to perform unions, intersections, and complements of trans | `new_empty()` | `RegularExpression` | Creates a regular expression that matches 
the empty language. | | `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | | `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | -| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If simplify is `true`, the expression is simplified during parsing. | +| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If `simplify` is `true`, the expression is simplified during parsing. | | `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | | `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 6d00471..7681b8e 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -16,7 +16,7 @@ impl RegularExpression { Self::parse(pattern, true) } - /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If simplify is `true`, the expression is simplified during parsing. + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If `simplify` is `true`, the expression is simplified during parsing. 
pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); From 0315e7d7fcbfbea88bf7723154b922b894889b67 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:25:55 +0200 Subject: [PATCH 34/62] additional updates --- README.md | 26 +- src/fast_automaton/analyze/cardinality.rs | 2 +- src/fast_automaton/analyze/mod.rs | 2 +- src/fast_automaton/builder.rs | 2 +- src/fast_automaton/generate.rs | 12 +- src/fast_automaton/mod.rs | 6 +- src/fast_automaton/operation/concat.rs | 226 +++++++++--------- src/fast_automaton/operation/determinize.rs | 4 +- src/fast_automaton/operation/difference.rs | 2 +- src/fast_automaton/operation/intersection.rs | 52 ++-- src/fast_automaton/operation/mod.rs | 4 +- src/fast_automaton/operation/repeat.rs | 12 +- src/fast_automaton/operation/union.rs | 134 +++++------ .../serializer/tokenizer/embed_automaton.rs | 2 +- .../serializer/tokenizer/mod.rs | 2 +- src/lib.rs | 4 +- src/regex/builder.rs | 16 +- 17 files changed, 254 insertions(+), 254 deletions(-) diff --git a/README.md b/README.md index c299a26..93f116f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) -**RegexSolver** is a high-performance Rust library for building, combining, and analyzing regular expressions and finite automata. Ideal for constraint solvers, code or test-case generators, and any system needing rich regex or automaton operations. +**RegexSolver** is a Rust library for building, combining, and analyzing regular expressions and finite automata. It is designed for constraint solvers, test generators, and other systems that need advanced regex and automaton operations. 
## Table of Contents @@ -81,11 +81,11 @@ fn main() -> Result<(), EngineError> { ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. +- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. - **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. - **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). -- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". 
- **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. @@ -178,17 +178,17 @@ This design allows us to perform unions, intersections, and complements of trans | -------- | ------- | ------- | | `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | | `concat(&self, other: &FastAutomaton)` | `Result` | Computes the concatenation between `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the concatenation of all automatons in the given iterator. | +| `concat_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the concatenation of all automata in the given iterator. | | `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | | `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | | `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | | `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | -| `intersection_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the intersection of all automatons in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the intersection of all automatons in the given iterator. 
| +| `intersection_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the intersection of all automata in the given iterator. | +| `intersection_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the intersection of all automata in the given iterator. | | `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | | `union(&self, other: &FastAutomaton)` | `Result` | Computes the union between `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automatons: I)` | `Result` | Computes the union of all automatons in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automatons: I)` | `Result` | Computes in parallel the union of all automatons in the given iterator. | +| `union_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the union of all automata in the given iterator. | +| `union_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the union of all automata in the given iterator. | #### Analyze | Method | Return | Description | @@ -196,26 +196,26 @@ This design allows us to perform unions, intersections, and complements of trans | `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | -| `does_transition_exists(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. 
| -| `generate_strings(&self, number: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. | -| `get_reacheable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | +| `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | -| `is_determinitic(&self)` | `bool` | Returns `true` if the automaton is deterministic. 
| +| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | +| `is_match(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. | | `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `match_string(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. | | `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | | `print_dot(&self)` | `()` | Prints the automaton's DOT representation. | | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index ccad761..4bbf9b7 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -10,7 +10,7 @@ impl FastAutomaton { } else if self.cyclic || self.is_total() { return Cardinality::Infinite; } - assert!(self.is_determinitic(), "The automaton should be deterministic."); + assert!(self.is_deterministic(), "The automaton should be deterministic."); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 3902460..dbc05f4 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -36,7 +36,7 @@ impl FastAutomaton { } /// Returns the set of all states reachable from the start state. 
- pub fn get_reacheable_states(&self) -> IntSet { + pub fn get_reachable_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); for from_state in self.states() { diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index c12b366..16f647b 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -342,6 +342,6 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert_eq!(deterministic, automaton.is_determinitic()); + assert_eq!(deterministic, automaton.is_deterministic()); } } diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 7532309..3e60f29 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -7,12 +7,12 @@ use super::*; impl FastAutomaton { /// Generates `count` strings matched by the automaton. - pub fn generate_strings(&self, number: usize) -> Result, EngineError> { + pub fn generate_strings(&self, count: usize) -> Result, EngineError> { if self.is_empty() { return Ok(Vec::new()); } - let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); + let mut strings = AHashSet::with_capacity(cmp::min(count, 1000)); let execution_profile = ExecutionProfile::get(); @@ -20,8 +20,8 @@ impl FastAutomaton { AHashMap::with_capacity(self.get_number_of_states()); let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(number, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); + VecDeque::with_capacity(cmp::min(count, 1000)); + let mut visited = AHashSet::with_capacity(cmp::min(count, 1000)); worklist.push_back((vec![], self.start_state)); while let Some((ranges, state)) = worklist.pop_front() { @@ -31,7 +31,7 @@ impl FastAutomaton { } else { let mut end = false; let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < number { + while strings.len() < count { 
execution_profile.assert_not_timed_out()?; let mut string = vec![]; for i in 0..ranges.len() { @@ -54,7 +54,7 @@ impl FastAutomaton { } } - if strings.len() == number { + if strings.len() == count { break; } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index c78604a..94ff5f4 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -157,7 +157,7 @@ impl FastAutomaton { /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] - pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { + pub fn has_transition(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { return false; } @@ -216,7 +216,7 @@ impl FastAutomaton { /// Returns `true` if the automaton is deterministic. #[inline] - pub fn is_determinitic(&self) -> bool { + pub fn is_deterministic(&self) -> bool { self.deterministic } @@ -233,7 +233,7 @@ impl FastAutomaton { } /// Returns `true` if the automaton matches the given string. - pub fn match_string(&self, string: &str) -> bool { + pub fn is_match(&self, string: &str) -> bool { let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); worklist.push_back((0, &self.start_state)); diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 6318c61..7eac73f 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -12,11 +12,11 @@ impl FastAutomaton { Self::concat_all([self, other]) } - /// Computes the concatenation of all automatons in the given iterator. - pub fn concat_all<'a, I: IntoIterator>(automatons: I) -> Result + /// Computes the concatenation of all automata in the given iterator. 
+ pub fn concat_all<'a, I: IntoIterator>(automata: I) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); - for automaton in automatons { + for automaton in automata { new_automaton.concat_mut(automaton)?; } @@ -141,10 +141,10 @@ mod tests { .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("abc")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("abc")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); Ok(()) } @@ -154,17 +154,17 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("0101abc")); - assert!(automaton.match_string("0101ac")); - assert!(automaton.match_string("0101aaa")); - assert!(!automaton.match_string("abc")); - assert!(!automaton.match_string("0101abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("0101abc")); + assert!(automaton.is_match("0101ac")); + assert!(automaton.is_match("0101aaa")); + assert!(!automaton.is_match("abc")); + assert!(!automaton.is_match("0101abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); Ok(()) } @@ -174,12 +174,12 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("AAABBB")); - assert!(automaton.match_string("AA")); - assert!(automaton.match_string("AB")); - assert!(!automaton.match_string("B")); - assert!(!automaton.match_string("ABA")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("AAABBB")); + 
assert!(automaton.is_match("AA")); + assert!(automaton.is_match("AB")); + assert!(!automaton.is_match("B")); + assert!(!automaton.is_match("ABA")); + assert!(!automaton.is_match("")); Ok(()) } @@ -189,11 +189,11 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaaaaaa")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaaaaaa")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); Ok(()) } @@ -204,11 +204,11 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("c")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aac")); - assert!(automaton.match_string("aaaaaaac")); - assert!(!automaton.match_string("abc")); + assert!(automaton.is_match("c")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aac")); + assert!(automaton.is_match("aaaaaaac")); + assert!(!automaton.is_match("abc")); Ok(()) } @@ -219,11 +219,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("ababab")); - assert!(automaton.match_string("abababab")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("abab")); - assert!(!automaton.match_string("ababababab")); + assert!(automaton.is_match("ababab")); + assert!(automaton.is_match("abababab")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("abab")); + assert!(!automaton.is_match("ababababab")); Ok(()) } @@ -234,10 +234,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aaaaa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); + assert!(automaton.is_match("aaa")); + 
assert!(automaton.is_match("aaaaa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); Ok(()) } @@ -248,10 +248,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); Ok(()) } @@ -262,11 +262,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); Ok(()) } @@ -277,11 +277,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); Ok(()) } @@ -292,15 +292,15 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(!automaton.match_string("aab")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("aaba")); - assert!(automaton.match_string("aabaaa")); - assert!(automaton.match_string("aaabaaabaaba")); - assert!(!automaton.match_string("aaabbaa")); + 
assert!(!automaton.is_match("")); + assert!(!automaton.is_match("aab")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("aaba")); + assert!(automaton.is_match("aabaaa")); + assert!(automaton.is_match("aaabaaabaaba")); + assert!(!automaton.is_match("aaabbaa")); Ok(()) } @@ -311,18 +311,18 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("acaadsac")); - assert!(automaton.match_string("adsaaaaaaaacaa")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("c")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("acaadsac")); + assert!(automaton.is_match("adsaaaaaaaacaa")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("c")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); Ok(()) } @@ -333,18 +333,18 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("ef")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("efadsa")); - assert!(automaton.match_string("aaadsefef")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("e")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - 
assert!(!automaton.match_string("s")); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("ef")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("efadsa")); + assert!(automaton.is_match("aaadsefef")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("e")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); Ok(()) } @@ -355,13 +355,13 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("bc")); - assert!(automaton.match_string("abcbca")); - assert!(automaton.match_string("bcabcbcaaaa")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("c")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("bc")); + assert!(automaton.is_match("abcbca")); + assert!(automaton.is_match("bcabcbcaaaa")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("c")); Ok(()) } @@ -372,14 +372,14 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); Ok(()) } @@ -390,14 +390,14 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - 
assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 73f8464..0217834 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -91,7 +91,7 @@ mod tests { let deterministic_automaton = automaton.determinize().unwrap(); - assert!(deterministic_automaton.is_determinitic()); + assert!(deterministic_automaton.is_deterministic()); Ok(()) } @@ -125,7 +125,7 @@ mod tests { "States After: {}", deterministic_automaton.get_number_of_states() ); - assert!(deterministic_automaton.is_determinitic()); + assert!(deterministic_automaton.is_deterministic()); assert!( automaton .difference(&deterministic_automaton) diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 4e2bd22..59ca6ed 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -6,7 +6,7 @@ use super::*; impl FastAutomaton { fn totalize(&mut self) -> Result<(), EngineError> { - assert!(self.is_determinitic(), "The automaton should be deterministic."); + assert!(self.is_deterministic(), "The automaton should be deterministic."); let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = diff --git a/src/fast_automaton/operation/intersection.rs 
b/src/fast_automaton/operation/intersection.rs index b79ebab..e373c55 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -14,13 +14,13 @@ impl FastAutomaton { FastAutomaton::intersection_all([self, other]) } - /// Computes the intersection of all automatons in the given iterator. + /// Computes the intersection of all automata in the given iterator. pub fn intersection_all<'a, I: IntoIterator>( - automatons: I, + automata: I, ) -> Result { let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); - for automaton in automatons { + for automaton in automata { result = result.intersection_internal(automaton)?; if result.is_empty() { @@ -31,15 +31,15 @@ impl FastAutomaton { Ok(result.into_owned()) } - /// Computes in parallel the intersection of all automatons in the given iterator. + /// Computes in parallel the intersection of all automata in the given iterator. pub fn intersection_all_par<'a, I: IntoParallelIterator>( - automatons: I, + automata: I, ) -> Result { let execution_profile = ExecutionProfile::get(); let total = FastAutomaton::new_total(); - automatons + automata .into_par_iter() .try_fold( || total.clone(), @@ -224,11 +224,11 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } @@ -244,9 +244,9 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(!intersection.match_string("a")); - 
assert!(!intersection.match_string("b")); + assert!(intersection.is_match("")); + assert!(!intersection.is_match("a")); + assert!(!intersection.is_match("b")); Ok(()) } @@ -262,11 +262,11 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(intersection.match_string("xxx")); - assert!(intersection.match_string("xxxxxx")); - assert!(!intersection.match_string("xx")); - assert!(!intersection.match_string("xxxx")); + assert!(intersection.is_match("")); + assert!(intersection.is_match("xxx")); + assert!(intersection.is_match("xxxxxx")); + assert!(!intersection.is_match("xx")); + assert!(!intersection.is_match("xxxx")); Ok(()) } @@ -282,12 +282,12 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("aaac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("aaac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } @@ -307,7 +307,7 @@ mod tests { assert!(!intersection.is_empty()); - assert!(intersection.match_string("avb@gmail.com")); + assert!(intersection.is_match("avb@gmail.com")); Ok(()) } } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index b21d25f..0805011 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -14,7 +14,7 @@ mod repeat; impl FastAutomaton { pub(crate) fn remove_dead_transitions(&mut self) { if !self.is_empty() { - let reacheable_states = self.get_reacheable_states(); + let reacheable_states = self.get_reachable_states(); let mut 
dead_states = IntSet::default(); for from_state in self.states() { @@ -45,7 +45,7 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_reacheable_states().len()); + assert_eq!(3, intersection.get_reachable_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index c7ff2d8..b94b328 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -116,12 +116,12 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("")); - assert!(automaton.match_string(",")); - assert!(automaton.match_string("aaa,")); - assert!(automaton.match_string("aaaa,aa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); + assert!(automaton.is_match("")); + assert!(automaton.is_match(",")); + assert!(automaton.is_match("aaa,")); + assert!(automaton.is_match("aaaa,aa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); Ok(()) } } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 8e80b39..db2f62c 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -13,26 +13,26 @@ impl FastAutomaton { Self::union_all([self, other]) } - /// Computes the union of all automatons in the given iterator. + /// Computes the union of all automata in the given iterator. pub fn union_all<'a, I: IntoIterator>( - automatons: I, + automata: I, ) -> Result { let mut new_automaton = FastAutomaton::new_empty(); - for automaton in automatons { + for automaton in automata { new_automaton.union_mut(automaton)?; } Ok(new_automaton) } - /// Computes in parallel the union of all automatons in the given iterator. + /// Computes in parallel the union of all automata in the given iterator. 
pub fn union_all_par<'a, I: IntoParallelIterator>( - automatons: I, + automata: I, ) -> Result { let execution_profile = ExecutionProfile::get(); let empty = FastAutomaton::new_empty(); - automatons + automata .into_par_iter() .try_fold( || empty.clone(), @@ -218,16 +218,16 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - assert!(automaton.match_string("abc")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); + assert!(automaton.is_match("abc")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); Ok(()) } @@ -238,11 +238,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("b")); - assert!(automaton.match_string("bb")); - assert!(!automaton.match_string("bbb")); - assert!(!automaton.match_string("bbbb")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("b")); + assert!(automaton.is_match("bb")); + assert!(!automaton.is_match("bbb")); + assert!(!automaton.is_match("bbbb")); Ok(()) } @@ -253,12 +253,12 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + 
assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -269,12 +269,12 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -285,13 +285,13 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(automaton.match_string("ddd")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ddd")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); Ok(()) } @@ -302,10 +302,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("cc")); - assert!(automaton.match_string("caaac")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aababa")); + assert!(automaton.is_match("cc")); + assert!(automaton.is_match("caaac")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aababa")); Ok(()) } @@ -316,15 +316,15 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("q")); - 
assert!(automaton.match_string("aad")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aadadsaaa")); - assert!(!automaton.match_string("aaaas")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("adsq")); - assert!(!automaton.match_string("qq")); + assert!(automaton.is_match("q")); + assert!(automaton.is_match("aad")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aadadsaaa")); + assert!(!automaton.is_match("aaaas")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("adsq")); + assert!(!automaton.is_match("qq")); Ok(()) } @@ -335,11 +335,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("ab")); - assert!(automaton.match_string("")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("aab")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("aab")); Ok(()) } @@ -350,10 +350,10 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("d")); - assert!(automaton.match_string("ab")); - assert!(automaton.match_string("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); Ok(()) } @@ -364,11 +364,11 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert!(automaton.match_string("au")); - assert!(automaton.match_string("du")); - assert!(automaton.match_string("abu")); - assert!(automaton.match_string("u")); - assert!(automaton.match_string("")); + assert!(automaton.is_match("au")); + assert!(automaton.is_match("du")); + assert!(automaton.is_match("abu")); + 
assert!(automaton.is_match("u")); + assert!(automaton.is_match("")); Ok(()) } } diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 429c008..3074a0f 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -32,7 +32,7 @@ impl Tokenizer<'_> { vec.push(AutomatonToken::AcceptState) } - for (condition, to_state) in self.automaton.transitions_from_iter(current_state) { + for (condition, to_state) in self.automaton.transitions_from(current_state) { if condition.is_empty() { continue; } diff --git a/src/fast_automaton/serializer/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs index 95ccb18..8dccca2 100644 --- a/src/fast_automaton/serializer/tokenizer/mod.rs +++ b/src/fast_automaton/serializer/tokenizer/mod.rs @@ -39,7 +39,7 @@ impl Tokenizer<'_> { state_counter += 1; automaton - .transitions_from_iter(current_state) + .transitions_from(current_state) .filter(|(c, _)| !c.is_empty()) .for_each(|(_, to_state)| { if !seen.contains(to_state) { diff --git a/src/lib.rs b/src/lib.rs index 3d750f0..dcff59b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,7 +449,7 @@ impl Term { pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => Ok(if !automaton.is_determinitic() { + Term::Automaton(automaton) => Ok(if !automaton.is_deterministic() { automaton.determinize()?.get_cardinality() } else { automaton.get_cardinality() @@ -482,7 +482,7 @@ impl Term { minuend: &FastAutomaton, subtrahend: &'a FastAutomaton, ) -> Result, EngineError> { - if subtrahend.is_determinitic() { + if subtrahend.is_deterministic() { Ok(Cow::Borrowed(subtrahend)) } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { Ok(Cow::Owned( diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 7681b8e..727bcfe 100644 
--- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -272,22 +272,22 @@ mod tests { let regex_parsed = RegularExpression::new(".").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("\t")); - assert!(automaton.match_string("\n")); - assert!(automaton.match_string("\r")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("\t")); + assert!(automaton.is_match("\n")); + assert!(automaton.is_match("\r")); let regex_parsed = RegularExpression::new("(?i)a").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("A")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("A")); let regex_parsed = RegularExpression::new("a(?i)a(?-s).").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("aa\n")); - assert!(!automaton.match_string("aAb")); + assert!(automaton.is_match("aa\n")); + assert!(!automaton.is_match("aAb")); assert!(RegularExpression::new("\\1").is_err()); Ok(()) From 1fd6bfcdbec4bf2434ca3a357207f266f266c1bb Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 19 Sep 2025 21:32:33 +0200 Subject: [PATCH 35/62] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93f116f..0a960fb 100644 --- a/README.md +++ b/README.md @@ -81,11 +81,11 @@ fn main() -> Result<(), EngineError> { ## Key Concepts & Limitations RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: +- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. 
For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. - **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. - **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. - **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". - **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. 
From 499735d77afc09060aa46affe37f89637715cf6e Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 20 Sep 2025 21:17:45 +0200 Subject: [PATCH 36/62] update docs --- README.md | 5 +++-- src/fast_automaton/convert/to_regex/mod.rs | 1 + src/lib.rs | 2 +- tests/data/regex-todo.txt | 0 4 files changed, 5 insertions(+), 3 deletions(-) delete mode 100644 tests/data/regex-todo.txt diff --git a/README.md b/README.md index 0a960fb..69025a2 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `subset(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | | `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | | `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | -| `to_regex(&self)` | `Cow` | Converts the term to a RegularExpression. | +| `to_regex(&self)` | `Cow` | Converts the term to a `RegularExpression`. | ### FastAutomaton @@ -197,7 +197,7 @@ This design allows us to perform unions, intersections, and complements of trans | `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | +| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. 
| | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | @@ -221,6 +221,7 @@ This design allows us to perform unions, intersections, and complements of trans | `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | | `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | | `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | +| `to_regex(&self)` | `RegularExpression` | Converts the automaton to a `RegularExpression`. | | `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. | | `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. | | `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. | diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 10a530e..6378449 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -4,6 +4,7 @@ mod state_elimination; mod transform; impl FastAutomaton { + /// Converts the automaton to a [`RegularExpression`]. pub fn to_regex(&self) -> RegularExpression { let transformed_automaton = transform::transform(self); state_elimination::convert_to_regex(&transformed_automaton) diff --git a/src/lib.rs b/src/lib.rs index dcff59b..de45599 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -465,7 +465,7 @@ impl Term { }) } - /// Converts the term to a RegularExpression. + /// Converts the term to a [`RegularExpression`].
pub fn to_regex(&self) -> Cow { match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), diff --git a/tests/data/regex-todo.txt b/tests/data/regex-todo.txt deleted file mode 100644 index e69de29..0000000 From b67597dbd58ff080587689dab173f1107db4c246 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 21 Sep 2025 08:38:35 +0200 Subject: [PATCH 37/62] update method signatures --- README.md | 6 +++--- src/fast_automaton/analyze/cardinality.rs | 6 +++--- src/fast_automaton/analyze/length.rs | 4 ++-- src/fast_automaton/builder.rs | 4 ++-- .../convert/to_regex/transform/shape/dotstar.rs | 6 +++--- src/fast_automaton/mod.rs | 10 +++++----- src/fast_automaton/operation/repeat.rs | 2 +- src/fast_automaton/operation/union.rs | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 69025a2..f45837d 100644 --- a/README.md +++ b/README.md @@ -194,8 +194,8 @@ This design allows us to perform unions, intersections, and complements of trans | Method | Return | Description | | -------- | ------- | ------- | | `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | -| `direct_states(&self, state: &State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | -| `direct_states_vec(&self, state: &State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | +| `direct_states(&self, state: State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | +| `direct_states_vec(&self, state: State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. 
| | `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | @@ -209,7 +209,7 @@ This design allows us to perform unions, intersections, and complements of trans | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | | `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `is_accepted(&self, state: &State)` | `bool` | Returns `true` if the given state is one of the accept states. | +| `is_accepted(&self, state: State)` | `bool` | Returns `true` if the given state is one of the accept states. | | `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. 
| diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 4bbf9b7..ec5f514 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -65,8 +65,8 @@ impl FastAutomaton { let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.states_vec() { - in_degree.entry(*from_state).or_insert(0); + for &from_state in &self.states_vec() { + in_degree.entry(from_state).or_insert(0); for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } @@ -80,7 +80,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.direct_states(&from_state) { + for to_state in self.direct_states(from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index bbec964..c753908 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -27,7 +27,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -54,7 +54,7 @@ impl FastAutomaton { } seen.insert(state); - for to_state in self.direct_states(&state) { + for to_state in self.direct_states(state) { if to_state == state || seen.contains(&to_state) { max = None; break; diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 16f647b..8b8f844 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -286,9 +286,9 @@ impl FastAutomaton { return Ok(()); } let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.states_vec() { + for &from_state in &self.states_vec() { for to_state 
in self.direct_states_vec(from_state) { - match self.transitions[*from_state].entry(to_state) { + match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { o.insert(condition_converter.convert(o.get())?); } diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs index 33142ca..6c91106 100644 --- a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs +++ b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs @@ -39,7 +39,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) let start_state = start_state.unwrap(); let mut first_hop = automaton - .direct_states(&start_state) + .direct_states(start_state) .filter(|&s| s != start_state) .collect::>(); let mut states_to_remove = vec![]; @@ -89,7 +89,7 @@ fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) automaton.add_transition(start_state, start_state, &out_condition.unwrap()); for &state in component { - for to_state in automaton.direct_states_vec(&state) { + for to_state in automaton.direct_states_vec(state) { if !component.contains(&to_state) { continue; } @@ -149,7 +149,7 @@ fn strongconnect( stack.push(v); on_stack[v] = true; - for w in automaton.direct_states(&v) { + for w in automaton.direct_states(v) { if indices[w] == -1 { strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); lowlink[v] = lowlink[v].min(lowlink[w]); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 94ff5f4..10b269c 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -106,8 +106,8 @@ impl FastAutomaton { /// Returns an iterator over states directly reachable from the given state in one transition. 
#[inline] - pub fn direct_states(&self, state: &State) -> impl Iterator + '_ { - self.transitions[*state] + pub fn direct_states(&self, state: State) -> impl Iterator + '_ { + self.transitions[state] .keys() .cloned() .filter(|s| !self.removed_states.contains(s)) @@ -115,7 +115,7 @@ impl FastAutomaton { /// Returns a vector of states directly reachable from the given state in one transition. #[inline] - pub fn direct_states_vec(&self, state: &State) -> Vec { + pub fn direct_states_vec(&self, state: State) -> Vec { self.direct_states(state).collect() } @@ -210,8 +210,8 @@ impl FastAutomaton { /// Returns `true` if the given state is one of the accept states. #[inline] - pub fn is_accepted(&self, state: &State) -> bool { - self.accept_states.contains(state) + pub fn is_accepted(&self, state: State) -> bool { + self.accept_states.contains(&state) } /// Returns `true` if the automaton is deterministic. diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index b94b328..f9e256d 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -23,7 +23,7 @@ impl FastAutomaton { if min == 0 && self.in_degree(self.start_state) != 0 { let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { + if self.is_accepted(self.start_state) { self.accept(new_state); } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index db2f62c..96480ea 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -62,7 +62,7 @@ impl FastAutomaton { ) -> Result, EngineError> { let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - if other.is_accepted(&other.start_state) { + if other.is_accepted(other.start_state) { self.accept(self.start_state); } let self_start_state_in_degree = self.in_degree(self.start_state); @@ -82,7 +82,7 @@ impl FastAutomaton { } if other_start_state_in_degree != 0 { 
let new_state = self.new_state(); - if other.is_accepted(&other.start_state) { + if other.is_accepted(other.start_state) { self.accept(new_state); } From c243522264bd8f4774551d9b69e9a3326b28e871 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 21 Sep 2025 20:02:22 +0200 Subject: [PATCH 38/62] fix failed build --- src/fast_automaton/serializer/tokenizer/embed_automaton.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 3074a0f..5cb2c5b 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -27,7 +27,7 @@ impl Tokenizer<'_> { AutomatonToken::State(*self.state_to_token.get(&current_state).unwrap()); vec.push(embedded_state); - if self.automaton.is_accepted(&current_state) { + if self.automaton.is_accepted(current_state) { // accept state vec.push(AutomatonToken::AcceptState) } From 02852f967f334c40979b7328a27fbdaffb3beaec Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 24 Sep 2025 21:57:28 +0200 Subject: [PATCH 39/62] fix serialization --- src/cardinality/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 9adad1c..54bdcde 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// Represent a number. #[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] +#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value", rename_all = "camelCase"))] pub enum Cardinality { /// An infinite number.
Infinite, From 9a1266f3f3752fa6eb3cfbb6e8f0dbec37c47279 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 2 Oct 2025 21:37:03 +0200 Subject: [PATCH 40/62] Huge improvements in generate strings --- src/fast_automaton/generate.rs | 171 +++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 59 deletions(-) diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 3e60f29..e0d5ae3 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,5 +1,3 @@ -use std::cmp; - use crate::{EngineError, execution_profile::ExecutionProfile}; use ahash::AHashSet; @@ -9,80 +7,119 @@ impl FastAutomaton { /// Generates `count` strings matched by the automaton. pub fn generate_strings(&self, count: usize) -> Result, EngineError> { if self.is_empty() { - return Ok(Vec::new()); + return Ok(vec![]); } - let mut strings = AHashSet::with_capacity(cmp::min(count, 1000)); + let (min, max) = self.get_length(); + let max_len = if let Some(max) = max { + max + } else { + let min = min.expect("A non empty automaton should have a minimum length"); + min.saturating_add(100) + } as usize; let execution_profile = ExecutionProfile::get(); - let mut ranges_cache: AHashMap<&Condition, CharRange> = - AHashMap::with_capacity(self.get_number_of_states()); + let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); + let mut strings = AHashSet::with_capacity(count); + let mut visited = AHashSet::with_capacity(self.get_number_of_states()); + let mut q = VecDeque::with_capacity(self.get_number_of_states()); + q.push_back((self.get_start_state(), vec![], 0u64)); + while let Some((state, ranges, h)) = q.pop_front() { + execution_profile.assert_not_timed_out()?; - let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(count, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(count, 1000)); + if ranges.len() > max_len { + continue; + } - 
worklist.push_back((vec![], self.start_state)); - while let Some((ranges, state)) = worklist.pop_front() { - if self.accept_states.contains(&state) { + if self.is_accepted(state) { if ranges.is_empty() { strings.insert(String::new()); } else { - let mut end = false; - let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < count { - execution_profile.assert_not_timed_out()?; - let mut string = vec![]; - for i in 0..ranges.len() { - if let Some(character) = ranges_iter[i].next() { - string.push(character); - } else { - ranges_iter[i] = ranges[i].iter(); - if i + 1 < ranges.len() { - string.push(ranges_iter[i].next().unwrap()); - } else { - end = true; - break; - } - } - } - if end { - break; - } - strings.insert(string.into_iter().map(|c| c.to_char()).collect()); - } + Self::ranges_to_strings(&mut strings, &ranges, count, &execution_profile)?; } - if strings.len() == count { + if strings.len() >= count { break; } } - for (cond, to_state) in self.transitions_from(state) { + + for (cond, &to_state) in self.transitions_from(state) { + let hash = + Self::path_mix(h, Self::mix64(state as u64 ^ Self::mix64(to_state as u64))); + + if visited.insert((to_state, ranges.len() + 1, hash)) { + let mut new_ranges = ranges.clone(); + new_ranges.push( + ranges_cache + .entry(cond) + .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) + .clone(), + ); + + q.push_back((to_state, new_ranges, hash)); + } + } + } + let mut strings: Vec = strings.into_iter().collect(); + strings.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); + Ok(strings) + } + + pub fn ranges_to_strings( + strings: &mut AHashSet, + ranges: &Vec, + count: usize, + execution_profile: &ExecutionProfile, + ) -> Result<(), EngineError> { + let n = count - strings.len(); + if n == 0 { + return Ok(()); + } + + let mut end = false; + let mut out: Vec = Vec::with_capacity(n); + out.push(String::with_capacity(ranges.len())); + for r in ranges { + 
let mut next = Vec::with_capacity(n); + for prefix in out.into_iter() { execution_profile.assert_not_timed_out()?; - let range = match ranges_cache.entry(cond) { - Entry::Occupied(o) => o.get().clone(), - Entry::Vacant(v) => { - let range = cond.to_range(&self.spanning_set)?; - v.insert(range.clone()); - range + for ch in r.clone().iter() { + let mut s = prefix.clone(); + s.push(ch.to_char()); + next.push(s); + if next.len() == n { + end = true; + break; } - }; - if range.is_empty() { - continue; } - let mut new_ranges = ranges.clone(); - new_ranges.push(range); - let element = (new_ranges, *to_state); - - if !visited.contains(&element) { - visited.insert(element.clone()); - worklist.push_back(element); + if end { + end = false; + break; } } + out = next; + if out.is_empty() { + break; + } } + strings.extend(out); + Ok(()) + } + + #[inline] + fn mix64(mut x: u64) -> u64 { + // splitmix64 + x = x.wrapping_add(0x9E3779B97F4A7C15); + let mut z = x; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); + z ^ (z >> 31) + } - Ok(strings.into_iter().collect()) + #[inline] + fn path_mix(h: u64, x: u64) -> u64 { + h.wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7) ^ x } } @@ -94,8 +131,24 @@ mod tests { #[test] fn test_generate_strings() -> Result<(), String> { + assert_generate_strings("a{100}[a-z]", 100); + assert_generate_strings("(ab|cd)e", 100); + assert_generate_strings("[a-z]+", 100); + assert_generate_strings("[a-z]+@", 100); assert_generate_strings("ù", 1000); + assert_generate_strings("[0-9]+[A-Z]*", 500); + assert_generate_strings("a+(ba+)*", 200); + assert_generate_strings("((a|bc)*|d)", 200); + assert_generate_strings(".*", 50); + assert_generate_strings("(ac|ads|a)*", 200); + assert_generate_strings("((aad|ads|a)*|q)", 200); + + assert_generate_strings( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + 1000, + ); + 
assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); assert_generate_strings( "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", @@ -109,6 +162,7 @@ mod tests { assert_generate_strings("((aad|ads|a)*|q)", 200); assert_generate_strings("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", 1000); //((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5} + Ok(()) } @@ -118,23 +172,22 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - println!("{}", automaton.get_number_of_states()); + //println!("{}", automaton.get_number_of_states()); //automaton.to_dot(); let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); let strings = automaton.generate_strings(number).unwrap(); - let mut strings: Vec<_> = strings.iter().collect(); - strings.sort_unstable(); println!("nb of strings: {}/{}", strings.len(), number); assert!(number >= strings.len()); for string in strings { - if !re.is_match(string) { + // println!("{string}"); + if !re.is_match(&string) { for byte in string.as_bytes() { print!("{:02x} ", byte); } panic!("'{string}'") } - assert!(re.is_match(string), "'{string}'"); + assert!(re.is_match(&string), "'{string}'"); } } } From fb5eb1a72dcb6bc390c1a9a828e15be0fbd6505c Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:20:34 +0200 Subject: [PATCH 41/62] Fix bad implementation of to_embedding --- src/fast_automaton/serializer/tokenizer/embed_automaton.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs index 5cb2c5b..bfe8197 100644 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs @@ -16,11 +16,13 @@ impl 
Tokenizer<'_> { worklist.push_front(self.automaton.get_start_state()); while let Some(current_state) = worklist.pop_back() { + if !seen.insert(current_state) { + continue; + } if !vec.is_empty() { // separator vec.push(AutomatonToken::SeparatorState) } - seen.insert(current_state); // state let embedded_state = From 1e4980b201bd4a549a82d495e49a74e6f209f9a1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:47:53 +0200 Subject: [PATCH 42/62] improve assert_not_timed_out clock cycle --- src/execution_profile.rs | 64 +++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index cda0e11..b845261 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -1,4 +1,7 @@ -use std::{cell::RefCell, time::SystemTime}; +use std::{ + cell::RefCell, + time::{Duration, Instant}, +}; use crate::error::EngineError; @@ -41,10 +44,10 @@ use crate::error::EngineError; pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. max_number_of_states: Option, - /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - start_execution_time: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - execution_timeout: Option, + execution_timeout: Option, + /// The instant after which an [`EngineError::OperationTimeOutError`] is returned. + execution_deadline: Option, } impl PartialEq for ExecutionProfile { @@ -66,15 +69,8 @@ impl ExecutionProfile { /// /// Return [`EngineError::OperationTimeOutError`] otherwise.
pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let (Some(start), Some(execution_timeout)) = - (self.start_execution_time, self.execution_timeout) - { - let run_duration = SystemTime::now() - .duration_since(start) - .expect("Time went backwards") - .as_millis(); - - if run_duration > execution_timeout { + if let Some(execution_deadline) = self.execution_deadline { + if Instant::now() > execution_deadline { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -101,7 +97,7 @@ impl ExecutionProfile { Ok(()) } - pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } @@ -123,7 +119,9 @@ impl ExecutionProfile { let initial_execution_profile = ThreadLocalParams::get_execution_profile(); let mut execution_profile = self.clone(); - execution_profile.start_execution_time = Some(SystemTime::now()); + if let Some(execution_timeout) = execution_profile.execution_timeout { + execution_profile.execution_deadline = Some(Instant::now() + Duration::from_millis(execution_timeout)); + } ThreadLocalParams::set_execution_profile(&execution_profile); let result = f(); @@ -149,7 +147,7 @@ pub struct ExecutionProfileBuilder { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. max_number_of_states: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. 
- execution_timeout: Option, + execution_timeout: Option, } impl Default for ExecutionProfileBuilder { fn default() -> Self { @@ -165,7 +163,7 @@ impl ExecutionProfileBuilder { } } - pub fn execution_timeout(mut self, execution_timeout_in_ms: u128) -> Self { + pub fn execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } @@ -179,7 +177,7 @@ impl ExecutionProfileBuilder { ExecutionProfile { max_number_of_states: self.max_number_of_states, execution_timeout: self.execution_timeout, - start_execution_time: None, + execution_deadline: None, } } } @@ -188,8 +186,8 @@ struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; - static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; + static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; } /// Store on the current thread [`ExecutionProfile`]. 
@@ -198,8 +196,8 @@ impl ThreadLocalParams { *cell.borrow_mut() = profile.max_number_of_states; }); - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| { - *cell.borrow_mut() = profile.start_execution_time; + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| { + *cell.borrow_mut() = profile.execution_deadline; }); ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { @@ -211,11 +209,11 @@ impl ThreadLocalParams { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - fn get_start_execution_time() -> Option { - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) + fn get_execution_deadline() -> Option { + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| *cell.borrow()) } - fn get_execution_timeout() -> Option { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } @@ -223,7 +221,7 @@ impl ThreadLocalParams { fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), - start_execution_time: Self::get_start_execution_time(), + execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), } } @@ -283,7 +281,7 @@ mod tests { let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 10; - let start_time = SystemTime::now(); + let start_time = Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) .build() @@ -293,13 +291,12 @@ mod tests { term.generate_strings(100).unwrap_err() ); - let run_duration = SystemTime::now() + let run_duration = Instant::now() .duration_since(start_time) - .expect("Time went backwards") .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 50); + assert!(run_duration <= (execution_timeout_in_ms + 50) as u128); }); Ok(()) @@ -311,7 +308,7 @@ mod tests { let term2 = 
Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); let execution_timeout_in_ms = 50; - let start_time = SystemTime::now(); + let start_time = Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) .build() @@ -321,13 +318,12 @@ mod tests { term1.difference(&term2).unwrap_err() ); - let run_duration = SystemTime::now() + let run_duration = Instant::now() .duration_since(start_time) - .expect("Time went backwards") .as_millis(); println!("{run_duration}"); - assert!(run_duration <= execution_timeout_in_ms + 25); + assert!(run_duration <= (execution_timeout_in_ms + 25) as u128); }); Ok(()) From 68e87c416bf83f6ece73d6fd5f75ea6b17c35b46 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 8 Oct 2025 20:36:30 +0200 Subject: [PATCH 43/62] Parallelize state selection for elimination --- src/fast_automaton/convert/to_regex/mod.rs | 7 -- .../to_regex/state_elimination/eliminate.rs | 83 +++++++++---------- 2 files changed, 37 insertions(+), 53 deletions(-) diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 6378449..fbe36ce 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -15,13 +15,6 @@ impl FastAutomaton { mod tests { use super::*; - #[test] - fn test_convert_t() -> Result<(), String> { - assert_convert("abc.*def.*uif(ab|de)"); - - Ok(()) - } - #[test] fn test_convert() -> Result<(), String> { diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs index d528f1b..c587a99 100644 --- a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -16,59 +16,50 @@ impl Gnfa { } fn 
get_next_state_to_eliminate(&self) -> Option { - let mut best_state: Option = None; - let mut best_score: u128 = u128::MAX; - - for state in self.all_states_iter() { - if state == self.start_state || state == self.accept_state { - continue; - } - - let preds = self.transitions_to_vec(state); - let succs = self.transitions_from_vec(state); - - let in_deg = preds.len() as u128; - let out_deg = succs.len() as u128; - - if in_deg == 0 || out_deg == 0 { - let score = state as u128 & 0xFF; - if score < best_score { - best_score = score; - best_state = Some(state); + let states: Vec = self + .all_states_iter() + .filter(|&s| s != self.start_state && s != self.accept_state) + .collect(); + + states + .into_par_iter() + .filter_map(|state| { + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = (state as u128) & 0xFF; + return Some((score, state)); } - continue; - } - - let mut score: u128 = in_deg * out_deg; - - if self.has_self_loop(state) { - score = score + (score >> 1); - } - let mut label_cost: u128 = 0; + let mut score: u128 = in_deg * out_deg; - for (_, regex) in &preds { - label_cost += regex.evaluate_complexity() as u128; - } - for (regex, _) in &succs { - label_cost += regex.evaluate_complexity() as u128; - } - if let Some(re) = self.get_transition(state, state) { - label_cost += (re.evaluate_complexity() as u128) * 2; - } + if self.has_self_loop(state) { + score = score + (score >> 1); + } - score = score.saturating_mul(1).saturating_add(label_cost); + let mut label_cost: u128 = 0; - let tie = state as u128 & 0xFFFF; - let score = score.saturating_add(tie); + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += 
(re.evaluate_complexity() as u128) * 2; + } - if score < best_score { - best_score = score; - best_state = Some(state); - } - } + score = score.saturating_add(label_cost); - best_state + let tie = (state as u128) & 0xFFFF; + Some((score.saturating_add(tie), state)) + }) + .reduce_with(|a, b| if a.0 < b.0 { a } else { b }) + .map(|(_, state)| state) } fn eliminate_state(&mut self, k: usize) { From 863fdce9d8bdc4e2ab0b86011b16bd4612a292d1 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 8 Oct 2025 20:38:33 +0200 Subject: [PATCH 44/62] Fix misuse of hashmap for determinize --- Cargo.toml | 1 + src/fast_automaton/analyze/mod.rs | 2 +- src/fast_automaton/operation/determinize.rs | 84 ++++++++------------- src/fast_automaton/operation/mod.rs | 8 +- 4 files changed, 38 insertions(+), 57 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index eb32825..0cf5afd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } rayon = "1.10.0" +bit-set = "0.8.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index dbc05f4..700a2d9 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -72,7 +72,7 @@ impl FastAutomaton { live } - pub(crate) fn get_ranges(&self) -> Result, EngineError> { + pub(crate) fn get_spanning_bases(&self) -> Result, EngineError> { self.spanning_set .get_spanning_ranges() .map(|range| Condition::from_range(range, &self.spanning_set)) diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 0217834..6a578eb 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,4 +1,4 @@ -use ahash::HashMapExt; +use bit_set::BitSet; use crate::{EngineError, 
execution_profile::ExecutionProfile}; @@ -12,90 +12,73 @@ impl FastAutomaton { } let execution_profile = ExecutionProfile::get(); - let ranges = self.get_ranges()?; - - let initial_vec = VecDeque::from(vec![self.start_state]); + let bases = self.get_spanning_bases()?; let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); let map_capacity = (self.get_number_of_states() as f64 / 0.75).ceil() as usize; - let mut new_states = IntMap::with_capacity(map_capacity); + let mut new_states = AHashMap::with_capacity(map_capacity); + + let mut accept_states = BitSet::new(); + for &state in &self.accept_states { + accept_states.insert(state); + } let mut new_automaton = FastAutomaton::new_empty(); new_automaton.spanning_set = self.spanning_set.clone(); - worklist.push_back((vec![self.start_state], new_automaton.start_state)); - new_states.insert(Self::simple_hash(&initial_vec), new_automaton.start_state); + let mut initial_state = BitSet::new(); + initial_state.insert(self.start_state); + + worklist.push_back((initial_state.clone(), new_automaton.start_state)); + new_states.insert(initial_state, new_automaton.start_state); - let mut new_states_to_add = VecDeque::with_capacity(self.get_number_of_states()); + let mut new_states_to_add = BitSet::new(); while let Some((states, r)) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; - for state in &states { - if self.accept_states.contains(state) { - new_automaton.accept_states.insert(r); - break; - } + if !states.is_disjoint(&accept_states) { + new_automaton.accept_states.insert(r); } - for base in &ranges { + for base in &bases { for from_state in &states { - for (cond, to_state) in self.transitions_from(*from_state) { + for (cond, to_state) in self.transitions_from(from_state) { if cond.has_intersection(base) { - match new_states_to_add.binary_search(to_state) { - Ok(_) => {} // element already in vector @ `pos` - Err(pos) => new_states_to_add.insert(pos, *to_state), - }; + 
new_states_to_add.insert(*to_state); } } } if !new_states_to_add.is_empty() { - let q = match new_states.entry(Self::simple_hash(&new_states_to_add)) { - Entry::Occupied(o) => *o.get(), + match new_states.entry(new_states_to_add.clone()) { + Entry::Occupied(o) => { + let q = *o.get(); + + new_states_to_add.clear(); + + new_automaton.add_transition(r, q, base); + } Entry::Vacant(v) => { let new_q = new_automaton.new_state(); - worklist - .push_back((new_states_to_add.iter().cloned().collect(), new_q)); v.insert(new_q); - new_q + + let new_states = std::mem::take(&mut new_states_to_add); + worklist.push_back((new_states, new_q)); + + new_automaton.add_transition(r, new_q, base); } }; - - new_automaton.add_transition(r, q, base); } - new_states_to_add.clear(); } } Ok(Cow::Owned(new_automaton)) } - - fn simple_hash(list: &VecDeque) -> u64 { - let mut hasher = AHasher::default(); - for &item in list { - hasher.write_usize(item); - } - hasher.finish() - } } #[cfg(test)] mod tests { use crate::regex::RegularExpression; - #[test] - fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::parse(".*ab", false) - .unwrap() - .to_automaton() - .unwrap(); - - let deterministic_automaton = automaton.determinize().unwrap(); - - assert!(deterministic_automaton.is_deterministic()); - - Ok(()) - } - #[test] fn test_determinize_regex() -> Result<(), String> { assert_determinization("(aad|ads|a)"); @@ -117,8 +100,6 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - //automaton.compute_determinization_cost(); - //println!("Determinization Cost: {:?}", automaton.determinisation_cost); println!("States Before: {}", automaton.get_number_of_states()); let deterministic_automaton = automaton.determinize().unwrap(); println!( @@ -126,6 +107,7 @@ mod tests { deterministic_automaton.get_number_of_states() ); assert!(deterministic_automaton.is_deterministic()); + //deterministic_automaton.print_dot(); assert!( automaton .difference(&deterministic_automaton) diff 
--git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 0805011..37241a8 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -1,15 +1,13 @@ -use std::{cmp, hash::Hasher}; - -use ahash::AHasher; +use std::cmp; use super::*; -mod union; mod concat; mod determinize; -mod intersection; mod difference; +mod intersection; mod repeat; +mod union; impl FastAutomaton { pub(crate) fn remove_dead_transitions(&mut self) { From 2dcf14172fee6da623c099f16efcc5e82516c17a Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 17 Mar 2026 22:17:22 +0100 Subject: [PATCH 45/62] improve generate_strings + fix clippy issues --- README.md | 6 +- benches/my_benchmark.rs | 13 +- src/execution_profile.rs | 25 +- src/fast_automaton/analyze/cardinality.rs | 17 +- src/fast_automaton/analyze/mod.rs | 9 +- src/fast_automaton/builder.rs | 5 +- .../condition/fast_bit_vec/mod.rs | 6 +- src/fast_automaton/generate.rs | 243 ++++++++++++++---- src/fast_automaton/mod.rs | 5 +- src/fast_automaton/operation/concat.rs | 22 +- src/fast_automaton/operation/determinize.rs | 2 +- src/fast_automaton/operation/repeat.rs | 23 +- src/fast_automaton/spanning_set/mod.rs | 2 +- src/lib.rs | 29 ++- src/regex/operation/concat.rs | 10 +- src/regex/operation/repeat.rs | 14 +- tests/integration_tests.rs | 4 +- 17 files changed, 289 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index f45837d..652520a 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ fn main() -> Result<(), EngineError> { // Generate examples let samples = Term::from_pattern("(x|y){1,3}")? 
- .generate_strings(5)?; + .generate_strings(5, 0)?; println!("Some matches: {:?}", samples); // Equivalence & subset @@ -119,7 +119,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | Method | Return | Description | | -------- | ------- | ------- | | `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term. | +| `generate_strings(&self, count: usize, offset: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term, skipping the first `offset` strings. | | `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | @@ -197,7 +197,7 @@ This design allows us to perform unions, intersections, and complements of trans | `direct_states(&self, state: State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | | `direct_states_vec(&self, state: State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | | `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton. | +| `generate_strings(&self, count: usize, offset: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton, skipping the first `offset` strings. | | `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. 
| | `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index 71898ec..2488acf 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,4 +1,4 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, black_box, criterion_group, criterion_main}; use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; fn parse_regex(regex: &str) -> RegularExpression { @@ -18,7 +18,7 @@ fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> Fas } fn generate_strings(automaton: &FastAutomaton) -> Vec { - automaton.generate_strings(2000).unwrap() + automaton.generate_strings(2000, 1000).unwrap() } fn criterion_benchmark(c: &mut Criterion) { @@ -61,10 +61,12 @@ fn criterion_benchmark(c: &mut Criterion) { { let automaton1 = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") .unwrap() - .to_automaton().unwrap(); + .to_automaton() + .unwrap(); let automaton2 = RegularExpression::new("a(bcfe|mkv|opr)*(abc){2,4}") .unwrap() - .to_automaton().unwrap(); + .to_automaton() + .unwrap(); c.bench_function("intersection", |b| { b.iter(|| intersection(black_box(&automaton1), black_box(&automaton2))) @@ -74,7 +76,8 @@ fn criterion_benchmark(c: &mut Criterion) { { let automaton = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") .unwrap() - .to_automaton().unwrap(); + .to_automaton() + .unwrap(); c.bench_function("generate_strings", |b| { b.iter(|| generate_strings(black_box(&automaton))) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index b845261..3e4576c 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -37,7 +37,7 @@ use 
crate::error::EngineError; /// .build(); /// /// execution_profile.run(|| { -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 0).unwrap_err()); /// }); /// ``` #[derive(Clone, Debug)] @@ -89,10 +89,10 @@ impl ExecutionProfile { &self, number_of_states: usize, ) -> Result<(), EngineError> { - if let Some(max_number_of_states) = self.max_number_of_states { - if number_of_states >= max_number_of_states { - return Err(EngineError::AutomatonHasTooManyStates); - } + if let Some(max_number_of_states) = self.max_number_of_states + && number_of_states >= max_number_of_states + { + return Err(EngineError::AutomatonHasTooManyStates); } Ok(()) } @@ -120,7 +120,8 @@ impl ExecutionProfile { let mut execution_profile = self.clone(); if let Some(execution_timeout) = execution_profile.execution_timeout { - execution_profile.execution_deadline = Some(Instant::now() + Duration::from_millis(execution_timeout)); + execution_profile.execution_deadline = + Some(Instant::now() + Duration::from_millis(execution_timeout)); } ThreadLocalParams::set_execution_profile(&execution_profile); @@ -288,12 +289,10 @@ mod tests { .run(|| { assert_eq!( EngineError::OperationTimeOutError, - term.generate_strings(100).unwrap_err() + term.generate_strings(100, 0).unwrap_err() ); - let run_duration = Instant::now() - .duration_since(start_time) - .as_millis(); + let run_duration = Instant::now().duration_since(start_time).as_millis(); println!("{run_duration}"); assert!(run_duration <= (execution_timeout_in_ms + 50) as u128); @@ -307,7 +306,7 @@ mod tests { let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); - let execution_timeout_in_ms = 50; + let execution_timeout_in_ms = 10; let start_time = 
Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) @@ -318,9 +317,7 @@ mod tests { term1.difference(&term2).unwrap_err() ); - let run_duration = Instant::now() - .duration_since(start_time) - .as_millis(); + let run_duration = Instant::now().duration_since(start_time).as_millis(); println!("{run_duration}"); assert!(run_duration <= (execution_timeout_in_ms + 25) as u128); diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index ec5f514..02f825d 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -10,7 +10,10 @@ impl FastAutomaton { } else if self.cyclic || self.is_total() { return Cardinality::Infinite; } - assert!(self.is_deterministic(), "The automaton should be deterministic."); + assert!( + self.is_deterministic(), + "The automaton should be deterministic." + ); let topologically_sorted_states = self.topological_sorted_states(); if topologically_sorted_states.is_none() { @@ -31,13 +34,11 @@ impl FastAutomaton { condition .get_cardinality(&self.spanning_set) .expect("It should be possible to get the cardinality of a condition."), - ) { - if let Some(new_distance) = - distances.get(to_state).unwrap_or(&0).checked_add(distance) - { - distances.insert(*to_state, new_distance); - continue; - } + ) && let Some(new_distance) = + distances.get(to_state).unwrap_or(&0).checked_add(distance) + { + distances.insert(*to_state, new_distance); + continue; } return Cardinality::BigInteger; diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 700a2d9..5327c34 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -19,11 +19,12 @@ impl FastAutomaton { /// Checks if the automaton matches all possible strings. 
#[inline] pub fn is_total(&self) -> bool { - if self.accept_states.contains(&self.start_state) { - if let Some(condition) = self.transitions[self.start_state].get(&self.start_state) { - return condition.is_total(); - } + if self.accept_states.contains(&self.start_state) + && let Some(condition) = self.transitions[self.start_state].get(&self.start_state) + { + return condition.is_total(); } + false } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 8b8f844..e999c86 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -46,8 +46,9 @@ impl FastAutomaton { } let new_state = automaton.new_state(); - let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); - let condition = Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); + let spanning_set = SpanningSet::compute_spanning_set(std::slice::from_ref(range)); + let condition = + Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); automaton.spanning_set = spanning_set; automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index 9c85a43..c02c522 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -17,7 +17,11 @@ impl std::fmt::Display for FastBitVec { impl FastBitVec { #[inline] pub fn from_elem(n: usize, bit: bool) -> Self { - let nblocks = if n % 64 == 0 { n / 64 } else { n / 64 + 1 }; + let nblocks = if n.is_multiple_of(64) { + n / 64 + } else { + n / 64 + 1 + }; let bits = vec![if bit { !0_u64 } else { 0_u64 }; nblocks]; let mut bit_vec = FastBitVec { bits, n }; bit_vec.fix_last_block(); diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index e0d5ae3..2cfcdcb 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ 
-4,39 +4,51 @@ use ahash::AHashSet; use super::*; impl FastAutomaton { - /// Generates `count` strings matched by the automaton. - pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - if self.is_empty() { + /// Generates `count` strings matched by the automaton, skipping the first `offset` strings. + pub fn generate_strings( + &self, + count: usize, + mut offset: usize, + ) -> Result, EngineError> { + if self.is_empty() || count == 0 { return Ok(vec![]); } - let (min, max) = self.get_length(); - let max_len = if let Some(max) = max { - max - } else { - let min = min.expect("A non empty automaton should have a minimum length"); - min.saturating_add(100) - } as usize; + let (_, max) = self.get_length(); + let max_len = max.unwrap_or(u32::MAX); let execution_profile = ExecutionProfile::get(); let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); + // Only allocate memory for the final `count`! let mut strings = AHashSet::with_capacity(count); let mut visited = AHashSet::with_capacity(self.get_number_of_states()); let mut q = VecDeque::with_capacity(self.get_number_of_states()); + q.push_back((self.get_start_state(), vec![], 0u64)); + while let Some((state, ranges, h)) = q.pop_front() { execution_profile.assert_not_timed_out()?; - if ranges.len() > max_len { + if ranges.len() > max_len as usize { continue; } if self.is_accepted(state) { if ranges.is_empty() { - strings.insert(String::new()); + if offset > 0 { + offset -= 1; + } else { + strings.insert(String::new()); + } } else { - Self::ranges_to_strings(&mut strings, &ranges, count, &execution_profile)?; + Self::ranges_to_strings( + &mut strings, + &ranges, + count, + &mut offset, + &execution_profile, + )?; } if strings.len() >= count { @@ -61,49 +73,111 @@ impl FastAutomaton { } } } + let mut strings: Vec = strings.into_iter().collect(); strings.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); Ok(strings) } - pub fn ranges_to_strings( + fn 
ranges_to_strings( strings: &mut AHashSet, ranges: &Vec, count: usize, + offset: &mut usize, execution_profile: &ExecutionProfile, ) -> Result<(), EngineError> { - let n = count - strings.len(); - if n == 0 { + if strings.len() >= count { return Ok(()); } - let mut end = false; - let mut out: Vec = Vec::with_capacity(n); - out.push(String::with_capacity(ranges.len())); - for r in ranges { - let mut next = Vec::with_capacity(n); - for prefix in out.into_iter() { - execution_profile.assert_not_timed_out()?; - for ch in r.clone().iter() { - let mut s = prefix.clone(); - s.push(ch.to_char()); - next.push(s); - if next.len() == n { - end = true; - break; - } - } - if end { - end = false; - break; - } + // Precompute the lengths of each range to avoid repeated iteration overhead + let range_lengths: Vec = ranges.iter().map(|r| r.clone().iter().count()).collect(); + + // Calculate the total Cartesian combinations this path will yield + let mut total_combinations = 1usize; + for &len in &range_lengths { + total_combinations = total_combinations.saturating_mul(len); + } + + // Analytical skip: if this entire subtree's yield is within the offset, + // subtract it and skip without doing any string allocations! + if *offset >= total_combinations { + *offset -= total_combinations; + return Ok(()); + } + + // DFS generation using a single shared String buffer. + // This is significantly more memory efficient than building Vecs level by level. 
+ let mut current_str = String::with_capacity(ranges.len()); + Self::generate_combinations( + ranges, + &range_lengths, + 0, + &mut current_str, + strings, + count, + offset, + execution_profile, + ) + } + + #[allow(clippy::too_many_arguments)] + fn generate_combinations( + ranges: &Vec, + range_lengths: &[usize], + depth: usize, + current_str: &mut String, + strings: &mut AHashSet, + count: usize, + offset: &mut usize, + execution_profile: &ExecutionProfile, + ) -> Result<(), EngineError> { + if strings.len() >= count { + return Ok(()); + } + + if depth == ranges.len() { + if *offset > 0 { + *offset -= 1; + } else { + strings.insert(current_str.clone()); } - out = next; - if out.is_empty() { + return Ok(()); + } + + // Calculate combinations for the remaining suffix of ranges + let mut sub_combinations = 1usize; + for &len in &range_lengths[depth + 1..] { + sub_combinations = sub_combinations.saturating_mul(len); + } + + for ch in ranges[depth].clone().iter() { + execution_profile.assert_not_timed_out()?; + + // If skipping this character's subtree fits within the remaining offset + if *offset >= sub_combinations { + *offset -= sub_combinations; + continue; + } + + current_str.push(ch.to_char()); + Self::generate_combinations( + ranges, + range_lengths, + depth + 1, + current_str, + strings, + count, + offset, + execution_profile, + )?; + current_str.pop(); + + if strings.len() >= count { break; } } - strings.extend(out); + Ok(()) } @@ -125,10 +199,9 @@ impl FastAutomaton { #[cfg(test)] mod tests { + use crate::{cardinality::Cardinality, regex::RegularExpression}; use regex::Regex; - use crate::regex::RegularExpression; - #[test] fn test_generate_strings() -> Result<(), String> { assert_generate_strings("a{100}[a-z]", 100); @@ -154,33 +227,99 @@ mod tests { "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", 500, ); - 
assert_generate_strings("[0-9]+[A-Z]*", 500); - assert_generate_strings("a+(ba+)*", 200); - assert_generate_strings("((a|bc)*|d)", 200); - assert_generate_strings(".*", 50); - assert_generate_strings("(ac|ads|a)*", 200); - assert_generate_strings("((aad|ads|a)*|q)", 200); assert_generate_strings("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", 1000); - //((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5} Ok(()) } + #[test] + fn test_generate_strings_offset() -> Result<(), String> { + assert_generate_strings_offset("[a-z]+"); + assert_generate_strings_offset("[a-z]+@"); + + assert_generate_strings_offset("[0-9]+[A-Z]*"); + assert_generate_strings_offset("a+(ba+)*"); + assert_generate_strings_offset("((a|bc)*|d)"); + assert_generate_strings_offset(".*"); + assert_generate_strings_offset("(ac|ads|a)*"); + assert_generate_strings_offset("((aad|ads|a)*|q)"); + + assert_generate_strings_offset( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_generate_strings_offset("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@"); + assert_generate_strings_offset( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + assert_generate_strings_offset("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)"); + + Ok(()) + } + + fn assert_generate_strings_offset(regex: &str) { + println!("regex: {regex}"); + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + // Generate 30 strings at once + let all_strings = automaton.generate_strings(30, 0).unwrap(); + + //println!("all_strings {:?}", all_strings); + + // Generate the same 30 strings in chunks of 10 + let chunk1 = automaton.generate_strings(10, 0).unwrap(); + let chunk2 = automaton.generate_strings(10, 10).unwrap(); + let chunk3 = automaton.generate_strings(10, 
20).unwrap(); + + /* + println!("chunk1 {:?}", chunk1); + println!("chunk2 {:?}", chunk2); + println!("chunk3 {:?}", chunk3); + */ + + assert_eq!(all_strings.len(), 30, "Should generate exactly 30 strings"); + assert_eq!(chunk1.len(), 10); + assert_eq!(chunk2.len(), 10); + assert_eq!(chunk3.len(), 10); + + // Combine the chunks + let mut combined = chunk1; + combined.extend(chunk2); + combined.extend(chunk3); + + combined.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); + + // Prove that generating in chunks perfectly matches the bulk generation + assert_eq!( + all_strings, combined, + "Chunked generation did not match bulk generation" + ); + + let cardinality = automaton.get_cardinality(); + + if let Cardinality::Integer(count) = cardinality { + let empty_chunk = automaton.generate_strings(10, count as usize).unwrap(); + assert!(empty_chunk.is_empty(), "Chunk past limits should be empty"); + } + } + fn assert_generate_strings(regex: &str, number: usize) { println!(":{}", regex); let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - //println!("{}", automaton.get_number_of_states()); - //automaton.to_dot(); + let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let strings = automaton.generate_strings(number).unwrap(); + // Modified to include an offset of 0 + let strings = automaton.generate_strings(number, 0).unwrap(); println!("nb of strings: {}/{}", strings.len(), number); assert!(number >= strings.len()); for string in strings { - // println!("{string}"); if !re.is_match(&string) { for byte in string.as_bytes() { print!("{:02x} ", byte); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 10b269c..31b25de 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -145,10 +145,7 @@ impl FastAutomaton { /// Returns an iterator over transitions from the given state. 
#[inline] - pub fn transitions_from( - &self, - state: State, - ) -> impl Iterator { + pub fn transitions_from(&self, state: State) -> impl Iterator { self.transitions[state] .iter() .map(|(s, c)| (c, s)) diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 7eac73f..1da8877 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -13,8 +13,9 @@ impl FastAutomaton { } /// Computes the concatenation of all automata in the given iterator. - pub fn concat_all<'a, I: IntoIterator>(automata: I) -> Result - { + pub fn concat_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { let mut new_automaton = FastAutomaton::new_empty_string(); for automaton in automata { new_automaton.concat_mut(automaton)?; @@ -107,23 +108,20 @@ impl FastAutomaton { let projected_condition = condition_converter.convert(condition)?; for new_from_state in new_from_states.iter() { for new_to_state in new_to_states.iter() { - self.add_transition( - *new_from_state, - *new_to_state, - &projected_condition, - ); + self.add_transition(*new_from_state, *new_to_state, &projected_condition); } } } } - if start_state_and_accept_states_not_mergeable { - if let Some(&other_start_state) = new_states.get(&other.start_state) { - for accept_state in &accept_states { - self.add_epsilon_transition(*accept_state, other_start_state); - } + if start_state_and_accept_states_not_mergeable + && let Some(&other_start_state) = new_states.get(&other.start_state) + { + for accept_state in &accept_states { + self.add_epsilon_transition(*accept_state, other_start_state); } } + self.cyclic = self.cyclic || other.cyclic; Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 6a578eb..88fd2a8 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -6,7 +6,7 @@ use super::*; impl FastAutomaton { /// Determinizes the 
automaton and returns the result. - pub fn determinize(&self) -> Result, EngineError> { + pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { return Ok(Cow::Borrowed(self)); } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index f9e256d..553fbb3 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -12,11 +12,11 @@ impl FastAutomaton { } pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { - if let Some(max) = max_opt { - if min > max { - self.make_empty(); - return Ok(()); - } + if let Some(max) = max_opt + && min > max + { + self.make_empty(); + return Ok(()); } let automaton_to_repeat = self.clone(); @@ -39,13 +39,14 @@ impl FastAutomaton { } } - if let Some(max) = max_opt { - if min <= 1 && max == 1 { - if min == 0 { - self.accept_states.insert(self.start_state); - } - return Ok(()); + if let Some(max) = max_opt + && min <= 1 + && max == 1 + { + if min == 0 { + self.accept_states.insert(self.start_state); } + return Ok(()); } let iter = if min == 0 { 0..0 } else { 0..min - 1 }; diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index bdb9d9c..30d2a07 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -47,7 +47,7 @@ impl SpanningSet { } } - pub fn get_spanning_ranges(&self) -> Iter { + pub fn get_spanning_ranges(&self) -> Iter<'_, CharRange> { self.0.iter() } diff --git a/src/lib.rs b/src/lib.rs index de45599..177f0ff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,7 +68,7 @@ pub type CharRange = RangeSet; /// /// // Generate examples /// let samples = Term::from_pattern("(x|y){1,3}")? -/// .generate_strings(5)?; +/// .generate_strings(5, 0)?; /// println!("Some matches: {:?}", samples); /// /// // Equivalence & subset @@ -352,7 +352,7 @@ impl Term { } } - /// Generates `count` strings matched by the term. 
+ /// Generates `count` strings matched by the term, skipping the first `offset` strings. /// /// # Example: /// @@ -361,12 +361,20 @@ impl Term { /// /// let term = Term::from_pattern("(abc|de){2}").unwrap(); /// - /// let strings = term.generate_strings(3).unwrap(); + /// // Generate the first 2 matched strings + /// let batch_1 = term.generate_strings(2, 0).unwrap(); + /// assert_eq!(2, batch_1.len()); // ["dede", "deabc"] /// - /// assert_eq!(3, strings.len()); // ex: ["deabc", "dede", "abcde"] + /// // Generate the next 2 matched strings by setting the offset + /// let batch_2 = term.generate_strings(2, 2).unwrap(); + /// assert_eq!(2, batch_2.len()); // ["abcde", "abcabc"] /// ``` - pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - self.to_automaton()?.generate_strings(count) + pub fn generate_strings( + &self, + count: usize, + offset: usize, + ) -> Result, EngineError> { + self.to_automaton()?.generate_strings(count, offset) } /// Returns `true` if both terms accept the same language. @@ -458,7 +466,7 @@ impl Term { } /// Converts the term to a [`FastAutomaton`]. - pub fn to_automaton(&self) -> Result, EngineError> { + pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), Term::Automaton(automaton) => Cow::Borrowed(automaton), @@ -466,7 +474,7 @@ impl Term { } /// Converts the term to a [`RegularExpression`]. 
- pub fn to_regex(&self) -> Cow { + pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()), @@ -525,10 +533,7 @@ impl Term { let mut regex_list = Vec::with_capacity(terms.len() + 1); regex_list.push(self.to_regex()); - let mut terms_regexes = terms - .iter() - .map(Term::to_regex) - .collect::>(); + let mut terms_regexes = terms.iter().map(Term::to_regex).collect::>(); regex_list.append(&mut terms_regexes); Some(regex_list) diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 9cefb01..2063e49 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,7 +1,7 @@ use super::*; impl RegularExpression { - /// Returns a regular expression that is the concatenation of all expressions in `patterns`. + /// Returns a regular expression that is the concatenation of all expressions in `patterns`. pub fn concat_all<'a, I: IntoIterator>( patterns: I, ) -> RegularExpression { @@ -13,7 +13,7 @@ impl RegularExpression { result } - + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. 
pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { @@ -202,12 +202,12 @@ impl RegularExpression { { if this_range.contains_all(&that_range) && that_min == &0 && this_max_opt.is_none() { - return Some(this.clone()); + Some(this.clone()) } else { - return None; + None } } else { - return None; + None } } else if let RegularExpression::Repetition(this_regex, this_min, this_max_opt) = this { if **this_regex == *that { diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 235f4f1..3f42b31 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -26,11 +26,7 @@ impl RegularExpression { }; if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { - RegularExpression::Repetition( - regular_expression.clone(), - min * i_min, - new_max, - ) + RegularExpression::Repetition(regular_expression.clone(), min * i_min, new_max) } else { RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) } @@ -46,10 +42,10 @@ impl RegularExpression { o_min: u32, o_max_opt: Option, ) -> bool { - if let Some(o_max) = o_max_opt { - if o_min == o_max { - return true; - } + if let Some(o_max) = o_max_opt + && o_min == o_max + { + return true; } if let Some(i_max) = i_max_opt { diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 319f261..a4c142d 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -11,13 +11,13 @@ fn assert_regex(regex: &str) { let regex = RegularExpression::parse(regex, true).unwrap(); let automaton = regex.to_automaton().unwrap(); - let strings = automaton.generate_strings(500).unwrap(); + let strings = automaton.generate_strings(500, 0).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } let determinized_automaton = automaton.determinize().unwrap(); - let strings = determinized_automaton.generate_strings(500).unwrap(); + let strings = 
determinized_automaton.generate_strings(500, 0).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } From 06377a02e01e442ac05edb0c559096ffe0d3611a Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 18 Mar 2026 20:51:59 +0100 Subject: [PATCH 46/62] Fix generate_strings recomputing strings in offset --- src/fast_automaton/generate.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 2cfcdcb..1587c57 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -91,7 +91,10 @@ impl FastAutomaton { } // Precompute the lengths of each range to avoid repeated iteration overhead - let range_lengths: Vec = ranges.iter().map(|r| r.clone().iter().count()).collect(); + let range_lengths: Vec = ranges + .iter() + .map(|r| r.get_cardinality() as usize) + .collect(); // Calculate the total Cartesian combinations this path will yield let mut total_combinations = 1usize; @@ -234,6 +237,7 @@ mod tests { #[test] fn test_generate_strings_offset() -> Result<(), String> { + assert_generate_strings_offset(".{900}"); assert_generate_strings_offset("[a-z]+"); assert_generate_strings_offset("[a-z]+@"); From cac1a5dcf5ab61fb944645426570ef5397cfb605 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 21 Mar 2026 16:01:29 +0100 Subject: [PATCH 47/62] Add Hopcroft minimization algorithm --- src/fast_automaton/analyze/mod.rs | 6 +- src/fast_automaton/builder.rs | 30 +++ src/fast_automaton/generate.rs | 1 + src/fast_automaton/mod.rs | 13 + src/fast_automaton/operation/concat.rs | 135 +++++++++- src/fast_automaton/operation/determinize.rs | 1 + src/fast_automaton/operation/difference.rs | 7 +- src/fast_automaton/operation/intersection.rs | 1 + src/fast_automaton/operation/minimize.rs | 245 +++++++++++++++++++ src/fast_automaton/operation/mod.rs 
| 1 + src/fast_automaton/operation/repeat.rs | 147 +++++++++++ src/fast_automaton/operation/union.rs | 176 ++++++++++++- src/lib.rs | 156 +++++++++--- 13 files changed, 878 insertions(+), 41 deletions(-) create mode 100644 src/fast_automaton/operation/minimize.rs diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 5327c34..f753dcb 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,13 +10,13 @@ mod length; mod subset; impl FastAutomaton { - /// Checks if the automaton matches the empty language. + /// Checks if the automaton matches the empty language. There can be false negative if the automaton is not minimal. #[inline] pub fn is_empty(&self) -> bool { self.accept_states.is_empty() } - /// Checks if the automaton matches all possible strings. + /// Checks if the automaton matches all possible strings. There can be false negative if the automaton is not minimal. #[inline] pub fn is_total(&self) -> bool { if self.accept_states.contains(&self.start_state) @@ -28,7 +28,7 @@ impl FastAutomaton { false } - /// Checks if the automaton only matches the empty string `""`. + /// Checks if the automaton only matches the empty string `""`. There can be false negative if the automaton is not minimal. 
#[inline] pub fn is_empty_string(&self) -> bool { self.accept_states.len() == 1 diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index e999c86..afdf999 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -16,6 +16,7 @@ impl FastAutomaton { removed_states: IntSet::default(), spanning_set: SpanningSet::new_empty(), deterministic: true, + minimal: true, cyclic: false, } } @@ -25,6 +26,7 @@ impl FastAutomaton { pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); automaton.accept(automaton.start_state); + automaton.minimal = true; automaton } @@ -35,6 +37,7 @@ impl FastAutomaton { automaton.spanning_set = SpanningSet::new_total(); automaton.accept(automaton.start_state); automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.minimal = true; automaton } @@ -52,12 +55,14 @@ impl FastAutomaton { automaton.spanning_set = spanning_set; automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); + automaton.minimal = true; automaton } /// Creates a new state and returns its identifier. 
#[inline] pub fn new_state(&mut self) -> State { + self.minimal = false; if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); *new_state @@ -71,6 +76,7 @@ impl FastAutomaton { #[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); + self.minimal = false; self.accept_states.insert(state); } @@ -121,6 +127,7 @@ impl FastAutomaton { return; } + self.minimal = false; if self.deterministic { let mut deterministic = true; for (condition, state) in self.transitions_from(from_state) { @@ -156,6 +163,9 @@ impl FastAutomaton { } self.assert_state_exists(from_state); self.assert_state_exists(to_state); + + self.minimal = false; + if self.accept_states.contains(&to_state) { self.accept_states.insert(from_state); } @@ -201,6 +211,8 @@ impl FastAutomaton { self.assert_state_exists(to_state); } + self.minimal = false; + self.transitions_in .entry(to_state) .or_default() @@ -214,6 +226,7 @@ impl FastAutomaton { if self.start_state == state { panic!("Can not remove the state {state}, it is still used as start state."); } + self.minimal = false; self.accept_states.remove(&state); self.transitions_in.remove(&state); if self.transitions.len() - 1 == state { @@ -242,6 +255,7 @@ impl FastAutomaton { pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); + self.minimal = false; let mut states_to_remove = Vec::with_capacity(states.len()); for &state in states { @@ -278,6 +292,21 @@ impl FastAutomaton { } } + /// Recompute a minimal spanning set for the automaton and apply it. 
+ pub fn recompute_minimal_spanning_set(&mut self) -> Result<(), EngineError> { + let mut ranges = Vec::with_capacity(self.get_number_of_states()); + + for state in self.states() { + for (condition, _) in self.transitions_from(state) { + ranges.push(condition.to_range(&self.spanning_set)?); + } + } + + let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); + + self.apply_new_spanning_set(&new_spanning_set) + } + /// Applies the provided spanning set and projects all existing conditions onto it. pub fn apply_new_spanning_set( &mut self, @@ -320,6 +349,7 @@ impl FastAutomaton { self.removed_states = model.removed_states.clone(); self.spanning_set = model.spanning_set.clone(); self.deterministic = model.deterministic; + self.minimal = model.minimal; self.cyclic = model.cyclic; } } diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 1587c57..1224b67 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -5,6 +5,7 @@ use super::*; impl FastAutomaton { /// Generates `count` strings matched by the automaton, skipping the first `offset` strings. + /// If the provided automaton is not deterministic, it is possible to get multiple time the same strings over multiple call with different offset. pub fn generate_strings( &self, count: usize, diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 31b25de..ccf9784 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -34,6 +34,7 @@ pub struct FastAutomaton { removed_states: IntSet, spanning_set: SpanningSet, deterministic: bool, + minimal: bool, cyclic: bool, } @@ -217,6 +218,18 @@ impl FastAutomaton { self.deterministic } + /// Assert the automaton is deterministic. + #[inline] + pub(crate) fn assert_deterministic(&self) { + assert!(self.deterministic, "The automaton should be deterministic."); + } + + /// Returns `true` if the automaton is minimal. 
+ #[inline] + pub fn is_minimal(&self) -> bool { + self.minimal + } + /// Returns `true` if the automaton contains at least one cycle. #[inline] pub fn is_cyclic(&self) -> bool { diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 1da8877..1755c04 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -25,6 +25,9 @@ impl FastAutomaton { } pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.concat_state_count_heuristic(other))?; + if other.is_empty() { return Ok(()); } @@ -123,13 +126,44 @@ impl FastAutomaton { } self.cyclic = self.cyclic || other.cyclic; + self.minimal = false; Ok(()) } + + pub(crate) fn concat_state_count_heuristic(&self, other: &FastAutomaton) -> usize { + // Edge Case 1: If the other automaton is empty, the state count doesn't change. + if other.is_empty() { + return self.get_number_of_states(); + } + + // Edge Case 2: If this automaton is empty, the resulting state count is just the other's. 
+ if self.is_empty() { + return other.get_number_of_states(); + } + + // Determine if we are forced to create a new state to avoid unintended loops + let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 + && self + .accept_states + .iter() + .cloned() + .any(|s| self.out_degree(s) > 0); + + let v1 = self.get_number_of_states(); + let v2 = other.get_number_of_states(); + + // Apply the heuristic + if start_state_and_accept_states_not_mergeable { + v1 + v2 + } else { + v1 + v2 - 1 + } + } } #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; #[test] fn test_simple_concatenation_regex() -> Result<(), String> { @@ -420,5 +454,104 @@ mod tests { assert_eq!(3, automaton.get_number_of_states()); Ok(()) } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic(".{900}", "[a-z]+"); + + assert_heuristic("[a-z]+@", "[0-9]+[A-Z]*"); + + assert_heuristic("a+(ba+)*", "((a|bc)*|d)"); + + assert_heuristic(".*", "(ac|ads|a)*"); + + assert_heuristic( + "((aad|ads|a)*|q)", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic( + "(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", ".*"); + + assert_heuristic( + ".{900}", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + Ok(()) + } + + fn assert_heuristic(regex1: &str, regex2: &str) { + println!( + "Testing concat heuristic for: '{}' and '{}'", + regex1, regex2 + ); + + let automaton1 = RegularExpression::parse(regex1, false) + .unwrap() + .to_automaton() + .unwrap(); + + let 
automaton2 = RegularExpression::parse(regex2, false) + .unwrap() + .to_automaton() + .unwrap(); + + // Helper closure to run the test and assert + let test_pair = |a1: &FastAutomaton, a2: &FastAutomaton, desc: &str| { + let mut actual_concat = a1.clone(); + + // Execute the actual mutation + actual_concat.concat_mut(a2).unwrap(); + + let actual_states = actual_concat.get_number_of_states(); + let heuristic_states = a1.concat_state_count_heuristic(a2); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for {}.\nExpected (heuristic): {}\nActual (computed): {}", + desc, heuristic_states, actual_states + ); + }; + + // Test 1: regex1 + regex2 + test_pair( + &automaton1, + &automaton2, + &format!("'{}' + '{}'", regex1, regex2), + ); + + // Test 2: regex2 + regex1 (Reverse order) + test_pair( + &automaton2, + &automaton1, + &format!("'{}' + '{}'", regex2, regex1), + ); + + // Test 3: regex1 + regex1 (Self-concatenation, crucial for your repeat logic) + test_pair( + &automaton1, + &automaton1, + &format!("'{}' + '{}' (Self)", regex1, regex1), + ); + + // Test 4 & 5: Empty automaton edge cases + let empty_automaton = FastAutomaton::new_empty(); + + test_pair( + &empty_automaton, + &automaton2, + &format!("Empty + '{}'", regex2), + ); + test_pair( + &automaton1, + &empty_automaton, + &format!("'{}' + Empty", regex1), + ); + } } //(a|bc)* diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 88fd2a8..c8d6590 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -36,6 +36,7 @@ impl FastAutomaton { let mut new_states_to_add = BitSet::new(); while let Some((states, r)) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; if !states.is_disjoint(&accept_states) { new_automaton.accept_states.insert(r); diff --git a/src/fast_automaton/operation/difference.rs 
b/src/fast_automaton/operation/difference.rs index 59ca6ed..6ac8de6 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -5,8 +5,9 @@ use crate::EngineError; use super::*; impl FastAutomaton { + /// Totalize the automaton; it must be deterministic. fn totalize(&mut self) -> Result<(), EngineError> { - assert!(self.is_deterministic(), "The automaton should be deterministic."); + self.assert_deterministic(); let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = @@ -44,6 +45,7 @@ impl FastAutomaton { /// Complements the automaton; it must be deterministic. pub fn complement(&mut self) -> Result<(), EngineError> { + self.assert_deterministic(); self.totalize()?; let mut new_accept_states = IntSet::default(); @@ -58,8 +60,9 @@ impl FastAutomaton { Ok(()) } - /// Computes the difference between `self` and `other`. + /// Computes the difference between `self` and `other`. `other` must be deterministic. pub fn difference(&self, other: &FastAutomaton) -> Result { + other.assert_deterministic(); let mut complement = other.clone(); match complement.complement() { Ok(()) => self.intersection(&complement), diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index e373c55..470e7bc 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -92,6 +92,7 @@ impl FastAutomaton { while let Some(p) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; if self.accept_states.contains(&p.1) && other.accept_states.contains(&p.2) { new_automaton.accept(p.0); } diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs new file mode 100644 index 0000000..da3ec9f --- /dev/null +++ b/src/fast_automaton/operation/minimize.rs @@ -0,0 +1,245 @@ +use super::*; + +impl FastAutomaton { + /// 
Minimizes a deterministic automaton using Hopcroft's Algorithm. + pub fn minimize(&mut self) -> Result<(), EngineError> { + self.assert_deterministic(); + + let max_states = self.transitions.len(); + + let all_states: IntSet = self.states().collect(); + let accept_states: IntSet = self.get_accept_states().iter().cloned().collect(); + let non_accept_states: IntSet = + all_states.difference(&accept_states).cloned().collect(); + + let mut partitions: Vec> = vec![accept_states, non_accept_states]; + partitions.retain(|p| !p.is_empty()); + + let mut state_to_partition = vec![0; max_states]; + for (i, partition) in partitions.iter().enumerate() { + for &state in partition { + state_to_partition[state] = i; + } + } + + let mut worklist: Vec = (0..partitions.len()).collect(); + let mut in_worklist: Vec = vec![true; max_states]; + + let bases = self.get_spanning_bases()?; + + let mut inverse_transitions: Vec> = vec![Vec::new(); max_states]; + for to_state in self.states() { + for (from_state, condition) in self.transitions_to_vec(to_state) { + inverse_transitions[to_state].push((from_state, condition)); + } + } + + let mut x = IntSet::with_capacity(self.get_number_of_states()); + + let mut intersection_states: Vec> = vec![Vec::new(); max_states]; + let mut touched_partitions: Vec = Vec::with_capacity(max_states); + + while let Some(a_idx) = worklist.pop() { + in_worklist[a_idx] = false; + + let a = partitions[a_idx].clone(); + + for base in &bases { + x.clear(); + + // Find states that transition into partition 'A' on 'base' + for &to_state in &a { + for (from_state, condition) in &inverse_transitions[to_state] { + if base.has_intersection(condition) { + x.insert(*from_state); + } + } + } + + if x.is_empty() { + continue; + } + + // TARGETED SPLITTING: Only evaluate partitions we know overlap with 'x' + for &state in &x { + let p_idx = state_to_partition[state]; + if intersection_states[p_idx].is_empty() { + touched_partitions.push(p_idx); + } + 
intersection_states[p_idx].push(state); + } + + // Process only the affected partitions + for &p_idx in &touched_partitions { + let int_states = &mut intersection_states[p_idx]; + let y_len = partitions[p_idx].len(); + + // If the partition is fully contained in 'x', no split happens. + if int_states.len() == y_len { + int_states.clear(); + continue; + } + + // A split happens! 'int_states' becomes the new partition. + let new_idx = partitions.len(); + let mut new_part = IntSet::with_capacity(int_states.len()); + + for &state in int_states.iter() { + partitions[p_idx].remove(&state); // Remove from original (forming the difference) + new_part.insert(state); // Add to new partition (forming the intersection) + state_to_partition[state] = new_idx; // Update the lookup array + } + + let diff_len = partitions[p_idx].len(); + let int_len = new_part.len(); + + partitions.push(new_part); + in_worklist.push(false); + + // Worklist update + if in_worklist[p_idx] || int_len <= diff_len { + worklist.push(new_idx); + in_worklist[new_idx] = true; + } else { + worklist.push(p_idx); + in_worklist[p_idx] = true; + } + + int_states.clear(); + } + touched_partitions.clear(); + } + } + + if partitions.len() == all_states.len() { + self.minimal = true; + return Ok(()); + } + + self.rebuild_automaton_from_partition(&partitions)?; + + self.minimal = true; + Ok(()) + } + + fn rebuild_automaton_from_partition( + &mut self, + partitions: &[IntSet], + ) -> Result<(), EngineError> { + let mut state_to_rep = vec![0; self.transitions.len()]; + let mut representatives = Vec::with_capacity(partitions.len()); + + for partition in partitions { + let representative = if partition.contains(&self.get_start_state()) { + self.get_start_state() + } else { + *partition + .iter() + .next() + .expect("A partition cannot be empty") + }; + + representatives.push(representative); + + for &state in partition { + state_to_rep[state] = representative; + } + } + + let mut transitions_to_update = Vec::new(); + for 
&rep in &representatives { + for (condition, old_target) in self.transitions_from_vec(rep) { + let new_target = state_to_rep[old_target]; + transitions_to_update.push((rep, condition, new_target)); + } + } + + for partition in partitions { + for &state in partition { + let rep = state_to_rep[state]; + if state != rep { + self.remove_state(state); + } + } + } + + for (from, condition, to) in transitions_to_update { + self.add_transition(from, to, &condition); + } + + self.recompute_minimal_spanning_set() + } +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_minimize_various_regexes() -> Result<(), String> { + let test_cases = [ + "a", + "a|b", + "ab", + "a|a", + "a(b|c)d|a(b|c)d", + "(ab|ab|ab)", + "a*|b*", + "(a|b)*a(a|b)*", + "(abc|de)", + "a(b|c)*d", + "((a|b)c|(a|b)d)", + "a+b?", + "(a+b)*", + ]; + + for regex in test_cases { + assert_minimize(regex)?; + } + + Ok(()) + } + + fn assert_minimize(regex: &str) -> Result<(), String> { + println!("{regex}"); + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap().into_owned(); + let mut minimized_automaton = automaton.clone(); + minimized_automaton.minimize().unwrap(); + + assert!(automaton.equivalent(&minimized_automaton).unwrap()); + + assert!(minimized_automaton.is_deterministic()); + assert!(minimized_automaton.is_minimal()); + Ok(()) + } + + #[test] + fn test_minimize_union_complement_total() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|de)", false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap(); + let mut complement = automaton.clone().into_owned(); + complement.complement().unwrap(); + + let union = automaton.union(&complement).unwrap(); + let mut union = union.determinize().unwrap().into_owned(); + + assert!(union.is_deterministic()); + assert!(!union.is_minimal()); + + union.minimize().unwrap(); 
+ assert!(union.is_total()); + + assert!(union.is_deterministic()); + assert!(union.is_minimal()); + Ok(()) + } +} diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 37241a8..c34ca66 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -6,6 +6,7 @@ mod concat; mod determinize; mod difference; mod intersection; +mod minimize; mod repeat; mod union; diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 553fbb3..72dbb83 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -12,6 +12,9 @@ impl FastAutomaton { } pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.repeat_state_count_heuristic(min, max_opt))?; + if let Some(max) = max_opt && min > max { @@ -105,6 +108,88 @@ impl FastAutomaton { } Ok(()) } + + /// Computes the expected number of states after calling `repeat_mut`, + /// reusing the concatenation heuristic to determine loop costs. + fn repeat_state_count_heuristic(&self, min: u32, max_opt: Option) -> usize { + // 1. Invalid range clears the automaton + if let Some(max) = max_opt + && min > max + { + return 0; + } + + let v_original = self.get_number_of_states(); + if v_original == 0 { + return 0; + } + + let mut current_states = v_original; + let in_deg_start = self.in_degree(self.start_state) > 0; + + // --- REUSE CONCAT HEURISTIC HERE --- + // Calculate the state delta for a single concatenation. + let concat_cost = self.concat_state_count_heuristic(self) - v_original; + + // 2. Early state allocation for 0-minimum repeats with incoming start edges + if min == 0 && in_deg_start { + current_states += 1; + if max_opt.is_none() { + return current_states; + } + } + + // 3. 
Simple cases: 0..=1 or 1..=1 repetitions + if let Some(max) = max_opt + && min <= 1 + && max == 1 + { + return current_states; + } + + // 4. Minimum repetitions loop + let min_iters = if min == 0 { 0 } else { min - 1 }; + current_states += min_iters as usize * concat_cost; + + // 5. Infinite repetition (max_opt is None) + if max_opt.is_none() { + let mut v_modified = v_original; + let mut mod_start_in_deg_gt_0 = in_deg_start; + let acc_out_gt_0 = self.accept_states.iter().any(|&s| self.out_degree(s) > 0); + + // Check if it triggers the start-state removal optimization block + if self.accept_states.len() == 1 { + let accept_state = *self.accept_states.iter().next().unwrap(); + if self.out_degree(accept_state) == 0 && !in_deg_start { + // The old start state is removed in the cloned automaton + v_modified -= 1; + mod_start_in_deg_gt_0 = self.in_degree(accept_state) > 0; + } + } + + if min == 0 { + return v_modified; + } else { + // Calculate the final virtual concatenation cost manually since + // we can't pass a "virtually modified" automaton to concat_state_count_heuristic + let final_concat_cost = if mod_start_in_deg_gt_0 && acc_out_gt_0 { + v_modified + } else { + v_modified.saturating_sub(1) + }; + return current_states + final_concat_cost; + } + } + + // 6. 
Finite maximum repetition loop + let max = max_opt.unwrap(); + let loop_start = if min > 1 { min } else { 1 }; + let max_iters = max.saturating_sub(loop_start); + + current_states += max_iters as usize * concat_cost; + + current_states + } } #[cfg(test)] @@ -125,4 +210,66 @@ mod tests { assert!(!automaton.is_match("aa")); Ok(()) } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic(".{900}"); + assert_heuristic("[a-z]+"); + assert_heuristic("[a-z]+@"); + + assert_heuristic("[0-9]+[A-Z]*"); + assert_heuristic("a+(ba+)*"); + assert_heuristic("((a|bc)*|d)"); + assert_heuristic(".*"); + assert_heuristic("(ac|ads|a)*"); + assert_heuristic("((aad|ads|a)*|q)"); + + assert_heuristic( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@"); + assert_heuristic( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)"); + Ok(()) + } + + fn assert_heuristic(regex: &str) { + println!("Testing regex: {regex}"); + + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + // A matrix of test cases covering all edge cases in the repeat logic + let test_cases = vec![ + (0, Some(0)), // Zero-repeat + (0, Some(1)), // Optional once + (1, Some(1)), // Exactly once + (5, Some(10)), // Standard finite range + (0, None), // Zero or more (Kleene star) + (1, None), // One or more (Kleene plus) + (3, None), // Finite minimum, infinite maximum + ]; + + for (min, max_opt) in test_cases { + // Clone the original automaton to avoid mutating it across iterations + let mut actual_automaton = automaton.clone(); + + // Execute the actual mutation (assuming repeat_mut is the core method) + 
actual_automaton.repeat_mut(min, max_opt).unwrap(); + + let actual_states = actual_automaton.get_number_of_states(); + let heuristic_states = automaton.repeat_state_count_heuristic(min, max_opt); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for regex '{}' with min={}, max={:?}.\nExpected (heuristic): {}\nActual (computed): {}", + regex, min, max_opt, heuristic_states, actual_states + ); + } + } } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 96480ea..4fed407 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -60,8 +60,7 @@ impl FastAutomaton { new_states: &mut IntMap, condition_converter: &ConditionConverter, ) -> Result, EngineError> { - let mut imcomplete_states = - IntSet::with_capacity(other.out_degree(other.start_state) + 1); + let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); if other.is_accepted(other.start_state) { self.accept(self.start_state); } @@ -158,6 +157,9 @@ impl FastAutomaton { * - the accept states can't be merged if they have outgoing edges */ pub(crate) fn union_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.union_state_count_heuristic(other))?; + if other.is_empty() || self.is_total() { return Ok(()); } else if other.is_total() { @@ -204,13 +206,88 @@ impl FastAutomaton { } } self.cyclic = self.cyclic || other.cyclic; + self.minimal = false; Ok(()) } + + /// Computes the expected number of states after calling `union_mut`. 
+ fn union_state_count_heuristic(&self, other: &FastAutomaton) -> usize { + // Edge cases + if other.is_empty() || self.is_total() { + return self.get_number_of_states(); + } else if other.is_total() || self.is_empty() { + return other.get_number_of_states(); + } + + let v1 = self.get_number_of_states(); + let v2 = other.get_number_of_states(); + + let self_in = self.in_degree(self.start_state); + let other_in = other.in_degree(other.start_state); + + let mut total_delta: i32 = 0; + + // --- 1. Start States Math --- + if self_in == 0 && other_in == 0 { + total_delta -= 1; + } else if self_in != 0 && other_in != 0 { + total_delta += 1; + } + + // Track which 'other' states are already mapped in the start phase + // so we don't double-count them when calculating accept state savings. + let mut mapped_other_states = std::collections::HashSet::new(); + mapped_other_states.insert(other.start_state); + + if other_in != 0 { + for (_, to_state) in other.transitions_from(other.start_state) { + mapped_other_states.insert(*to_state); + } + } + + // --- 2. Accept States Math --- + // Gather self's accept states. If other.start_state is accepted, + // it virtually triggers self.accept(self.start_state) early. + let mut self_accepts: std::collections::HashSet = + self.accept_states.iter().cloned().collect(); + + if other.is_accepted(other.start_state) { + self_accepts.insert(self.start_state); + } + + let case_a = self_in == 0 && other_in == 0; + let mut n = 0; + + for &state in &self_accepts { + let is_incomplete = case_a && state == self.start_state; + if self.out_degree(state) == 0 && !is_incomplete { + n += 1; + } + } + + let has_acc_target = n >= 1; + + // If n > 1, we replace `n` states with exactly 1 unified state. 
+ if n > 1 { + total_delta += 1 - n; + } + + // Calculate mappings for other's accept states + if has_acc_target { + for &state in &other.accept_states { + if other.out_degree(state) == 0 && !mapped_other_states.contains(&state) { + total_delta -= 1; + } + } + } + + (v1 as i32 + v2 as i32 + total_delta) as usize + } } #[cfg(test)] mod tests { - use crate::regex::RegularExpression; + use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; #[test] fn test_simple_alternation_regex_1() -> Result<(), String> { @@ -371,4 +448,97 @@ mod tests { assert!(automaton.is_match("")); Ok(()) } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic(".{900}", "[a-z]+"); + + assert_heuristic("[a-z]+@", "[0-9]+[A-Z]*"); + + assert_heuristic("a+(ba+)*", "((a|bc)*|d)"); + + assert_heuristic(".*", "(ac|ads|a)*"); + + assert_heuristic( + "((aad|ads|a)*|q)", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic( + "(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", ".*"); + + assert_heuristic( + ".{900}", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + Ok(()) + } + + fn assert_heuristic(regex1: &str, regex2: &str) { + println!("Testing union heuristic for: '{}' | '{}'", regex1, regex2); + + let automaton1 = RegularExpression::parse(regex1, false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton2 = RegularExpression::parse(regex2, false) + .unwrap() + .to_automaton() + .unwrap(); + + let test_pair = |a1: &FastAutomaton, a2: &FastAutomaton, desc: &str| { + let mut actual_union = a1.clone(); + 
actual_union.union_mut(a2).unwrap(); + + let actual_states = actual_union.get_number_of_states(); + let heuristic_states = a1.union_state_count_heuristic(a2); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for {}.\nExpected (heuristic): {}\nActual (computed): {}", + desc, heuristic_states, actual_states + ); + }; + + // Test standard union: A | B + test_pair( + &automaton1, + &automaton2, + &format!("'{}' | '{}'", regex1, regex2), + ); + + // Test reverse union: B | A + test_pair( + &automaton2, + &automaton1, + &format!("'{}' | '{}'", regex2, regex1), + ); + + // Test self-union: A | A + test_pair( + &automaton1, + &automaton1, + &format!("'{}' | '{}' (Self)", regex1, regex1), + ); + + // Test Empty states + let empty_automaton = FastAutomaton::new_empty(); + + test_pair( + &empty_automaton, + &automaton2, + &format!("Empty | '{}'", regex2), + ); + test_pair( + &automaton1, + &empty_automaton, + &format!("'{}' | Empty", regex1), + ); + } } diff --git a/src/lib.rs b/src/lib.rs index 177f0ff..a9f7391 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,7 +64,7 @@ pub type CharRange = RangeSet; /// /// // Analyze /// assert_eq!(rep.get_length(), (Some(6), Some(12))); -/// assert!(!rep.is_empty()); +/// assert!(!rep.is_empty().unwrap()); /// /// // Generate examples /// let samples = Term::from_pattern("(x|y){1,3}")? 
@@ -212,16 +212,9 @@ impl Term { /// } /// ``` pub fn union(&self, terms: &[Term]) -> Result { - if self.is_total() { - return Ok(Term::new_total()); - } - let mut has_automaton = matches!(self, Term::Automaton(_)); if !has_automaton { for term in terms { - if term.is_total() { - return Ok(Term::new_total()); - } if matches!(term, Term::Automaton(_)) { has_automaton = true; break; @@ -274,10 +267,6 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - if self.is_empty() || terms.iter().any(|t| t.is_empty()) { - return Ok(Term::new_empty()); - } - let parallel = terms.len() > 3; let automaton_list = self.get_automata(terms, parallel)?; @@ -319,6 +308,28 @@ impl Term { Ok(Term::Automaton(return_automaton)) } + /// Computes the complement of `self`. + /// + /// # Example: + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("(abc|de)").unwrap(); + /// + /// let complement = term.complement().unwrap(); + /// + /// assert!(term.intersection(&[complement.clone()]).unwrap().is_empty().unwrap()); + /// assert!(term.union(&[complement]).unwrap().is_total().unwrap()); + /// ``` + pub fn complement(&self) -> Result { + let automaton = self.to_automaton()?; + let mut automaton = automaton.determinize()?.into_owned(); + automaton.complement()?; + + Ok(Term::Automaton(automaton)) + } + /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
/// /// # Example: @@ -359,22 +370,28 @@ impl Term { /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_pattern("(abc|de){2}").unwrap(); + /// let mut term = Term::from_pattern("(abc|de){2}").unwrap(); + /// let mut batch: Vec; /// /// // Generate the first 2 matched strings - /// let batch_1 = term.generate_strings(2, 0).unwrap(); - /// assert_eq!(2, batch_1.len()); // ["dede", "deabc"] + /// let (term, batch) = term.generate_strings(2, 0).unwrap(); + /// assert_eq!(2, batch.len()); // ["dede", "deabc"] /// /// // Generate the next 2 matched strings by setting the offset - /// let batch_2 = term.generate_strings(2, 2).unwrap(); - /// assert_eq!(2, batch_2.len()); // ["abcde", "abcabc"] + /// let (term, batch) = term.generate_strings(2, 2).unwrap(); + /// assert_eq!(2, batch.len()); // ["abcde", "abcabc"] /// ``` pub fn generate_strings( &self, count: usize, offset: usize, - ) -> Result, EngineError> { - self.to_automaton()?.generate_strings(count, offset) + ) -> Result<(Term, Vec), EngineError> { + let deterministic_minimal_automaton = self.to_deterministic_minimal_automaton()?; + let generated_strings = deterministic_minimal_automaton.generate_strings(count, offset)?; + Ok(( + Term::Automaton(deterministic_minimal_automaton), + generated_strings, + )) } /// Returns `true` if both terms accept the same language. @@ -422,27 +439,57 @@ impl Term { } /// Checks if the term matches the empty language. 
- pub fn is_empty(&self) -> bool { - match self { + pub fn is_empty(&self) -> Result { + Ok(match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty(), - Term::Automaton(fast_automaton) => fast_automaton.is_empty(), - } + Term::Automaton(fast_automaton) => { + if fast_automaton.is_minimal() { + fast_automaton.is_empty() + } else if fast_automaton.is_empty() { + true + } else { + let mut fast_automaton = fast_automaton.determinize()?.into_owned(); + fast_automaton.minimize()?; + fast_automaton.is_empty() + } + } + }) } /// Checks if the term matches all possible strings. - pub fn is_total(&self) -> bool { - match self { + pub fn is_total(&self) -> Result { + Ok(match self { Term::RegularExpression(regular_expression) => regular_expression.is_total(), - Term::Automaton(fast_automaton) => fast_automaton.is_total(), - } + Term::Automaton(fast_automaton) => { + if fast_automaton.is_minimal() { + fast_automaton.is_total() + } else if fast_automaton.is_total() { + true + } else { + let mut fast_automaton = fast_automaton.determinize()?.into_owned(); + fast_automaton.minimize()?; + fast_automaton.is_total() + } + } + }) } /// Checks if the term matches only the empty string `""`. - pub fn is_empty_string(&self) -> bool { - match self { + pub fn is_empty_string(&self) -> Result { + Ok(match self { Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), - Term::Automaton(fast_automaton) => fast_automaton.is_empty_string(), - } + Term::Automaton(fast_automaton) => { + if fast_automaton.is_minimal() { + fast_automaton.is_empty_string() + } else if fast_automaton.is_empty_string() { + true + } else { + let mut fast_automaton = fast_automaton.determinize()?.into_owned(); + fast_automaton.minimize()?; + fast_automaton.is_empty_string() + } + } + }) } /// Returns the minimum and maximum length of matched strings. @@ -473,6 +520,20 @@ impl Term { }) } + /// Converts the term to a deterministic minimal [`FastAutomaton`]. 
+ pub fn to_deterministic_minimal_automaton(&self) -> Result { + let mut automaton = self.to_automaton()?.into_owned(); + if !automaton.is_deterministic() { + automaton = automaton.determinize()?.into_owned(); + } + + if !automaton.is_minimal() { + automaton.minimize()?; + } + + Ok(automaton) + } + /// Converts the term to a [`RegularExpression`]. pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { @@ -546,13 +607,44 @@ mod tests { use super::*; + #[test] + fn test_complement() -> Result<(), String> { + let term = Term::from_pattern("(abc|de)").unwrap(); + + let complement = term.complement().unwrap(); + + assert!( + term.intersection(&[complement.clone()]) + .unwrap() + .is_empty() + .unwrap() + ); + + println!("term: {}", term.to_automaton().unwrap().as_dot()); + + if let Term::Automaton(complement) = &complement { + println!("complement: {}", complement.as_dot()); + } + + let union = term.union(&[complement]).unwrap(); + if let Term::Automaton(union) = &union { + println!("{}", union.as_dot()); + let union = union.determinize().unwrap(); + println!("{}", union.as_dot()); + } + + assert!(union.is_total().unwrap()); + + Ok(()) + } + #[test] fn test_intersection() -> Result<(), String> { let regex1 = Term::from_pattern("a").unwrap(); let regex2 = Term::from_pattern("b").unwrap(); let intersection = regex1.intersection(&vec![regex2]).unwrap(); - assert!(intersection.is_empty()); + assert!(intersection.is_empty().unwrap()); assert_eq!("[]", intersection.to_pattern()); Ok(()) From 284bad6f4e70343cc4c5483b8d63ae838973af73 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 21 Mar 2026 17:29:50 +0100 Subject: [PATCH 48/62] Update generate_strings --- src/execution_profile.rs | 4 +- src/fast_automaton/generate.rs | 10 ++--- src/lib.rs | 69 ++++++++++++++++++++-------------- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 
3e4576c..81dbafe 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -37,7 +37,7 @@ use crate::error::EngineError; /// .build(); /// /// execution_profile.run(|| { -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 0).unwrap_err()); +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 0, false).unwrap_err()); /// }); /// ``` #[derive(Clone, Debug)] @@ -289,7 +289,7 @@ mod tests { .run(|| { assert_eq!( EngineError::OperationTimeOutError, - term.generate_strings(100, 0).unwrap_err() + term.generate_strings(100, 0, false).unwrap_err() ); let run_duration = Instant::now().duration_since(start_time).as_millis(); diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 1224b67..a40a82a 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -8,10 +8,10 @@ impl FastAutomaton { /// If the provided automaton is not deterministic, it is possible to get multiple time the same strings over multiple call with different offset. pub fn generate_strings( &self, - count: usize, + limit: usize, mut offset: usize, ) -> Result, EngineError> { - if self.is_empty() || count == 0 { + if self.is_empty() || limit == 0 { return Ok(vec![]); } @@ -22,7 +22,7 @@ impl FastAutomaton { let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); // Only allocate memory for the final `count`! 
- let mut strings = AHashSet::with_capacity(count); + let mut strings = AHashSet::with_capacity(limit); let mut visited = AHashSet::with_capacity(self.get_number_of_states()); let mut q = VecDeque::with_capacity(self.get_number_of_states()); @@ -46,13 +46,13 @@ impl FastAutomaton { Self::ranges_to_strings( &mut strings, &ranges, - count, + limit, &mut offset, &execution_profile, )?; } - if strings.len() >= count { + if strings.len() >= limit { break; } } diff --git a/src/lib.rs b/src/lib.rs index a9f7391..301bd0e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,7 +68,7 @@ pub type CharRange = RangeSet; /// /// // Generate examples /// let samples = Term::from_pattern("(x|y){1,3}")? -/// .generate_strings(5, 0)?; +/// .generate_strings(5, 0, false)?; /// println!("Some matches: {:?}", samples); /// /// // Equivalence & subset @@ -363,7 +363,17 @@ impl Term { } } - /// Generates `count` strings matched by the term, skipping the first `offset` strings. + /// Generates up to `limit` distinct strings matched by the term, skipping the first `offset` strings. + /// + /// When paginating through a large set of generated strings, you should set `return_stable_term` + /// to `true` on the initial call. This instructs the engine to compile the term into a deterministic + /// and minimized state (a "stable term"). + /// + /// Replacing your current term with this returned stable term for subsequent calls guarantees + /// that no strings are repeated. + /// + /// The stable term is returned as the first element of the tuple (`Some(Term)`). If the term is + /// already stable, or if `return_stable_term` is `false`, it returns `None` to save resources. 
/// /// # Example: /// @@ -371,27 +381,42 @@ impl Term { /// use regexsolver::Term; /// /// let mut term = Term::from_pattern("(abc|de){2}").unwrap(); - /// let mut batch: Vec; /// - /// // Generate the first 2 matched strings - /// let (term, batch) = term.generate_strings(2, 0).unwrap(); + /// // Generate the first 2 matched strings and request a stable term + /// let (stable_term, batch) = term.generate_strings(2, 0, true).unwrap(); /// assert_eq!(2, batch.len()); // ["dede", "deabc"] /// - /// // Generate the next 2 matched strings by setting the offset - /// let (term, batch) = term.generate_strings(2, 2).unwrap(); + /// // Update the term if a newly compiled stable term was returned + /// if let Some(t) = stable_term { + /// term = t; + /// } + /// + /// // Generate the next 2 matched strings by setting the offset using the stable term + /// let (_, batch) = term.generate_strings(2, 2, false).unwrap(); /// assert_eq!(2, batch.len()); // ["abcde", "abcabc"] /// ``` pub fn generate_strings( &self, - count: usize, + limit: usize, offset: usize, - ) -> Result<(Term, Vec), EngineError> { - let deterministic_minimal_automaton = self.to_deterministic_minimal_automaton()?; - let generated_strings = deterministic_minimal_automaton.generate_strings(count, offset)?; - Ok(( - Term::Automaton(deterministic_minimal_automaton), - generated_strings, - )) + return_stable_term: bool, + ) -> Result<(Option, Vec), EngineError> { + let automaton = self.to_automaton()?; + if !return_stable_term || automaton.is_deterministic() { + Ok((None, self.to_automaton()?.generate_strings(limit, offset)?)) + } else { + let mut automaton = automaton.into_owned(); + if !automaton.is_deterministic() { + automaton = automaton.determinize()?.into_owned(); + } + + if !automaton.is_minimal() { + automaton.minimize()?; + } + + let generated_strings = automaton.generate_strings(limit, offset)?; + Ok((Some(Term::Automaton(automaton)), generated_strings)) + } } /// Returns `true` if both terms accept 
the same language. @@ -520,20 +545,6 @@ impl Term { }) } - /// Converts the term to a deterministic minimal [`FastAutomaton`]. - pub fn to_deterministic_minimal_automaton(&self) -> Result { - let mut automaton = self.to_automaton()?.into_owned(); - if !automaton.is_deterministic() { - automaton = automaton.determinize()?.into_owned(); - } - - if !automaton.is_minimal() { - automaton.minimize()?; - } - - Ok(automaton) - } - /// Converts the term to a [`RegularExpression`]. pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { From aced1ab0c8b71a8d6a0cd47a1afe7a65518e9ff3 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 21 Mar 2026 22:42:16 +0100 Subject: [PATCH 49/62] Fix generate_strings --- Cargo.toml | 1 + src/execution_profile.rs | 4 +- src/fast_automaton/generate.rs | 204 +++++++++++++++++++++++++++------ tests/data/regex.txt | 12 +- 4 files changed, 176 insertions(+), 45 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0cf5afd..e11dad1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } rayon = "1.10.0" bit-set = "0.8.0" +indexmap = "2.13.0" [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 81dbafe..58ec9cf 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -37,7 +37,7 @@ use crate::error::EngineError; /// .build(); /// /// execution_profile.run(|| { -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 0, false).unwrap_err()); +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000, false).unwrap_err()); /// }); /// ``` #[derive(Clone, Debug)] @@ -289,7 +289,7 @@ mod tests { .run(|| { assert_eq!( EngineError::OperationTimeOutError, - term.generate_strings(100, 0, false).unwrap_err() + term.generate_strings(100, 1_000_000, 
false).unwrap_err() ); let run_duration = Instant::now().duration_since(start_time).as_millis(); diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index a40a82a..7eb57f0 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,7 +1,36 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; -use ahash::AHashSet; +use ahash::{AHashSet, RandomState}; +use indexmap::IndexSet; use super::*; +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +#[derive(Clone, Eq, PartialEq)] +struct QueueItem { + score: usize, + depth: usize, + state: usize, + ranges: Vec, + hash: u64, +} + +impl Ord for QueueItem { + fn cmp(&self, other: &Self) -> Ordering { + other + .score + .cmp(&self.score) + .then_with(|| self.depth.cmp(&other.depth)) + .then_with(|| self.state.cmp(&other.state)) + .then_with(|| self.hash.cmp(&other.hash)) + } +} + +impl PartialOrd for QueueItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} impl FastAutomaton { /// Generates `count` strings matched by the automaton, skipping the first `offset` strings. @@ -16,27 +45,71 @@ impl FastAutomaton { } let (_, max) = self.get_length(); - let max_len = max.unwrap_or(u32::MAX); + let max_len = max.unwrap_or(u32::MAX) as usize; let execution_profile = ExecutionProfile::get(); + let num_states = self.transitions.len(); + + // ----------------------------------------------------------------- + // 1. 
REVERSE BFS: Precalculate exact distances to Accept State + // ----------------------------------------------------------------- + let mut incoming = vec![vec![]; num_states]; + let mut dist_q = std::collections::VecDeque::new(); + let mut dist = vec![usize::MAX; num_states]; + + for state in self.states() { + if self.is_accepted(state as _) { + dist[state] = 0; + dist_q.push_back(state); + } + for (_cond, &to_state) in self.transitions_from(state as _) { + incoming[to_state].push(state); + } + } - let mut ranges_cache = AHashMap::with_capacity(self.get_number_of_states()); - // Only allocate memory for the final `count`! - let mut strings = AHashSet::with_capacity(limit); - let mut visited = AHashSet::with_capacity(self.get_number_of_states()); - let mut q = VecDeque::with_capacity(self.get_number_of_states()); + while let Some(state) = dist_q.pop_front() { + let d = dist[state]; + for &prev in &incoming[state] { + if dist[prev] == usize::MAX { + dist[prev] = d + 1; + dist_q.push_back(prev); + } + } + } - q.push_back((self.get_start_state(), vec![], 0u64)); + // ----------------------------------------------------------------- + // 2. 
A* SEARCH: Find matching strings instantly + // ----------------------------------------------------------------- + let mut ranges_cache = AHashMap::with_capacity(num_states); + let mut strings = IndexSet::with_capacity_and_hasher(limit, RandomState::default()); + let mut visited = AHashSet::with_capacity(num_states); + + let mut q = BinaryHeap::new(); + let start_state = self.get_start_state(); + + // If the start state can't reach an accept state, exit immediately + if dist[start_state] != usize::MAX { + q.push(QueueItem { + score: dist[start_state], + depth: 0, + state: start_state, + ranges: vec![], + hash: 0u64, + }); + } - while let Some((state, ranges, h)) = q.pop_front() { + while let Some(QueueItem { + score: _, + depth: current_depth, + state, + mut ranges, + hash: h, + }) = q.pop() + { execution_profile.assert_not_timed_out()?; - if ranges.len() > max_len as usize { - continue; - } - if self.is_accepted(state) { - if ranges.is_empty() { + if current_depth == 0 { if offset > 0 { offset -= 1; } else { @@ -57,31 +130,64 @@ impl FastAutomaton { } } + if current_depth >= max_len { + continue; + } + + let next_depth = current_depth + 1; + let mut valid_transitions = Vec::new(); + for (cond, &to_state) in self.transitions_from(state) { + let to_state_usize = to_state; + + // DEAD-END PRUNING: Instantly kill paths that cannot accept + if dist[to_state_usize] == usize::MAX { + continue; + } + let hash = Self::path_mix(h, Self::mix64(state as u64 ^ Self::mix64(to_state as u64))); - if visited.insert((to_state, ranges.len() + 1, hash)) { + if visited.insert((to_state, next_depth, hash)) { + let range = ranges_cache + .entry(cond) + .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) + .clone(); + + valid_transitions.push((to_state_usize, range, hash)); + } + } + + // Vector Reuse Optimization + if let Some((last_state, last_range, last_hash)) = valid_transitions.pop() { + for (to_state, range, hash) in valid_transitions { let mut new_ranges = 
ranges.clone(); - new_ranges.push( - ranges_cache - .entry(cond) - .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) - .clone(), - ); - - q.push_back((to_state, new_ranges, hash)); + new_ranges.push(range); + q.push(QueueItem { + score: next_depth + dist[to_state], // A* Score Formula + depth: next_depth, + state: to_state, + ranges: new_ranges, + hash, + }); } + + ranges.push(last_range); + q.push(QueueItem { + score: next_depth + dist[last_state], // A* Score Formula + depth: next_depth, + state: last_state, + ranges, + hash: last_hash, + }); } } - let mut strings: Vec = strings.into_iter().collect(); - strings.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); - Ok(strings) + Ok(strings.into_iter().collect()) } fn ranges_to_strings( - strings: &mut AHashSet, + strings: &mut IndexSet, ranges: &Vec, count: usize, offset: &mut usize, @@ -91,27 +197,21 @@ impl FastAutomaton { return Ok(()); } - // Precompute the lengths of each range to avoid repeated iteration overhead let range_lengths: Vec = ranges .iter() .map(|r| r.get_cardinality() as usize) .collect(); - // Calculate the total Cartesian combinations this path will yield let mut total_combinations = 1usize; for &len in &range_lengths { total_combinations = total_combinations.saturating_mul(len); } - // Analytical skip: if this entire subtree's yield is within the offset, - // subtract it and skip without doing any string allocations! if *offset >= total_combinations { *offset -= total_combinations; return Ok(()); } - // DFS generation using a single shared String buffer. - // This is significantly more memory efficient than building Vecs level by level. 
let mut current_str = String::with_capacity(ranges.len()); Self::generate_combinations( ranges, @@ -131,7 +231,7 @@ impl FastAutomaton { range_lengths: &[usize], depth: usize, current_str: &mut String, - strings: &mut AHashSet, + strings: &mut IndexSet, count: usize, offset: &mut usize, execution_profile: &ExecutionProfile, @@ -207,7 +307,39 @@ mod tests { use regex::Regex; #[test] - fn test_generate_strings() -> Result<(), String> { + fn test_generate_strings_1() -> Result<(), String> { + let automaton = + RegularExpression::parse(".*ab.*c(de|fg).*dab.*c(de|fg).*ab.*c(de|fg).*dab.*c", true) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap(); + automaton.generate_strings(30, 0).unwrap(); + + Ok(()) + } + + #[test] + fn test_generate_strings_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|de){2}", true) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap(); + let strings = automaton.generate_strings(2, 0).unwrap(); + assert_eq!(2, strings.len()); + + let strings = automaton.generate_strings(2, 2).unwrap(); + assert_eq!(2, strings.len()); + + Ok(()) + } + + #[test] + fn test_generate_strings_3() -> Result<(), String> { + assert_generate_strings(r"<([A-Za-z][A-Za-z0-9]*)[^>]*?/>", 500); assert_generate_strings("a{100}[a-z]", 100); assert_generate_strings("(ab|cd)e", 100); assert_generate_strings("[a-z]+", 100); @@ -295,8 +427,6 @@ mod tests { combined.extend(chunk2); combined.extend(chunk3); - combined.sort_unstable_by(|a, b| a.len().cmp(&b.len()).then_with(|| a.cmp(b))); - // Prove that generating in chunks perfectly matches the bulk generation assert_eq!( all_strings, combined, diff --git a/tests/data/regex.txt b/tests/data/regex.txt index 31aa829..c65eca9 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -55,18 +55,18 @@ a++ \p{Greek}+ \p{Sc} [a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,} 
-\b((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d)\b +((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d) https?://[^\s/$.?#][^\s]* \d{4}/\d{2}/\d{2} \d{1,2}:\d{2}(:\d{2})? -<([A-Za-z][A-Za-z0-9]*)\b[^>]*?/> +<([A-Za-z][A-Za-z0-9]*)[^>]*?/> \{(?:[^{}]|\{[^{}]*\})*\} -\b(?:\d[ -]*?){13,16}\b +(?:\d[ -]*?){13,16} #([A-Fa-f0-9]{8}) (a|b|c|d|e|f|g|h|i|j){5} (?:"[^"]*"|[^,]*)(?:,(?:"[^"]*"|[^,]*))* -\b([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b -\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b +([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2} +[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12} [[:alnum:]&&[^0-9]] [ \t]+ [\r\n]+ @@ -76,4 +76,4 @@ https?://[^\s/$.?#][^\s]* \{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) [+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? -<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> \ No newline at end of file +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> From 7c10284849eb8f451b7727e8204093707c90b534 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 18 Apr 2026 22:22:32 +0200 Subject: [PATCH 50/62] Fix buggy automaton to regex conversion --- src/fast_automaton/convert/to_regex/mod.rs | 34 +++- .../convert/to_regex/transform/mod.rs | 47 ----- .../to_regex/transform/shape/dotstar.rs | 172 ------------------ .../convert/to_regex/transform/shape/mod.rs | 1 - 4 files changed, 29 insertions(+), 225 deletions(-) delete mode 100644 src/fast_automaton/convert/to_regex/transform/mod.rs delete mode 100644 src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs delete mode 100644 src/fast_automaton/convert/to_regex/transform/shape/mod.rs diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index fbe36ce..4671a9d 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ 
b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,23 +1,22 @@ use super::*; mod state_elimination; -mod transform; impl FastAutomaton { - /// Converts the term to a [`RegularExpression`]. + /// Converts the automaton to a [`RegularExpression`]. pub fn to_regex(&self) -> RegularExpression { - let transformed_automaton = transform::transform(self); - state_elimination::convert_to_regex(&transformed_automaton) + state_elimination::convert_to_regex(self) } } #[cfg(test)] mod tests { + use ::regex::Regex; + use super::*; #[test] fn test_convert() -> Result<(), String> { - assert_convert(".*u(ab|de)"); assert_convert(".*sf.*uif(ab|de)"); @@ -164,6 +163,31 @@ mod tests { Ok(()) } + #[test] + fn test_convert_after_operation_5() -> Result<(), String> { + let automaton = RegularExpression::parse(".*abc.*", false) + .unwrap() + .to_automaton() + .unwrap(); + let mut automaton = automaton.determinize().unwrap().into_owned(); + + automaton.complement().unwrap(); + + let result = format!("^{}$", automaton.to_regex().to_string()); + + println!("{result}"); + + let result = Regex::new(&result).unwrap(); + + assert!(!result.is_match("abc")); + assert!(!result.is_match("2374abc012")); + + assert!(result.is_match("bc")); + assert!(result.is_match("237a4bc012")); + + Ok(()) + } + #[test] fn test_automaton() -> Result<(), String> { let automaton = RegularExpression::parse("a*ba*", false) diff --git a/src/fast_automaton/convert/to_regex/transform/mod.rs b/src/fast_automaton/convert/to_regex/transform/mod.rs deleted file mode 100644 index 643e6ba..0000000 --- a/src/fast_automaton/convert/to_regex/transform/mod.rs +++ /dev/null @@ -1,47 +0,0 @@ -use crate::fast_automaton::{ - FastAutomaton, convert::to_regex::transform::shape::dotstar::dot_star, -}; - -mod shape; - -const TRANSFORM_FUNCTION: &[fn(&FastAutomaton) -> FastAutomaton] = &[dot_star]; - -pub fn transform(automaton: &FastAutomaton) -> FastAutomaton { - let mut automaton = automaton.clone(); - for transform in TRANSFORM_FUNCTION 
{ - automaton = transform(&automaton); - } - - automaton -} - -#[cfg(test)] -mod tests { - use crate::{ - fast_automaton::convert::to_regex::transform::transform, regex::RegularExpression, - }; - - #[test] - fn test_equivalence() -> Result<(), String> { - assert_equivalent("abc"); - assert_equivalent(".*abc"); - assert_equivalent(".*abc.*def"); - assert_equivalent(".*abc.*def(ab|fr)"); - assert_equivalent(".*abc.*def(ab|fr).*mpa"); - - Ok(()) - } - - fn assert_equivalent(pattern: &str) { - let before = RegularExpression::parse(pattern, false) - .unwrap() - .to_automaton() - .unwrap(); - - let before = before.determinize().unwrap(); - - let after = transform(&before); - - assert!(before.equivalent(&after).unwrap()); - } -} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs b/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs deleted file mode 100644 index 6c91106..0000000 --- a/src/fast_automaton/convert/to_regex/transform/shape/dotstar.rs +++ /dev/null @@ -1,172 +0,0 @@ -use nohash_hasher::IntSet; - -use crate::fast_automaton::{FastAutomaton, State, condition::Condition}; - -pub(crate) fn dot_star(automaton: &FastAutomaton) -> FastAutomaton { - let components = identify_and_apply_components(automaton); - - let mut automaton = automaton.clone(); - for component in components { - dot_star_component(&mut automaton, &component); - } - - automaton -} - -fn dot_star_component(automaton: &mut FastAutomaton, component: &IntSet) { - let mut start_state = if component.contains(&automaton.start_state) { - Some(automaton.start_state) - } else { - None - }; - for &state in component { - for (from_state, _) in automaton.transitions_to_vec(state) { - if !component.contains(&from_state) { - if start_state.is_none() { - start_state = Some(state); - } else { - // Only one start state possible - return; - } - } - } - } - - if start_state.is_none() { - // Only one start state possible - return; - } - let start_state = start_state.unwrap(); - - let 
mut first_hop = automaton - .direct_states(start_state) - .filter(|&s| s != start_state) - .collect::>(); - let mut states_to_remove = vec![]; - - for state in &first_hop { - let transitions = automaton.transitions_to_vec(*state); - if !transitions.iter().all(|(_, c)| *c == transitions[0].1) { - // Some condition(s) to a given first hop state are not the same. - return; - } - - if transitions.len() != component.len() { - states_to_remove.push(*state); - } - } - - states_to_remove.iter().for_each(|s| { - first_hop.remove(s); - }); - - let mut out_condition = None; - for &state in component { - let mut has_transition_to_start_state = false; - - let mut this_condition = Condition::empty(automaton.get_spanning_set()); - for (condition, &to_state) in automaton.transitions_from(state) { - if to_state == start_state { - has_transition_to_start_state = true; - } - - this_condition = this_condition.union(condition); - } - if !has_transition_to_start_state { - // Some state(s) do not have transition to the start state. 
- return; - } - - if let Some(condition) = &out_condition { - if &this_condition != condition { - // The union of outcoming condition for some states are not identical - return; - } - } else { - out_condition = Some(this_condition); - } - } - - automaton.add_transition(start_state, start_state, &out_condition.unwrap()); - for &state in component { - for to_state in automaton.direct_states_vec(state) { - if !component.contains(&to_state) { - continue; - } - - if state != start_state && (to_state == start_state || first_hop.contains(&to_state)) { - automaton.remove_transition(state, to_state); - } - } - } - for state in states_to_remove { - automaton.remove_state(state); - } -} - -pub fn identify_and_apply_components(automaton: &FastAutomaton) -> Vec> { - let mut index = 0; - let mut stack = Vec::new(); - let mut indices = vec![-1; automaton.transitions.len()]; - let mut lowlink = vec![-1; automaton.transitions.len()]; - let mut on_stack = vec![false; automaton.transitions.len()]; - let mut scc = Vec::new(); - - for state in automaton.states() { - if indices[state] == -1 { - strongconnect( - automaton, - state, - &mut index, - &mut stack, - &mut indices, - &mut lowlink, - &mut on_stack, - &mut scc, - ); - } - } - - scc.into_iter() - .filter(|states| states.len() != 1) - .collect::>() -} - -#[allow(clippy::too_many_arguments)] -fn strongconnect( - automaton: &FastAutomaton, - v: usize, - index: &mut usize, - stack: &mut Vec, - indices: &mut Vec, - lowlink: &mut Vec, - on_stack: &mut Vec, - scc: &mut Vec>, -) { - indices[v] = *index as i32; - lowlink[v] = *index as i32; - *index += 1; - stack.push(v); - on_stack[v] = true; - - for w in automaton.direct_states(v) { - if indices[w] == -1 { - strongconnect(automaton, w, index, stack, indices, lowlink, on_stack, scc); - lowlink[v] = lowlink[v].min(lowlink[w]); - } else if on_stack[w] { - lowlink[v] = lowlink[v].min(indices[w]); - } - } - - if lowlink[v] == indices[v] { - let mut component = IntSet::default(); - while let 
Some(w) = stack.pop() { - on_stack[w] = false; - component.insert(w); - if w == v { - break; - } - } - scc.push(component); - } -} diff --git a/src/fast_automaton/convert/to_regex/transform/shape/mod.rs b/src/fast_automaton/convert/to_regex/transform/shape/mod.rs deleted file mode 100644 index 5c83bf6..0000000 --- a/src/fast_automaton/convert/to_regex/transform/shape/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(super) mod dotstar; \ No newline at end of file From 0cc654c8f7c5fd5a357aef529265184e16f40e62 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:18:25 +0200 Subject: [PATCH 51/62] remove serialization --- .github/workflows/rust.yml | 19 +- Cargo.toml | 17 -- src/cardinality/mod.rs | 5 - src/error/mod.rs | 8 - src/fast_automaton/mod.rs | 2 - src/fast_automaton/serializer/mod.rs | 181 ----------------- .../serializer/tokenizer/embed_automaton.rs | 192 ------------------ .../serializer/tokenizer/mod.rs | 68 ------- .../serializer/tokenizer/range_tokenizer.rs | 76 ------- .../tokenizer/token/automaton_token.rs | 94 --------- .../serializer/tokenizer/token/mod.rs | 26 --- .../serializer/tokenizer/token/range_token.rs | 55 ----- src/fast_automaton/spanning_set/mod.rs | 8 +- src/lib.rs | 6 - src/regex/mod.rs | 2 - 15 files changed, 13 insertions(+), 746 deletions(-) delete mode 100644 src/fast_automaton/serializer/mod.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/embed_automaton.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/mod.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/range_tokenizer.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/token/automaton_token.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/token/mod.rs delete mode 100644 src/fast_automaton/serializer/tokenizer/token/range_token.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7730802..0666f63 100644 --- 
a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -2,23 +2,22 @@ name: Rust on: push: - branches: [ "main" ] + branches: ["main"] pull_request: - branches: [ "main" ] + branches: ["main"] env: CARGO_TERM_COLOR: always jobs: build: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Build - run: cargo build --all-features --verbose - - name: Test & Lint - run: | - cargo test --all-features - cargo clippy --all-features + - uses: actions/checkout@v4 + - name: Build + run: cargo build --verbose + - name: Test & Lint + run: | + cargo test + cargo clippy diff --git a/Cargo.toml b/Cargo.toml index e11dad1..d67f846 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,12 +10,6 @@ description = "High-performance Rust library for building, combining, and analyz readme = "README.md" [dependencies] -serde = { version = "1.0", features = ["derive"], optional = true } -ciborium = { version = "0.2.2", optional = true } -z85 = { version = "3.0.5", optional = true } -flate2 = { version = "1.0.30", features = [ - "zlib-ng", -], default-features = false, optional = true } nohash-hasher = "0.2" ahash = "0.8.11" log = "0.4.21" @@ -33,17 +27,6 @@ criterion = { version = "0.5", features = ["html_reports"] } env_logger = "0.11.3" serde_json = "1.0.114" - -[features] -default = [] -serializable = [ - "regex-charclass/serde", - "dep:serde", - "dep:ciborium", - "dep:z85", - "dep:flate2", -] - [[bench]] name = "my_benchmark" harness = false diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 54bdcde..4456820 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,10 +1,5 @@ -#[cfg(feature = "serializable")] -use serde::{Deserialize, Serialize}; - /// Represent a number. -#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value", rename_all = "camelCase"))] pub enum Cardinality { /// An infinite number. 
Infinite, diff --git a/src/error/mod.rs b/src/error/mod.rs index 29052b4..acaa9ef 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,8 +1,5 @@ use std::fmt::{self}; -#[cfg(feature = "serializable")] -use crate::fast_automaton::serializer::tokenizer::token::TokenError; - /// An error thrown by the engine. #[derive(Debug, PartialEq, Eq)] pub enum EngineError { @@ -16,9 +13,6 @@ pub enum EngineError { RegexSyntaxError(String), /// The provided range can not be built from the spanning set. ConditionInvalidRange, - #[cfg(feature = "serializable")] - /// There is an error with one of the token. - TokenError(TokenError), } impl fmt::Display for EngineError { @@ -30,8 +24,6 @@ impl fmt::Display for EngineError { write!(f, "The automaton has too many states.") } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), - #[cfg(feature = "serializable")] - EngineError::TokenError(err) => write!(f, "{err}."), EngineError::ConditionInvalidRange => write!( f, "The provided range can not be built from the spanning set." diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index ccf9784..b2cc6d1 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -20,8 +20,6 @@ pub mod condition; mod convert; mod generate; mod operation; -#[cfg(feature = "serializable")] -pub mod serializer; pub mod spanning_set; /// Represent a finite state automaton. 
diff --git a/src/fast_automaton/serializer/mod.rs b/src/fast_automaton/serializer/mod.rs deleted file mode 100644 index 7a40bae..0000000 --- a/src/fast_automaton/serializer/mod.rs +++ /dev/null @@ -1,181 +0,0 @@ -use crate::fast_automaton::serializer::tokenizer::token::automaton_token::AutomatonToken; -use crate::fast_automaton::serializer::tokenizer::Tokenizer; - -use super::*; -use serde::{Deserialize, Serialize}; -use serde::{Deserializer, Serializer, de, ser}; - -use z85::{decode, encode}; - -use flate2::Compression; -use flate2::read::ZlibDecoder; -use flate2::write::ZlibEncoder; -use std::io::prelude::*; - -#[cfg(feature = "serializable")] -pub mod tokenizer; - -#[derive(Serialize, Deserialize, Debug)] -struct SerializedAutomaton(Vec, SpanningSet, usize); - -impl serde::Serialize for FastAutomaton { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let tokenizer = Tokenizer::new(self); - let number_of_states = self.get_number_of_states(); - match AutomatonToken::to_tokens( - &tokenizer.to_embedding(), - self.get_spanning_set().get_number_of_spanning_ranges(), - number_of_states, - ) { - Ok(tokens) => { - let serialized_automaton = - SerializedAutomaton(tokens, self.get_spanning_set().clone(), number_of_states); - - let mut serialized = Vec::with_capacity(number_of_states * 8); - if let Err(err) = ciborium::into_writer(&serialized_automaton, &mut serialized) { - return Err(ser::Error::custom(err.to_string())); - } - - serializer.serialize_str(&encode(compress_data(&serialized))) - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } - } -} - -impl<'de> serde::Deserialize<'de> for FastAutomaton { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - match String::deserialize(deserializer) { - Ok(decoded) => match decode(decoded) { - Ok(compressed) => { - let payload = decompress_data(&compressed); - - let automaton: Result< - SerializedAutomaton, - ciborium::de::Error, - > = 
ciborium::from_reader(&payload[..]); - match automaton { - Ok(automaton) => { - let mut temp_automaton = FastAutomaton::new_empty(); - temp_automaton.spanning_set = automaton.1; - let number_of_states = automaton.2; - let number_of_bases = - temp_automaton.spanning_set.get_number_of_spanning_ranges(); - let tokenizer = Tokenizer::new(&temp_automaton); - - match tokenizer.from_embedding( - &automaton - .0 - .into_iter() - .map(|t| { - AutomatonToken::from_token( - t, - number_of_bases, - number_of_states, - ) - }) - .collect::>(), - ) { - Ok(res) => Ok(res), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - }, - Err(err) => Err(err), - } - } -} - -fn compress_data(data: &[u8]) -> Vec { - let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default()); - encoder.write_all(data).expect("Failed to write data"); - encoder.finish().expect("Failed to finish compression") -} - -fn decompress_data(data: &[u8]) -> Vec { - let mut decoder = ZlibDecoder::new(data); - let mut decompressed_data = Vec::new(); - decoder - .read_to_end(&mut decompressed_data) - .expect("Failed to read data"); - decompressed_data -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_serialization() -> Result<(), String> { - assert_serialization("..."); - assert_serialization(".*abc"); - assert_serialization(".*"); - assert_serialization(".*abcdef.*dsqd"); - assert_serialization( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", - ); - assert_serialization( - 
"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", - ); - - Ok(()) - } - - fn assert_serialization(regex: &str) { - let regex = RegularExpression::parse(regex, false).unwrap(); - println!("{regex}"); - - let automaton = regex.to_automaton().unwrap(); - - let serialized = serde_json::to_string(&automaton).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = automaton.determinize().unwrap(); - - assert!(automaton.difference(&unserialized).unwrap().is_empty()); - assert!(unserialized.difference(&automaton).unwrap().is_empty()); - } - - #[test] - fn test_serialization_case_1() -> Result<(), String> { - let automaton1 = RegularExpression::parse(".*", false) - .unwrap() - .to_automaton() - .unwrap(); - let automaton2 = RegularExpression::parse("\\d+", false) - .unwrap() - .to_automaton() - .unwrap(); - let automaton2 = automaton2.determinize().unwrap(); - - let difference = automaton1.difference(&automaton2).unwrap(); - - let serialized = serde_json::to_string(&difference).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = difference.determinize().unwrap(); - - assert!(automaton.difference(&unserialized).unwrap().is_empty()); - assert!(unserialized.difference(&automaton).unwrap().is_empty()); - - Ok(()) - } -} diff --git 
a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs b/src/fast_automaton/serializer/tokenizer/embed_automaton.rs deleted file mode 100644 index bfe8197..0000000 --- a/src/fast_automaton/serializer/tokenizer/embed_automaton.rs +++ /dev/null @@ -1,192 +0,0 @@ -use token::TokenError; - -use crate::{error::EngineError, fast_automaton::{condition::Condition, serializer::tokenizer::token::automaton_token::AutomatonToken}, CharRange}; - -use self::token::range_token::RangeToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_embedding(&self) -> Vec { - let mut vec = vec![]; - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::default(); - - worklist.push_front(self.automaton.get_start_state()); - - while let Some(current_state) = worklist.pop_back() { - if !seen.insert(current_state) { - continue; - } - if !vec.is_empty() { - // separator - vec.push(AutomatonToken::SeparatorState) - } - - // state - let embedded_state = - AutomatonToken::State(*self.state_to_token.get(&current_state).unwrap()); - vec.push(embedded_state); - - if self.automaton.is_accepted(current_state) { - // accept state - vec.push(AutomatonToken::AcceptState) - } - - for (condition, to_state) in self.automaton.transitions_from(current_state) { - if condition.is_empty() { - continue; - } - let embedded_state = - AutomatonToken::State(*self.state_to_token.get(to_state).unwrap()); - vec.push(embedded_state); - - if condition.is_total() { - vec.push(AutomatonToken::Range(RangeToken::Total)); - } else { - let range = condition - .to_range(self.automaton.get_spanning_set()) - .expect("It should be possible to convert the condition to range."); - self.range_tokenizer - .range_to_embedding(&range) - .unwrap() - .iter() - .for_each(|&e| { - vec.push(AutomatonToken::Range(e)); - }); - } - - if !seen.contains(to_state) { - worklist.push_front(*to_state); - } - } - } - - vec - } - - pub fn from_embedding(&self, vec: &Vec) -> Result { - let mut automaton = FastAutomaton::new_empty(); - 
automaton.apply_new_spanning_set(self.automaton.get_spanning_set())?; - - let mut from_state = None; - let mut to_state = None; - let mut range = CharRange::empty(); - for token in vec { - match token { - AutomatonToken::Range(r) => { - range = range.union(self.range_tokenizer.token_to_range(r).unwrap()); - } - AutomatonToken::State(s) => { - while !automaton.has_state(*s) { - automaton.new_state(); - } - if let Some(fs) = from_state { - if let Some(ts) = to_state { - Self::apply_transition(&mut automaton, fs, ts, &range)?; - range = CharRange::empty(); - } - to_state = Some(*s); - } else { - from_state = Some(*s); - } - } - AutomatonToken::AcceptState => { - automaton.accept(from_state.unwrap()); - } - AutomatonToken::SeparatorState => { - if let Some(to_state) = to_state { - Self::apply_transition( - &mut automaton, - from_state.unwrap(), - to_state, - &range, - )?; - } - from_state = None; - to_state = None; - range = CharRange::empty(); - } - _ => return Err(EngineError::TokenError(TokenError::UnknownToken)), - }; - } - if let Some(to_state) = to_state { - Self::apply_transition(&mut automaton, from_state.unwrap(), to_state, &range)?; - } - Ok(automaton) - } - - fn apply_transition( - automaton: &mut FastAutomaton, - from_state: State, - to_state: State, - range: &CharRange, - ) -> Result<(), EngineError> { - let condition = Condition::from_range(range, automaton.get_spanning_set())?; - automaton.add_transition(from_state, to_state, &condition); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion("(a|b)"); - assert_embedding_convertion("(|a)"); - assert_embedding_convertion(".*ab"); - assert_embedding_convertion("toto"); - assert_embedding_convertion(".{2,3}"); - assert_embedding_convertion("q(ab|ca|ab|abc)x"); - assert_embedding_convertion(".*q(ab|ca|ab|abc)x"); - assert_embedding_convertion( - 
"((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - ); - assert_embedding_convertion( - "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", - ); - - Ok(()) - } - - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::parse(regex, false).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap(); - let automaton = automaton.determinize().unwrap(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_embedding(); - - let number_of_bases = automaton.get_spanning_set().get_number_of_spanning_ranges(); - let number_of_states = automaton.get_number_of_states(); - - let embedding_usize = - AutomatonToken::to_tokens(&embedding, number_of_bases, number_of_states).unwrap(); - let embedding: Vec = embedding_usize - .iter() - .map(|&t| AutomatonToken::from_token(t, number_of_bases, number_of_states)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!( - automaton - .difference(&unembedded_automaton) - .unwrap() - .is_empty() - ); - assert!( - unembedded_automaton - .difference(&automaton) - .unwrap() - .is_empty() - ); - } -} diff --git a/src/fast_automaton/serializer/tokenizer/mod.rs b/src/fast_automaton/serializer/tokenizer/mod.rs deleted file mode 100644 index 8dccca2..0000000 --- a/src/fast_automaton/serializer/tokenizer/mod.rs +++ /dev/null @@ -1,68 +0,0 @@ -use std::{cmp::Ordering, collections::VecDeque, vec}; - -use 
crate::fast_automaton::serializer::tokenizer::range_tokenizer::RangeTokenizer; -use crate::fast_automaton::spanning_set::SpanningSet; -use crate::{ - IntMap, IntSet, - fast_automaton::{FastAutomaton, State}, -}; -use ahash::HashMapExt; - - -mod embed_automaton; -pub mod range_tokenizer; -pub mod token; - -#[derive(Debug)] -pub struct Tokenizer<'a> { - range_tokenizer: RangeTokenizer<'a>, - automaton: &'a FastAutomaton, - state_to_token: IntMap, -} - -impl Tokenizer<'_> { - pub fn new(automaton: &FastAutomaton) -> Tokenizer<'_> { - let mut worklist = VecDeque::with_capacity(automaton.get_number_of_states()); - let mut seen = IntSet::default(); - - worklist.push_front(automaton.get_start_state()); - - let mut state_counter = 0; - let mut state_to_token = IntMap::with_capacity(automaton.get_number_of_states()); - - while let Some(current_state) = worklist.pop_back() { - if !seen.insert(current_state) { - continue; - } - - state_to_token.insert(current_state, state_counter); - state_counter += 1; - - automaton - .transitions_from(current_state) - .filter(|(c, _)| !c.is_empty()) - .for_each(|(_, to_state)| { - if !seen.contains(to_state) { - worklist.push_front(*to_state); - } - }); - } - - Tokenizer { - range_tokenizer: RangeTokenizer::new(automaton.get_spanning_set()), - automaton, - state_to_token, - } - } - - pub fn get_number_of_spanning_ranges(&self) -> usize { - self.range_tokenizer.get_number_of_spanning_ranges() - } - - pub fn get_spanning_set(&self) -> &SpanningSet { - self.range_tokenizer.get_spanning_set() - } -} - -#[cfg(test)] -mod tests {} diff --git a/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs b/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs deleted file mode 100644 index e3b3c9c..0000000 --- a/src/fast_automaton/serializer/tokenizer/range_tokenizer.rs +++ /dev/null @@ -1,76 +0,0 @@ -use crate::CharRange; - -use self::token::range_token::RangeToken; - -use super::*; - -#[derive(Debug)] -pub struct RangeTokenizer<'a> { - 
spanning_set: &'a SpanningSet, - total: CharRange, -} - -impl RangeTokenizer<'_> { - pub fn get_spanning_set(&self) -> &SpanningSet { - self.spanning_set - } - - pub fn new(spanning_set: &SpanningSet) -> RangeTokenizer<'_> { - let total = spanning_set.get_rest().complement(); - RangeTokenizer { - spanning_set, - total, - } - } - - pub fn range_to_embedding(&self, range: &CharRange) -> Option> { - if range == &self.total { - return Some(vec![RangeToken::Total]); - } else if !range.difference(&self.total).is_empty() { - return None; - } - - let mut vec = vec![]; - for (token, base) in self.spanning_set.get_spanning_ranges().enumerate() { - if range.contains_all(base) { - vec.push(RangeToken::Base(token)); - } - } - vec.sort_unstable(); - - Some(vec) - } - - pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { - if vec.is_empty() { - return Some(CharRange::empty()); - } - - let mut range = CharRange::empty(); - if vec[0] == RangeToken::Total { - return Some(self.total.clone()); - } - - for token in vec { - if let Some(base) = self.token_to_range(token) { - range = range.union(base); - } else { - return None; - } - } - - Some(range) - } - - pub fn token_to_range(&self, token: &RangeToken) -> Option<&CharRange> { - match token { - RangeToken::Total => Some(&self.total), - RangeToken::Base(b) => self.spanning_set.get_spanning_range(*b), - RangeToken::Error => panic!("error token"), - } - } - - pub fn get_number_of_spanning_ranges(&self) -> usize { - self.spanning_set.get_number_of_spanning_ranges() - } -} diff --git a/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs b/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs deleted file mode 100644 index 2e68ded..0000000 --- a/src/fast_automaton/serializer/tokenizer/token/automaton_token.rs +++ /dev/null @@ -1,94 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum AutomatonToken { - Range(RangeToken), - State(usize), - 
AcceptState, - SeparatorState, - Error, -} - -impl Ord for AutomatonToken { - fn cmp(&self, other: &Self) -> Ordering { - match (self, other) { - (AutomatonToken::Range(a), AutomatonToken::Range(b)) => a.cmp(b), - (AutomatonToken::Range(_), _) => Ordering::Less, - (_, AutomatonToken::Range(_)) => Ordering::Greater, - - (AutomatonToken::State(a), AutomatonToken::State(b)) => a.cmp(b), - (AutomatonToken::State(_), _) => Ordering::Less, - (_, AutomatonToken::State(_)) => Ordering::Greater, - - (AutomatonToken::AcceptState, AutomatonToken::AcceptState) => Ordering::Equal, - (AutomatonToken::AcceptState, _) => Ordering::Less, - (_, AutomatonToken::AcceptState) => Ordering::Greater, - - (AutomatonToken::SeparatorState, AutomatonToken::SeparatorState) => Ordering::Equal, - (AutomatonToken::SeparatorState, _) => Ordering::Less, - (_, AutomatonToken::SeparatorState) => Ordering::Greater, - - (AutomatonToken::Error, AutomatonToken::Error) => Ordering::Equal, - } - } -} - -impl PartialOrd for AutomatonToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl AutomatonToken { - pub fn from_token( - token: usize, - number_of_bases: usize, - number_of_states: usize, - ) -> AutomatonToken { - let states = number_of_bases + 1; - let accept_state = states + number_of_states; - let separator_state = accept_state + 1; - if (0..states).contains(&token) { - AutomatonToken::Range(RangeToken::from_token(token, number_of_bases)) - } else if (states..accept_state).contains(&token) { - AutomatonToken::State(token - states) - } else if token == accept_state { - AutomatonToken::AcceptState - } else if token == separator_state { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - pub fn to_token( - &self, - number_of_bases: usize, - number_of_states: usize, - ) -> Result { - let states = number_of_bases + 1; - let accept_state = states + number_of_states; - let separator_state = accept_state + 1; - Ok(match self { - 
AutomatonToken::Range(r) => r.to_token(number_of_bases)?, - AutomatonToken::State(s) => s + states, - AutomatonToken::AcceptState => accept_state, - AutomatonToken::SeparatorState => separator_state, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } - - pub fn to_tokens( - tokens: &[Self], - number_of_bases: usize, - number_of_states: usize, - ) -> Result, TokenError> { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_token(number_of_bases, number_of_states)?); - } - Ok(vec) - } -} diff --git a/src/fast_automaton/serializer/tokenizer/token/mod.rs b/src/fast_automaton/serializer/tokenizer/token/mod.rs deleted file mode 100644 index c510dd4..0000000 --- a/src/fast_automaton/serializer/tokenizer/token/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -use std::fmt::Display; - -use super::*; - -pub mod automaton_token; -pub mod range_token; - -#[derive(Debug, PartialEq, Eq)] -pub enum TokenError { - TokenOutOfBound(&'static str, usize, usize), - UnknownToken, - SyntaxError, -} - -impl Display for TokenError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TokenError::TokenOutOfBound(token, expected, got) => write!( - f, - "TokenOutOfBound: {token}, expected: {expected}, got: {got}." 
- ), - TokenError::UnknownToken => write!(f, "UnknownToken"), - TokenError::SyntaxError => write!(f, "SyntaxError"), - } - } -} \ No newline at end of file diff --git a/src/fast_automaton/serializer/tokenizer/token/range_token.rs b/src/fast_automaton/serializer/tokenizer/token/range_token.rs deleted file mode 100644 index 20ed515..0000000 --- a/src/fast_automaton/serializer/tokenizer/token/range_token.rs +++ /dev/null @@ -1,55 +0,0 @@ -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RangeToken { - Total, - Base(usize), - Error, -} - -impl Ord for RangeToken { - fn cmp(&self, other: &Self) -> Ordering { - match (self, other) { - (RangeToken::Total, RangeToken::Total) => Ordering::Equal, - (RangeToken::Total, _) => Ordering::Less, - (_, RangeToken::Total) => Ordering::Greater, - (RangeToken::Base(a), RangeToken::Base(b)) => a.cmp(b), - (RangeToken::Base(_), RangeToken::Error) => Ordering::Less, - (RangeToken::Error, RangeToken::Base(_)) => Ordering::Greater, - (RangeToken::Error, RangeToken::Error) => Ordering::Equal, - } - } -} - -impl PartialOrd for RangeToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RangeToken { - pub fn from_token(token: usize, number_of_bases: usize) -> RangeToken { - let max_number_of_bases = number_of_bases + 1; - if token == 0 { - RangeToken::Total - } else if (1..max_number_of_bases).contains(&token) { - RangeToken::Base(token - 1) - } else { - RangeToken::Error - } - } - - pub fn to_token(&self, number_of_bases: usize) -> Result { - let max_number_of_bases = number_of_bases + 1; - Ok(match self { - RangeToken::Total => 0, - RangeToken::Base(b) => { - if *b > max_number_of_bases { - return Err(TokenError::TokenOutOfBound("Base", max_number_of_bases, *b)); - } - b + 1 - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 30d2a07..54c3e5c 100644 --- 
a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -2,17 +2,17 @@ use std::slice::Iter; use ahash::AHashSet; -#[cfg(feature = "serializable")] -use serde::{Deserialize, Serialize}; - use crate::CharRange; /// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. -#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, Debug, PartialEq, Eq)] pub struct SpanningSet(Vec, CharRange); impl SpanningSet { + pub fn new(ranges: Vec, rest: CharRange) -> Self { + SpanningSet(ranges, rest) + } + pub fn new_empty() -> Self { SpanningSet(vec![], CharRange::total()) } diff --git a/src/lib.rs b/src/lib.rs index 301bd0e..e1d194c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,6 @@ use nohash_hasher::NoHashHasher; use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serializable")] -use serde::{Deserialize, Serialize}; use crate::execution_profile::ExecutionProfile; @@ -83,13 +81,9 @@ pub type CharRange = RangeSet; /// ``` /// /// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. -#[cfg_attr(feature = "serializable", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serializable", serde(tag = "type", content = "value"))] pub enum Term { - #[cfg_attr(feature = "serializable", serde(rename = "regex"))] RegularExpression(RegularExpression), - #[cfg_attr(feature = "serializable", serde(rename = "fair"))] Automaton(FastAutomaton), } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index a7682f6..bdc5484 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -11,8 +11,6 @@ use super::*; mod analyze; mod builder; mod operation; -#[cfg(feature = "serializable")] -mod serializer; /// Represent a regular expression. 
#[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] From cc793fff3a204a7df9df1ff10cf528de0e3894dc Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:30:35 +0200 Subject: [PATCH 52/62] Make remove_dead_transitions public --- src/fast_automaton/operation/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index c34ca66..9faa952 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -11,7 +11,7 @@ mod repeat; mod union; impl FastAutomaton { - pub(crate) fn remove_dead_transitions(&mut self) { + pub fn remove_dead_transitions(&mut self) { if !self.is_empty() { let reacheable_states = self.get_reachable_states(); From 5856ebcd1cb32406e48881c4b1a7f739ef1b00ba Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:33:15 +0200 Subject: [PATCH 53/62] Rename method --- src/fast_automaton/operation/intersection.rs | 2 +- src/fast_automaton/operation/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 470e7bc..c20fd04 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -123,7 +123,7 @@ impl FastAutomaton { } } new_automaton.spanning_set = new_spanning_set; - new_automaton.remove_dead_transitions(); + new_automaton.remove_unreachable_states(); Ok(Cow::Owned(new_automaton)) } diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 9faa952..72a1380 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -11,7 +11,7 @@ mod repeat; mod union; impl FastAutomaton { - pub fn remove_dead_transitions(&mut self) { + pub fn remove_unreachable_states(&mut 
self) { if !self.is_empty() { let reacheable_states = self.get_reachable_states(); From 4b41ff3d6a86000e2ba38c83697dc59b3826bd0b Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 6 May 2026 19:42:19 +0200 Subject: [PATCH 54/62] Add new method to unaccept a state --- src/fast_automaton/builder.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index afdf999..e57312a 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -80,6 +80,14 @@ impl FastAutomaton { self.accept_states.insert(state); } + /// Marks the provided state as a non-accepting state. + #[inline] + pub fn unaccept(&mut self, state: State) { + self.assert_state_exists(state); + self.minimal = false; + self.accept_states.remove(&state); + } + /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. /// /// This method accepts a [`Condition`] rather than a raw character set. 
To build a [`Condition`], call: From 9bba5b4e080620433ac6325d86c59d9dcc81d723 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sat, 23 May 2026 18:14:54 +0200 Subject: [PATCH 55/62] Make methods public --- src/fast_automaton/analyze/mod.rs | 2 +- src/fast_automaton/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index f753dcb..4c800b5 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -73,7 +73,7 @@ impl FastAutomaton { live } - pub(crate) fn get_spanning_bases(&self) -> Result, EngineError> { + pub fn get_spanning_bases(&self) -> Result, EngineError> { self.spanning_set .get_spanning_ranges() .map(|range| Condition::from_range(range, &self.spanning_set)) diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index b2cc6d1..10d4eb2 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -218,7 +218,7 @@ impl FastAutomaton { /// Assert the automaton is deterministic. 
#[inline] - pub(crate) fn assert_deterministic(&self) { + pub fn assert_deterministic(&self) { assert!(self.deterministic, "The automaton should be deterministic."); } From 2529e58adc09882b67fbadb3f5f960d0fbc1343a Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Wed, 3 Jun 2026 21:49:57 +0200 Subject: [PATCH 56/62] A lot of small bug fixes --- src/fast_automaton/analyze/equivalence.rs | 4 +- src/fast_automaton/analyze/length.rs | 52 +++++- src/fast_automaton/analyze/mod.rs | 165 +++++++++++++++++-- src/fast_automaton/analyze/subset.rs | 11 +- src/fast_automaton/builder.rs | 77 +++++++++ src/fast_automaton/condition/mod.rs | 4 +- src/fast_automaton/convert/to_regex/mod.rs | 2 +- src/fast_automaton/mod.rs | 145 +++++++++++----- src/fast_automaton/operation/concat.rs | 43 ++++- src/fast_automaton/operation/difference.rs | 69 ++++++-- src/fast_automaton/operation/intersection.rs | 25 +++ src/fast_automaton/operation/minimize.rs | 8 +- src/fast_automaton/operation/repeat.rs | 53 ++++++ src/lib.rs | 56 ++----- src/regex/builder.rs | 11 ++ 15 files changed, 607 insertions(+), 118 deletions(-) diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 3f70711..070e206 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -72,10 +72,10 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.equivalent(&automaton_1).unwrap()); + assert!(automaton_1.equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.equivalent(&automaton_2).unwrap()); + assert!(automaton_2.equivalent(&automaton_2).unwrap()); assert_eq!( expected, diff --git a/src/fast_automaton/analyze/length.rs 
b/src/fast_automaton/analyze/length.rs index c753908..effc6b7 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -2,13 +2,25 @@ use super::*; impl FastAutomaton { /// Returns the minimum and maximum length of matched strings. + /// + /// Cycles are only treated as "language-extending" if they sit on an + /// accepting path. Cycles among dead states (states that can't reach any + /// accept) don't extend the language and therefore don't make the max + /// infinite. pub fn get_length(&self) -> (Option, Option) { if self.is_empty() { return (None, None); - } else if self.is_total(){ + } else if self.is_total() { return (Some(0), None); } + // States that lie on some accepting path. Walking only these prunes + // dead branches whose cycles cannot extend the language. + let live = self.get_reachable_states(); + if !live.contains(&self.start_state) { + return (None, None); + } + let mut min = None; let mut is_infinite = false; @@ -28,6 +40,9 @@ impl FastAutomaton { seen.insert(state); for to_state in self.direct_states(state) { + if !live.contains(&to_state) { + continue; + } if to_state == state || seen.contains(&to_state) { is_infinite = true; continue; @@ -55,6 +70,9 @@ impl FastAutomaton { seen.insert(state); for to_state in self.direct_states(state) { + if !live.contains(&to_state) { + continue; + } if to_state == state || seen.contains(&to_state) { max = None; break; @@ -65,4 +83,36 @@ impl FastAutomaton { (min, max) } +} + +#[cfg(test)] +mod tests { + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + // Regression: `get_length` used to set `max = None` on any cycle + // reachable from start, even dead cycles among non-accepting states that + // cannot reach an accept. Such cycles don't extend the language; the + // max must remain finite. Now fixed by filtering branches to the live + // (co-reachable-from-accept) subgraph. 
+ #[test] + fn get_length_handles_dead_cycle() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.accept(0); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.add_transition(s2, s1, &cond); + // s1, s2 not accepting → language is {""} only. + + let (min, max) = a.get_length(); + assert_eq!(min, Some(0), "min length of {{\"\"}} is 0"); + assert_eq!( + max, + Some(0), + "max length of {{\"\"}} is 0; got {max:?} (cycle is dead, shouldn't extend the language)" + ); + } } \ No newline at end of file diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 4c800b5..c9c2404 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,30 +10,121 @@ mod length; mod subset; impl FastAutomaton { - /// Checks if the automaton matches the empty language. There can be false negative if the automaton is not minimal. - #[inline] + /// Checks if the automaton matches the empty language. + /// + /// Sound and complete: works on NFAs and non-minimal automata without + /// requiring determinization or minimization. O(V + E) worst case, with + /// O(1) fast paths for the common cases. pub fn is_empty(&self) -> bool { - self.accept_states.is_empty() + if self.accept_states.is_empty() { + return true; + } + if self.accept_states.contains(&self.start_state) { + return false; + } + if self.minimal { + // A minimal automaton with at least one accept state has a + // non-empty language (minimization prunes dead accepts). + return false; + } + + // Forward BFS from `start_state`; stop on first accept hit. 
+ let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + + while let Some(s) = worklist.pop_front() { + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + if self.accept_states.contains(to) { + return false; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + } + true } - /// Checks if the automaton matches all possible strings. There can be false negative if the automaton is not minimal. - #[inline] + /// Checks if the automaton matches all possible strings. + /// + /// Sound and complete for **deterministic** automata: a DFA's language + /// equals Σ\* iff every reachable state is accepting AND its outgoing + /// conditions union to Σ. For NFAs this is sound but conservative — + /// alternative paths may cover a character that no single reachable + /// state covers, so callers that need an exact answer on an NFA should + /// determinize first. + /// + /// O(V + E) plus one condition-union per outgoing transition. pub fn is_total(&self) -> bool { - if self.accept_states.contains(&self.start_state) - && let Some(condition) = self.transitions[self.start_state].get(&self.start_state) - { - return condition.is_total(); + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + + while let Some(s) = worklist.pop_front() { + if !self.accept_states.contains(&s) { + return false; + } + let mut covered = Condition::empty(&self.spanning_set); + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + covered = covered.union(cond); + if visited.insert(*to) { + worklist.push_back(*to); + } + } + if !covered.is_total() { + return false; + } } - - false + true } - /// Checks if the automaton only matches the empty string `""`. There can be false negative if the automaton is not minimal. 
- #[inline] + /// Checks if the automaton only matches the empty string `""`. + /// + /// Sound and complete on any automaton (DFA or NFA): the language equals + /// `{""}` iff start is accepting AND no state reachable from start by at + /// least one non-empty transition is, or can reach, an accept state. + /// O(V + E). pub fn is_empty_string(&self) -> bool { - self.accept_states.len() == 1 - && self.accept_states.contains(&self.start_state) - && self.in_degree(self.start_state) == 0 + if !self.accept_states.contains(&self.start_state) { + return false; + } + + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + + // Seed with states reachable in exactly one non-empty step from start. + for (cond, to) in self.transitions_from(self.start_state) { + if cond.is_empty() { + continue; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + + while let Some(s) = worklist.pop_front() { + if self.accept_states.contains(&s) { + return false; + } + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + } + true } /// Returns the set of all states reachable from the start state. @@ -73,6 +164,48 @@ impl FastAutomaton { live } + /// Recomputes from the transition graph whether the automaton contains + /// a cycle. Use this to refresh the [`is_cyclic`](Self::is_cyclic) cache + /// after operations that don't maintain it. + /// + /// Kahn's algorithm: a directed graph has a cycle iff topological sort + /// fails to consume all nodes. O(V + E). 
+ pub(crate) fn detect_cyclic(&self) -> bool { + let total = self.get_number_of_states(); + if total == 0 { + return false; + } + + let mut in_deg: IntMap = IntMap::default(); + for s in self.states() { + in_deg.entry(s).or_insert(0); + for t in self.direct_states(s) { + *in_deg.entry(t).or_insert(0) += 1; + } + } + + let mut queue: VecDeque = in_deg + .iter() + .filter(|(_, d)| **d == 0) + .map(|(s, _)| *s) + .collect(); + + let mut processed = 0usize; + while let Some(s) = queue.pop_front() { + processed += 1; + for t in self.direct_states(s) { + if let Some(d) = in_deg.get_mut(&t) { + *d -= 1; + if *d == 0 { + queue.push_back(t); + } + } + } + } + + processed != total + } + pub fn get_spanning_bases(&self) -> Result, EngineError> { self.spanning_set .get_spanning_ranges() diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index e4ca7d6..9dd124c 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -7,8 +7,13 @@ impl FastAutomaton { pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); - } else if other.is_empty() || self.is_total() { + } else if other.is_empty() { return Ok(false); + } else if self.is_total() { + // self ⊆ other iff Σ* ⊆ other iff other = Σ*. We already failed + // the cheap `other.is_total()` check above; that check is sound + // but conservative on NFAs, so retry on the determinized form. 
+ return Ok(other.determinize()?.is_total()); } let mut other = other.determinize()?.into_owned(); @@ -80,10 +85,10 @@ mod tests { ) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.subset(&automaton_1).unwrap()); + assert!(automaton_1.subset(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.subset(&automaton_2).unwrap()); + assert!(automaton_2.subset(&automaton_2).unwrap()); assert_eq!( expected_1_2, diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index e57312a..b0e6462 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -38,6 +38,7 @@ impl FastAutomaton { automaton.accept(automaton.start_state); automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); automaton.minimal = true; + automaton.cyclic = true; automaton } @@ -164,6 +165,39 @@ impl FastAutomaton { }; } + /// Adds a transition, but refuses if it would turn a DFA into an NFA. + /// + /// On `Err(DeterminismLost)` the automaton is left untouched; on `Ok`, + /// the transition has been added and `is_deterministic()` still holds + /// (provided it held before the call). This is the opt-in strict + /// counterpart to [`add_transition`](Self::add_transition). 
+ pub fn try_add_transition( + &mut self, + from_state: State, + to_state: State, + new_cond: &Condition, + ) -> Result<(), super::DeterminismLost> { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + if new_cond.is_empty() { + return Ok(()); + } + if self.deterministic { + for (condition, state) in self.transitions_from(from_state) { + if *state == to_state { + continue; + } + if condition.has_intersection(new_cond) { + return Err(super::DeterminismLost); + } + } + } + self.add_transition(from_state, to_state, new_cond); + Ok(()) + } + /// Creates a new epsilon transition between the two states. pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { @@ -298,6 +332,15 @@ impl FastAutomaton { transitions.remove(state); } } + + for state in &states_to_remove { + self.transitions_in.remove(state); + } + for predecessors in self.transitions_in.values_mut() { + for state in &states_to_remove { + predecessors.remove(state); + } + } } /// Recompute a minimal spanning set for the automaton and apply it. @@ -364,8 +407,42 @@ impl FastAutomaton { #[cfg(test)] mod tests { + use crate::IntSet; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; use crate::regex::RegularExpression; + // Regression: `remove_states` used to skip the `transitions_in` cleanup + // that the single-state variant `remove_state` performs (drop entries + // keyed by removed states; purge them from surviving predecessor sets). + // Without that cleanup, `in_degree` of removed states stayed stale and + // any caller (repeat, concat, union, difference, to_regex) would see + // wrong values. 
+ #[test] + fn remove_states_cleans_transitions_in() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + + assert_eq!(a.in_degree(s1), 1); + assert_eq!(a.in_degree(s2), 1); + + let mut to_remove = IntSet::default(); + to_remove.insert(s1); + a.remove_states(&to_remove); + + // After removing s1, its in_degree should report 0 (or, equivalently, + // queries on a removed state should be a clean no-op). Currently it + // still reports the pre-removal count. + assert_eq!(a.in_degree(s1), 0, "in_degree of removed state should be 0"); + assert_eq!(a.in_degree(s2), 1); + } + #[test] fn test_regex_build_deterministic_automaton() -> Result<(), String> { assert_regex_build_deterministic_automaton("...", true); diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 122ccc9..3cbeea5 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -357,12 +357,12 @@ mod tests { let condition_2 = Condition::from_range(range_2, used_characters).unwrap(); assert_eq!( - Condition::empty(&used_characters), + Condition::empty(used_characters), condition_1.intersection(&condition_1.complement()) ); assert_eq!( - Condition::empty(&used_characters), + Condition::empty(used_characters), condition_2.intersection(&condition_2.complement()) ); diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index 4671a9d..acbeab0 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -173,7 +173,7 @@ mod tests { automaton.complement().unwrap(); - let result = format!("^{}$", automaton.to_regex().to_string()); + let result = format!("^{}$", automaton.to_regex()); println!("{result}"); diff --git a/src/fast_automaton/mod.rs 
b/src/fast_automaton/mod.rs index 10d4eb2..7cd1014 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -36,6 +36,22 @@ pub struct FastAutomaton { cyclic: bool, } +/// Returned by [`FastAutomaton::try_add_transition`] when adding the requested +/// condition would turn a DFA into an NFA. The automaton is left unchanged. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DeterminismLost; + +impl std::fmt::Display for DeterminismLost { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "adding the transition would introduce overlapping conditions" + ) + } +} + +impl std::error::Error for DeterminismLost {} + impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; @@ -86,8 +102,12 @@ impl FastAutomaton { } /// Returns the number of transitions from the provided state. + /// Returns `0` if the state does not exist. #[inline] pub fn out_degree(&self, state: State) -> usize { + if !self.has_state(state) { + return 0; + } self.transitions[state].len() } @@ -104,12 +124,14 @@ impl FastAutomaton { } /// Returns an iterator over states directly reachable from the given state in one transition. + /// Returns an empty iterator if the state does not exist. #[inline] pub fn direct_states(&self, state: State) -> impl Iterator + '_ { - self.transitions[state] - .keys() - .cloned() - .filter(|s| !self.removed_states.contains(s)) + self.transitions.get(state).into_iter().flat_map(move |t| { + t.keys() + .copied() + .filter(|s| !self.removed_states.contains(s)) + }) } /// Returns a vector of states directly reachable from the given state in one transition. @@ -133,22 +155,29 @@ impl FastAutomaton { } /// Returns a vector of transitions from the given state. + /// Returns an empty vector if the state does not exist. 
#[inline] pub fn transitions_from_vec(&self, state: State) -> Vec<(Condition, State)> { - self.transitions[state] - .iter() - .map(|(s, c)| (c.clone(), *s)) - .filter(|s| !self.removed_states.contains(&s.1)) - .collect() + self.transitions + .get(state) + .map(|t| { + t.iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + }) + .unwrap_or_default() } /// Returns an iterator over transitions from the given state. + /// Returns an empty iterator if the state does not exist. #[inline] pub fn transitions_from(&self, state: State) -> impl Iterator { - self.transitions[state] - .iter() - .map(|(s, c)| (c, s)) - .filter(|s| !self.removed_states.contains(s.1)) + self.transitions.get(state).into_iter().flat_map(move |t| { + t.iter() + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) + }) } /// Returns `true` if there is a directed transition from `from_state` to `to_state`. @@ -181,9 +210,12 @@ impl FastAutomaton { } /// Returns a reference to the condition of the directed transition between the two states, if any. + /// Returns `None` if either state does not exist. #[inline] pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { - self.transitions[from_state].get(&to_state) + self.transitions + .get(from_state) + .and_then(|t| t.get(&to_state)) } /// Returns the start state. @@ -211,17 +243,17 @@ impl FastAutomaton { } /// Returns `true` if the automaton is deterministic. + /// + /// Note: this flag degrades monotonically. Once `add_transition` introduces + /// an overlapping condition, the flag flips to `false` and is not + /// re-checked by `remove_transition` or `remove_state`. The automaton may + /// in fact be deterministic again after such removals; call + /// [`determinize`](Self::determinize) if you need a fresh DFA. #[inline] pub fn is_deterministic(&self) -> bool { self.deterministic } - /// Assert the automaton is deterministic. 
- #[inline] - pub fn assert_deterministic(&self) { - assert!(self.deterministic, "The automaton should be deterministic."); - } - /// Returns `true` if the automaton is minimal. #[inline] pub fn is_minimal(&self) -> bool { @@ -242,30 +274,30 @@ impl FastAutomaton { /// Returns `true` if the automaton matches the given string. pub fn is_match(&self, string: &str) -> bool { - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); - worklist.push_back((0, &self.start_state)); + let mut current: IntSet = IntSet::default(); + current.insert(self.start_state); - while let Some((position, current_state)) = worklist.pop_back() { - if string.len() == position { - if self.accept_states.contains(current_state) { - return true; - } - continue; + let mut next: IntSet = IntSet::default(); + for c in string.chars() { + if current.is_empty() { + return false; } - let curr_char = string.chars().nth(position).unwrap() as u32; - for (cond, to_state) in self.transitions_from(*current_state) { - if cond.has_character(&curr_char, &self.spanning_set).unwrap() { - if position + 1 == string.len() { - if self.accept_states.contains(to_state) { - return true; - } - } else { - worklist.push_back((position + 1, to_state)); + let c_u32 = c as u32; + next.clear(); + for &state in ¤t { + for (cond, to_state) in self.transitions_from(state) { + if cond + .has_character(&c_u32, &self.spanning_set) + .unwrap_or(false) + { + next.insert(*to_state); } } } + std::mem::swap(&mut current, &mut next); } - false + + current.iter().any(|s| self.accept_states.contains(s)) } /// Returns the automaton's DOT representation. @@ -311,4 +343,41 @@ mod tests { Ok(()) } + + // Regression: `new_total` constructs an automaton with a total self-loop + // on the start state. It must report `cyclic = true` from construction. 
+ #[test] + fn new_total_reports_cyclic() { + let a = FastAutomaton::new_total(); + assert!( + a.is_cyclic(), + "new_total has a total self-loop on start, must be cyclic" + ); + } + + // Regression: read-only query methods used to directly index + // `self.transitions[state]` without checking `has_state` first, panicking + // on out-of-range inputs. They now return gracefully (0 / None / empty + // iterator). + #[test] + fn out_degree_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert_eq!(a.out_degree(999), 0); + } + + #[test] + fn get_condition_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert!(a.get_condition(999, 0).is_none()); + assert!(a.get_condition(0, 999).is_none()); + } + + #[test] + fn direct_states_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert_eq!(a.direct_states(999).count(), 0); + assert_eq!(a.transitions_from(999).count(), 0); + assert!(a.transitions_from_vec(999).is_empty()); + assert!(a.direct_states_vec(999).is_empty()); + } } diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 1755c04..b0d4e44 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -29,9 +29,16 @@ impl FastAutomaton { .assert_max_number_of_states(self.concat_state_count_heuristic(other))?; if other.is_empty() { + self.make_empty(); + return Ok(()); + } else if other.is_empty_string() { return Ok(()); } + if self.is_empty() { + self.make_empty(); + return Ok(()); + } else if self.is_empty_string() { self.apply_model(other); return Ok(()); } @@ -131,13 +138,15 @@ impl FastAutomaton { } pub(crate) fn concat_state_count_heuristic(&self, other: &FastAutomaton) -> usize { - // Edge Case 1: If the other automaton is empty, the state count doesn't change. 
if other.is_empty() { + return 1; + } else if other.is_empty_string() { return self.get_number_of_states(); } - // Edge Case 2: If this automaton is empty, the resulting state count is just the other's. if self.is_empty() { + return 1; + } else if self.is_empty_string() { return other.get_number_of_states(); } @@ -165,6 +174,36 @@ impl FastAutomaton { mod tests { use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; + #[test] + fn bug_concat_empty_left() { + let e = FastAutomaton::new_empty(); + let t = FastAutomaton::new_total(); + let r = e.concat(&t).unwrap(); + assert!(r.is_empty(), "∅ · Σ* must be ∅, got something non-empty"); + } + + #[test] + fn bug_concat_empty_right() { + let e = FastAutomaton::new_empty(); + let t = FastAutomaton::new_total(); + let r = t.concat(&e).unwrap(); + assert!(r.is_empty(), "Σ* · ∅ must be ∅, got something non-empty"); + } + + #[test] + fn bug_term_concat_with_empty() { + use crate::Term; + let a = Term::from_automaton( + RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(), + ); + let e = Term::from_automaton(FastAutomaton::new_empty()); + let r = a.concat(&[e]).unwrap(); + assert!(r.is_empty().unwrap(), "'abc' · ∅ must be ∅"); + } + #[test] fn test_simple_concatenation_regex() -> Result<(), String> { let automaton = RegularExpression::parse("abc", false) diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 6ac8de6..0d9b105 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -5,9 +5,10 @@ use crate::EngineError; use super::*; impl FastAutomaton { - /// Totalize the automaton; it must be deterministic. + /// Totalize the automaton. Precondition: `self.deterministic` is true + /// (the only caller, `complement`, determinizes first). 
fn totalize(&mut self) -> Result<(), EngineError> { - self.assert_deterministic(); + debug_assert!(self.deterministic, "totalize requires a DFA"); let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = @@ -38,14 +39,23 @@ impl FastAutomaton { self.apply_new_spanning_set(&new_spanning_set)?; if self.in_degree(crash_state) == 1 { + // Only the self-loop points to crash; nothing else needs it. self.remove_state(crash_state); + } else { + // crash_state has incoming edges from real states and a total + // self-loop, so the automaton now contains a cycle. + self.cyclic = true; } Ok(()) } - /// Complements the automaton; it must be deterministic. + /// Complements the automaton. + /// + /// If `self` is non-deterministic, it is determinized in place first. pub fn complement(&mut self) -> Result<(), EngineError> { - self.assert_deterministic(); + if !self.deterministic { + *self = self.determinize()?.into_owned(); + } self.totalize()?; let mut new_accept_states = IntSet::default(); @@ -60,16 +70,51 @@ impl FastAutomaton { Ok(()) } - /// Computes the difference between `self` and `other`. `other` must be deterministic. + /// Computes the difference between `self` and `other`. + /// + /// If `other` is non-deterministic, it is determinized first. pub fn difference(&self, other: &FastAutomaton) -> Result { - other.assert_deterministic(); - let mut complement = other.clone(); - match complement.complement() { - Ok(()) => self.intersection(&complement), - Err(err) => Err(err), - } + let mut complement = other.determinize()?.into_owned(); + complement.complement()?; + self.intersection(&complement) } } #[cfg(test)] -mod tests {} +mod tests { + use crate::fast_automaton::FastAutomaton; + use crate::regex::RegularExpression; + + // Regression: `totalize` adds a `crash_state` with a total self-loop + // whenever the input isn't already total. That self-loop makes the + // automaton cyclic. The flag is now updated when the crash state + // survives. 
+ #[test] + fn complement_updates_cyclic_flag() { + let mut a = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(!a.is_cyclic(), "precondition: 'abc' is acyclic"); + + a.complement().unwrap(); + + assert!(a.is_match("x")); + assert!(a.is_match("xx")); + assert!(a.is_match("xxxxxxxxxx")); + + assert!( + a.is_cyclic(), + "complement of finite acyclic must be cyclic (crash self-loop)" + ); + } + + // Regression: empty.complement() = Σ* which is cyclic. Same root cause + // as above plus `new_total` flag fix. + #[test] + fn complement_of_empty_is_cyclic() { + let mut a = FastAutomaton::new_empty(); + a.complement().unwrap(); + assert!(a.is_cyclic(), "Σ* must report cyclic"); + } +} diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index c20fd04..ba60671 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -124,6 +124,7 @@ impl FastAutomaton { } new_automaton.spanning_set = new_spanning_set; new_automaton.remove_unreachable_states(); + new_automaton.cyclic = new_automaton.detect_cyclic(); Ok(Cow::Owned(new_automaton)) } @@ -213,6 +214,30 @@ impl FastAutomaton { mod tests { use crate::regex::RegularExpression; + // Regression: `intersection` builds its result from `new_empty()` + // (cyclic=false) and used to leave the flag at false even when the + // result contained cycles. `Term::difference` and `get_cardinality` both + // branch on `is_cyclic`; a stale false would route them incorrectly. + // Fixed by recomputing the flag at the end of `intersection_internal`. + #[test] + fn intersection_recomputes_cyclic_flag() { + let a_star = RegularExpression::parse("a*", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!( + a_star.is_cyclic(), + "precondition: a* should be cyclic in the source automaton" + ); + + // a* ∩ a* = a*, which is cyclic. 
+ let inter = a_star.intersection(&a_star).unwrap(); + assert!( + inter.is_cyclic(), + "intersection of two cyclic automata accepting a* should be cyclic" + ); + } + #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs index da3ec9f..6b7c12a 100644 --- a/src/fast_automaton/operation/minimize.rs +++ b/src/fast_automaton/operation/minimize.rs @@ -1,9 +1,13 @@ use super::*; impl FastAutomaton { - /// Minimizes a deterministic automaton using Hopcroft's Algorithm. + /// Minimizes the automaton using Hopcroft's Algorithm. + /// + /// If `self` is non-deterministic, it is determinized in place first. pub fn minimize(&mut self) -> Result<(), EngineError> { - self.assert_deterministic(); + if !self.deterministic { + *self = self.determinize()?.into_owned(); + } let max_states = self.transitions.len(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 72dbb83..ddabaf7 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -22,6 +22,16 @@ impl FastAutomaton { return Ok(()); } + // Empty language: ∅⁰ = {""}, ∅ⁿ = ∅ for n ≥ 1. The general algorithm + // below assumes at least one accept state when installing loop-backs + // for unbounded repeats; bail out here before it can panic. + if self.accept_states.is_empty() { + if min == 0 { + self.accept(self.start_state); + } + return Ok(()); + } + let automaton_to_repeat = self.clone(); if min == 0 && self.in_degree(self.start_state) != 0 { @@ -194,8 +204,51 @@ impl FastAutomaton { #[cfg(test)] mod tests { + use crate::fast_automaton::FastAutomaton; use crate::regex::RegularExpression; + // BUG: `repeat(0, Some(0))` on a non-empty language returns L ∪ {""} + // instead of just {""}. 
After the (effectively no-op) main loop, the + // code unconditionally calls `accept(start_state)` when min == 0, + // adding "" to the language WITHOUT first reducing the automaton to + // {""}. The result is the union of the original language and the + // empty string. + // + // Repro: "abc".repeat(0, Some(0)) should match "" only; it currently + // also matches "abc". + #[test] + #[ignore = "known bug: repeat(0, 0) returns L ∪ {\"\"} instead of {\"\"}"] + fn bug_repeat_zero_zero_on_non_empty() { + let a = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + let r = a.repeat(0, Some(0)).unwrap(); + assert!(r.is_match(""), "L^0 must contain \"\""); + assert!( + !r.is_match("abc"), + "L^0 must NOT contain L (got 'abc' match)" + ); + } + + // Regression: empty.repeat(_, None) used to panic on + // `accept_states.iter().next().unwrap()` at repeat.rs:63 because the + // unbounded-repeat branch assumed at least one accept state. Language + // theory: ∅* = {""} and ∅⁺ = ∅; both must be returnable without panic. + #[test] + fn empty_repeat_unbounded_does_not_panic() { + let empty = FastAutomaton::new_empty(); + // Expected: ∅* = {""}. + let r = empty.repeat(0, None).expect("should not error"); + assert!(r.is_match("")); + assert!(!r.is_match("a")); + + // Expected: ∅⁺ = ∅. + let r = empty.repeat(1, None).expect("should not error"); + assert!(!r.is_match("")); + assert!(!r.is_match("a")); + } + #[test] fn test_repeat_1() -> Result<(), String> { let automaton = RegularExpression::parse("(a*,a*)?", false) diff --git a/src/lib.rs b/src/lib.rs index e1d194c..123daa4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -460,54 +460,32 @@ impl Term { /// Checks if the term matches the empty language. 
pub fn is_empty(&self) -> Result { Ok(match self { - Term::RegularExpression(regular_expression) => regular_expression.is_empty(), - Term::Automaton(fast_automaton) => { - if fast_automaton.is_minimal() { - fast_automaton.is_empty() - } else if fast_automaton.is_empty() { - true - } else { - let mut fast_automaton = fast_automaton.determinize()?.into_owned(); - fast_automaton.minimize()?; - fast_automaton.is_empty() - } - } + Term::RegularExpression(regex) => regex.is_empty(), + Term::Automaton(automaton) => automaton.is_empty(), }) } /// Checks if the term matches all possible strings. pub fn is_total(&self) -> Result { - Ok(match self { - Term::RegularExpression(regular_expression) => regular_expression.is_total(), - Term::Automaton(fast_automaton) => { - if fast_automaton.is_minimal() { - fast_automaton.is_total() - } else if fast_automaton.is_total() { - true + match self { + Term::RegularExpression(regex) => Ok(regex.is_total()), + Term::Automaton(automaton) => { + if automaton.is_total() { + Ok(true) + } else if automaton.is_deterministic() { + Ok(false) } else { - let mut fast_automaton = fast_automaton.determinize()?.into_owned(); - fast_automaton.minimize()?; - fast_automaton.is_total() + Ok(automaton.determinize()?.is_total()) } } - }) + } } /// Checks if the term matches only the empty string `""`. 
pub fn is_empty_string(&self) -> Result { Ok(match self { - Term::RegularExpression(regular_expression) => regular_expression.is_empty_string(), - Term::Automaton(fast_automaton) => { - if fast_automaton.is_minimal() { - fast_automaton.is_empty_string() - } else if fast_automaton.is_empty_string() { - true - } else { - let mut fast_automaton = fast_automaton.determinize()?.into_owned(); - fast_automaton.minimize()?; - fast_automaton.is_empty_string() - } - } + Term::RegularExpression(regex) => regex.is_empty_string(), + Term::Automaton(automaton) => automaton.is_empty_string(), }) } @@ -619,7 +597,7 @@ mod tests { let complement = term.complement().unwrap(); assert!( - term.intersection(&[complement.clone()]) + term.intersection(std::slice::from_ref(&complement)) .unwrap() .is_empty() .unwrap() @@ -648,7 +626,7 @@ mod tests { let regex1 = Term::from_pattern("a").unwrap(); let regex2 = Term::from_pattern("b").unwrap(); - let intersection = regex1.intersection(&vec![regex2]).unwrap(); + let intersection = regex1.intersection(&[regex2]).unwrap(); assert!(intersection.is_empty().unwrap()); assert_eq!("[]", intersection.to_pattern()); @@ -689,7 +667,7 @@ mod tests { let regex1 = Term::from_pattern("a*").unwrap(); let regex2 = Term::from_pattern("b*").unwrap(); - let result = regex1.intersection(&vec![regex2]); + let result = regex1.intersection(&[regex2]); assert!(result.is_ok()); let result = result.unwrap().to_pattern(); assert_eq!("", result); @@ -702,7 +680,7 @@ mod tests { let regex1 = Term::from_pattern("x*").unwrap(); let regex2 = Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.intersection(&vec![regex2]); + let result = regex1.intersection(&[regex2]); assert!(result.is_ok()); let result = result.unwrap().to_pattern(); assert_eq!("(x{3})*", result); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 727bcfe..a77871d 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -290,6 +290,17 @@ mod tests { 
assert!(!automaton.is_match("aAb")); assert!(RegularExpression::new("\\1").is_err()); + + let two_chars = RegularExpression::new("..") + .unwrap() + .to_automaton() + .unwrap(); + assert!(two_chars.is_match("aé")); + assert!(two_chars.is_match("éa")); + assert!(two_chars.is_match("éé")); + assert!(!two_chars.is_match("é")); + assert!(!two_chars.is_match("aéa")); + Ok(()) } From d8d8fef24d43c37377ef50d0d74b47b8af083b73 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Thu, 4 Jun 2026 21:18:56 +0200 Subject: [PATCH 57/62] WIP: Improve testing and redesign lib --- Cargo.toml | 1 + README.md | 3 +- src/fast_automaton/analyze/cardinality.rs | 18 +- src/fast_automaton/analyze/length.rs | 2 +- src/fast_automaton/analyze/mod.rs | 85 ++- src/fast_automaton/builder.rs | 8 +- src/fast_automaton/mod.rs | 18 - src/fast_automaton/operation/concat.rs | 27 +- src/fast_automaton/operation/determinize.rs | 39 ++ src/fast_automaton/operation/difference.rs | 27 +- src/fast_automaton/operation/intersection.rs | 24 +- src/fast_automaton/operation/minimize.rs | 13 + src/fast_automaton/operation/mod.rs | 11 +- src/fast_automaton/operation/repeat.rs | 208 ++++--- src/fast_automaton/operation/union.rs | 147 ++++- src/lib.rs | 31 +- src/regex/analyze/mod.rs | 17 +- src/regex/analyze/number_of_states.rs | 47 +- src/regex/operation/simplify.rs | 28 +- src/regex/operation/union.rs | 40 +- tests/proptest_strategies.rs | 566 +++++++++++++++++++ 21 files changed, 1096 insertions(+), 264 deletions(-) create mode 100644 tests/proptest_strategies.rs diff --git a/Cargo.toml b/Cargo.toml index d67f846..25c0888 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ indexmap = "2.13.0" criterion = { version = "0.5", features = ["html_reports"] } env_logger = "0.11.3" serde_json = "1.0.114" +proptest = "1" [[bench]] name = "my_benchmark" diff --git a/README.md b/README.md index 652520a..d5a2885 100644 --- a/README.md +++ b/README.md @@ -203,14 
+203,13 @@ This design allows us to perform unions, intersections, and complements of trans | `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | | `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | | `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. | -| `get_reachable_states(&self)` | `IntSet` | Returns the set of all states reachable from the start state. | +| `get_live_states(&self)` | `IntSet` | Returns the set of "live" states: those that can reach an accept state. | | `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | | `get_start_state(&self)` | `State` | Returns the start state. | | `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | | `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | | `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | | `is_accepted(&self, state: State)` | `bool` | Returns `true` if the given state is one of the accept states. | -| `is_cyclic(&self)` | `bool` | Returns `true` if the automaton contains at least one cycle. | | `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | | `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | | `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. 
| diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 02f825d..7c36bb3 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -7,20 +7,24 @@ impl FastAutomaton { pub fn get_cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); - } else if self.cyclic || self.is_total() { + } else if self.is_total() { return Cardinality::Infinite; } + + // A cycle means infinitely many strings. `topological_sorted_states` + // returns `None` exactly when the transition graph is cyclic and needs + // no determinism, so this also covers cyclic non-deterministic inputs. + let topologically_sorted_states = match self.topological_sorted_states() { + None => return Cardinality::Infinite, + Some(states) => states, + }; + + // The finite count below assumes deterministic (single-path) transitions. assert!( self.is_deterministic(), "The automaton should be deterministic." ); - let topologically_sorted_states = self.topological_sorted_states(); - if topologically_sorted_states.is_none() { - return Cardinality::Infinite; - } - let topologically_sorted_states = topologically_sorted_states.unwrap(); - let len = self.transitions.len(); let mut distances: IntMap = IntMap::with_capacity_and_hasher(len, BuildHasherDefault::default()); diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index effc6b7..5b1fb7c 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -16,7 +16,7 @@ impl FastAutomaton { // States that lie on some accepting path. Walking only these prunes // dead branches whose cycles cannot extend the language. 
- let live = self.get_reachable_states(); + let live = self.get_live_states(); if !live.contains(&self.start_state) { return (None, None); } diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index c9c2404..57b8783 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -127,8 +127,37 @@ impl FastAutomaton { true } - /// Returns the set of all states reachable from the start state. - pub fn get_reachable_states(&self) -> IntSet { + /// Returns the states reachable **from the start state** by following + /// non-empty transitions (the start state is always included). + /// + /// This is forward reachability. Contrast with [`Self::get_live_states`], + /// which returns the states that can **reach an accept state** + /// (co-reachability). + pub(crate) fn forward_reachable_states(&self) -> IntSet { + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + while let Some(s) = worklist.pop_front() { + for (condition, to_state) in self.transitions_from(s) { + if condition.is_empty() { + continue; + } + if visited.insert(*to_state) { + worklist.push_back(*to_state); + } + } + } + visited + } + + /// Returns the "live" (co-reachable) states: those that can **reach an + /// accept state** by following non-empty transitions. Computed by a reverse + /// traversal from the accept states. + /// + /// This is co-reachability — note it is *not* the set of states reachable + /// from the start state. + pub fn get_live_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); for from_state in self.states() { @@ -164,51 +193,19 @@ impl FastAutomaton { live } - /// Recomputes from the transition graph whether the automaton contains - /// a cycle. 
Use this to refresh the [`is_cyclic`](Self::is_cyclic) cache - /// after operations that don't maintain it. + /// Returns one [`Condition`] per base of the spanning set — including the + /// "rest" range when it is non-empty. /// - /// Kahn's algorithm: a directed graph has a cycle iff topological sort - /// fails to consume all nodes. O(V + E). - pub(crate) fn detect_cyclic(&self) -> bool { - let total = self.get_number_of_states(); - if total == 0 { - return false; - } - - let mut in_deg: IntMap = IntMap::default(); - for s in self.states() { - in_deg.entry(s).or_insert(0); - for t in self.direct_states(s) { - *in_deg.entry(t).or_insert(0) += 1; - } - } - - let mut queue: VecDeque = in_deg - .iter() - .filter(|(_, d)| **d == 0) - .map(|(s, _)| *s) - .collect(); - - let mut processed = 0usize; - while let Some(s) = queue.pop_front() { - processed += 1; - for t in self.direct_states(s) { - if let Some(d) = in_deg.get_mut(&t) { - *d -= 1; - if *d == 0 { - queue.push_back(t); - } - } - } - } - - processed != total - } - + /// The bases must partition the whole alphabet Σ: subset construction + /// ([`determinize`](Self::determinize)) and Hopcroft partitioning + /// ([`minimize`](Self::minimize)) iterate them and would otherwise silently + /// drop transitions whose condition lies in the "rest" range. (For a + /// spanning set with an empty rest this is exactly the spanning ranges, so + /// well-formed automata are unaffected.) 
pub fn get_spanning_bases(&self) -> Result, EngineError> { self.spanning_set - .get_spanning_ranges() + .get_spanning_ranges_with_rest() + .iter() .map(|range| Condition::from_range(range, &self.spanning_set)) .collect() } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index b0e6462..615c146 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -17,7 +17,6 @@ impl FastAutomaton { spanning_set: SpanningSet::new_empty(), deterministic: true, minimal: true, - cyclic: false, } } @@ -38,7 +37,6 @@ impl FastAutomaton { automaton.accept(automaton.start_state); automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); automaton.minimal = true; - automaton.cyclic = true; automaton } @@ -391,6 +389,11 @@ impl FastAutomaton { self.apply_model(&Self::new_total()) } + #[inline] + pub(crate) fn make_empty_string(&mut self) { + self.apply_model(&Self::new_empty_string()) + } + #[inline] pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { self.transitions = model.transitions.clone(); @@ -401,7 +404,6 @@ impl FastAutomaton { self.spanning_set = model.spanning_set.clone(); self.deterministic = model.deterministic; self.minimal = model.minimal; - self.cyclic = model.cyclic; } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 7cd1014..e8b4a60 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -33,7 +33,6 @@ pub struct FastAutomaton { spanning_set: SpanningSet, deterministic: bool, minimal: bool, - cyclic: bool, } /// Returned by [`FastAutomaton::try_add_transition`] when adding the requested @@ -260,12 +259,6 @@ impl FastAutomaton { self.minimal } - /// Returns `true` if the automaton contains at least one cycle. - #[inline] - pub fn is_cyclic(&self) -> bool { - self.cyclic - } - /// Returns `true` if the automaton contains the given state. 
#[inline] pub fn has_state(&self, state: State) -> bool { @@ -344,17 +337,6 @@ mod tests { Ok(()) } - // Regression: `new_total` constructs an automaton with a total self-loop - // on the start state. It must report `cyclic = true` from construction. - #[test] - fn new_total_reports_cyclic() { - let a = FastAutomaton::new_total(); - assert!( - a.is_cyclic(), - "new_total has a total self-loop on start, must be cyclic" - ); - } - // Regression: read-only query methods used to directly index // `self.transitions[state]` without checking `has_state` first, panicking // on out-of-range inputs. They now return gracefully (0 / None / empty diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index b0d4e44..9af1607 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -25,6 +25,19 @@ impl FastAutomaton { } pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + self.concat_mut_with(other, false) + } + + /// Concatenation where `force_no_merge` prevents merging `other`'s start + /// state into `self`'s accept states, always introducing a fresh start + /// state for `other` reached by epsilon transitions. Used by `repeat` to + /// keep accept states "clean" when they must remain accepting (so they do + /// not inherit the next copy's transitions). 
+ pub(crate) fn concat_mut_with( + &mut self, + other: &FastAutomaton, + force_no_merge: bool, + ) -> Result<(), EngineError> { ExecutionProfile::get() .assert_max_number_of_states(self.concat_state_count_heuristic(other))?; @@ -52,12 +65,13 @@ impl FastAutomaton { BuildHasherDefault::default(), ); - let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 - && self - .accept_states - .iter() - .cloned() - .any(|s| self.out_degree(s) > 0); + let start_state_and_accept_states_not_mergeable = force_no_merge + || (other.in_degree(other.start_state) > 0 + && self + .accept_states + .iter() + .cloned() + .any(|s| self.out_degree(s) > 0)); let accept_states = self.accept_states.iter().cloned().collect::>(); @@ -132,7 +146,6 @@ impl FastAutomaton { } } - self.cyclic = self.cyclic || other.cyclic; self.minimal = false; Ok(()) } diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index c8d6590..8060b8c 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -78,7 +78,46 @@ impl FastAutomaton { #[cfg(test)] mod tests { + use crate::CharRange; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + use crate::fast_automaton::spanning_set::SpanningSet; use crate::regex::RegularExpression; + use regex_charclass::char::Char; + + // Regression: subset construction iterates `get_spanning_bases`, which used + // to omit the spanning set's "rest" range. A transition whose condition + // lies in the rest range was therefore silently dropped, so determinizing a + // non-deterministic automaton that uses the rest range produced a DFA with + // the wrong (smaller) language. 
+ #[test] + fn determinize_keeps_rest_range_transitions() { + let rng = |c: char| { + let c = Char::new(c); + CharRange::new_from_range(c..=c) + }; + let ss = SpanningSet::compute_spanning_set(&[rng('a'), rng('b')]); + let rest = ss.get_rest().clone(); + + let mut a = FastAutomaton::new_empty(); + a.apply_new_spanning_set(&ss).unwrap(); + a.new_state(); + a.add_transition(0, 1, &Condition::from_range(&rest, &ss).unwrap()); // 0 -[^ab]-> 1 + a.add_transition(1, 0, &Condition::from_range(&rng('a'), &ss).unwrap()); + a.add_transition(1, 1, &Condition::from_range(&rng('a'), &ss).unwrap()); // nondeterministic + a.accept(1); + + assert!(!a.is_deterministic()); + assert!(a.is_match("\u{0}"), "a should accept a [^ab] character"); + + let d = a.determinize().unwrap(); + assert!(d.is_deterministic()); + assert!( + d.is_match("\u{0}"), + "determinize dropped the [^ab] transition" + ); + assert!(a.equivalent(&d).unwrap()); + } #[test] fn test_determinize_regex() -> Result<(), String> { diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 0d9b105..8d4f8cf 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -41,10 +41,6 @@ impl FastAutomaton { if self.in_degree(crash_state) == 1 { // Only the self-loop points to crash; nothing else needs it. self.remove_state(crash_state); - } else { - // crash_state has incoming edges from real states and a total - // self-loop, so the automaton now contains a cycle. - self.cyclic = true; } Ok(()) } @@ -85,36 +81,29 @@ mod tests { use crate::fast_automaton::FastAutomaton; use crate::regex::RegularExpression; - // Regression: `totalize` adds a `crash_state` with a total self-loop - // whenever the input isn't already total. That self-loop makes the - // automaton cyclic. The flag is now updated when the crash state - // survives. 
+ // `totalize` adds a `crash_state` with a total self-loop, so the complement + // of a finite language is infinite (it matches arbitrarily long strings via + // the crash-state loop). #[test] - fn complement_updates_cyclic_flag() { + fn complement_of_finite_is_infinite() { let mut a = RegularExpression::parse("abc", false) .unwrap() .to_automaton() .unwrap(); - assert!(!a.is_cyclic(), "precondition: 'abc' is acyclic"); a.complement().unwrap(); + assert!(!a.is_match("abc"), "complement must not match 'abc'"); assert!(a.is_match("x")); assert!(a.is_match("xx")); assert!(a.is_match("xxxxxxxxxx")); - - assert!( - a.is_cyclic(), - "complement of finite acyclic must be cyclic (crash self-loop)" - ); } - // Regression: empty.complement() = Σ* which is cyclic. Same root cause - // as above plus `new_total` flag fix. + // empty.complement() = Σ*. #[test] - fn complement_of_empty_is_cyclic() { + fn complement_of_empty_is_total() { let mut a = FastAutomaton::new_empty(); a.complement().unwrap(); - assert!(a.is_cyclic(), "Σ* must report cyclic"); + assert!(a.is_total(), "complement of ∅ must be Σ*"); } } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index ba60671..c4335e9 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -123,8 +123,7 @@ impl FastAutomaton { } } new_automaton.spanning_set = new_spanning_set; - new_automaton.remove_unreachable_states(); - new_automaton.cyclic = new_automaton.detect_cyclic(); + new_automaton.remove_dead_states(); Ok(Cow::Owned(new_automaton)) } @@ -214,28 +213,19 @@ impl FastAutomaton { mod tests { use crate::regex::RegularExpression; - // Regression: `intersection` builds its result from `new_empty()` - // (cyclic=false) and used to leave the flag at false even when the - // result contained cycles. `Term::difference` and `get_cardinality` both - // branch on `is_cyclic`; a stale false would route them incorrectly. 
- // Fixed by recomputing the flag at the end of `intersection_internal`. + // a* ∩ a* = a*: the intersection keeps the (infinite) looping language. #[test] - fn intersection_recomputes_cyclic_flag() { + fn intersection_keeps_infinite_language() { let a_star = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - assert!( - a_star.is_cyclic(), - "precondition: a* should be cyclic in the source automaton" - ); - // a* ∩ a* = a*, which is cyclic. let inter = a_star.intersection(&a_star).unwrap(); - assert!( - inter.is_cyclic(), - "intersection of two cyclic automata accepting a* should be cyclic" - ); + assert!(inter.is_match("")); + assert!(inter.is_match("aaaaaaaa")); + assert!(!inter.is_match("b")); + assert!(inter.equivalent(&a_star).unwrap()); } #[test] diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs index 6b7c12a..b676864 100644 --- a/src/fast_automaton/operation/minimize.rs +++ b/src/fast_automaton/operation/minimize.rs @@ -9,6 +9,19 @@ impl FastAutomaton { *self = self.determinize()?.into_owned(); } + // Drop states unreachable from the start. A minimal automaton has none, + // and downstream invariants rely on it — in particular `is_empty`'s + // fast path treats any minimal automaton with an accept state as + // non-empty, which only holds if every accept state is reachable. 
+ let reachable = self.forward_reachable_states(); + let unreachable: IntSet = self + .states() + .filter(|s| !reachable.contains(s)) + .collect(); + if !unreachable.is_empty() { + self.remove_states(&unreachable); + } + let max_states = self.transitions.len(); let all_states: IntSet = self.states().collect(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 72a1380..cf3274d 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -11,13 +11,16 @@ mod repeat; mod union; impl FastAutomaton { - pub fn remove_unreachable_states(&mut self) { + /// Removes "dead" states — those that cannot reach any accept state — since + /// they never contribute to the language. If the language is empty the whole + /// automaton collapses to the canonical empty automaton. + pub fn remove_dead_states(&mut self) { if !self.is_empty() { - let reacheable_states = self.get_reachable_states(); + let live_states = self.get_live_states(); let mut dead_states = IntSet::default(); for from_state in self.states() { - if !reacheable_states.contains(&from_state) { + if !live_states.contains(&from_state) { dead_states.insert(from_state); } } @@ -44,7 +47,7 @@ mod tests { .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_reachable_states().len()); + assert_eq!(3, intersection.get_live_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index ddabaf7..6045ef0 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -22,6 +22,23 @@ impl FastAutomaton { return Ok(()); } + // r⁰ = {""} for any language (max == 0 implies min == 0 here, since + // min > max already returned above). 
Without this, the general path + // below would leave the original language reachable and return + // L ∪ {""} instead of just {""}. + if max_opt == Some(0) { + self.make_empty_string(); + return Ok(()); + } + + // The empty-string language is a fixpoint of repetition: {""}{m,n} = {""} + // for any valid m ≤ n. Returning early also avoids the unbounded + // construction below, whose single-state "tight loop" branch would + // otherwise try to remove the start state and panic. + if self.is_empty_string() { + return Ok(()); + } + // Empty language: ∅⁰ = {""}, ∅ⁿ = ∅ for n ≥ 1. The general algorithm // below assumes at least one accept state when installing loop-backs // for unbounded repeats; bail out here before it can panic. @@ -68,48 +85,67 @@ impl FastAutomaton { } if max_opt.is_none() { - let mut automaton_to_repeat = automaton_to_repeat.clone(); - - let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); - if automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 - { - automaton_to_repeat - .add_epsilon_transition(accept_state, automaton_to_repeat.start_state); - let old_start_state = automaton_to_repeat.start_state; - automaton_to_repeat.start_state = accept_state; - automaton_to_repeat.remove_state(old_start_state); - } else { - let t = Self::transitions_from_state_set( - &automaton_to_repeat.transitions, - automaton_to_repeat.start_state, - ); - let transitions = - Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); - - for state in automaton_to_repeat.accept_states.clone() { - for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition(state, *to_state, condition); + if min == 0 { + // r* with a start state that has no incoming edges (the + // in_degree > 0 case already returned above): loop the single + // copy in place by letting each accept state re-enter the + // 
start, and make the start accepting. + let mut star = automaton_to_repeat.clone(); + + let accept_state = *star.accept_states.iter().next().unwrap(); + if star.accept_states.len() == 1 + && star.out_degree(accept_state) == 0 + && star.in_degree(star.start_state) == 0 + { + star.add_epsilon_transition(accept_state, star.start_state); + let old_start_state = star.start_state; + star.start_state = accept_state; + star.remove_state(old_start_state); + } else { + let t = Self::transitions_from_state_set(&star.transitions, star.start_state); + let transitions = + Self::transitions_from_state_enumerate(&t, &star.removed_states); + + for state in star.accept_states.clone() { + for &(to_state, condition) in &transitions { + star.add_transition(state, *to_state, condition); + } } - } - automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); - } - automaton_to_repeat.cyclic = true; + star.accept(star.get_start_state()); + } - if min == 0 { - self.apply_model(&automaton_to_repeat); + self.apply_model(&star); } else { - self.concat_mut(&automaton_to_repeat)?; + // r{min,} = rᵐⁱⁿ · r*. Build the star part via recursion rather + // than looping `automaton_to_repeat` in place: when the start + // state has incoming edges, `repeat(0, None)` introduces a + // clean accepting start instead of marking the looping start + // accepting, which would otherwise accept partial copies + // (e.g. `(a*b)+` matching "aaba"). + let star = automaton_to_repeat.repeat(0, None)?; + self.concat_mut(&star)?; } return Ok(()); } + // Finite maximum: append the optional copies one at a time, keeping + // `self` with a single accept frontier so the chain stays linear, and + // collect each copy boundary in `end_states` to mark accepting at the + // end (stopping after any copy in `min..=max` is valid). 
+ // + // When the copy's start state has incoming edges, merging it into the + // previous copy's accept state would let that (re-marked accepting) + // junction inherit the copy's own transitions and accept partial + // copies (e.g. `(a*b){1,3}` matching "ba"). In that case we force a + // non-merging concatenation so each boundary is a clean accept state + // reached by an epsilon transition. + let force_no_merge = + automaton_to_repeat.in_degree(automaton_to_repeat.start_state) > 0; let mut end_states = self.accept_states.iter().cloned().collect::>(); for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat_mut(&automaton_to_repeat)?; + self.concat_mut_with(&automaton_to_repeat, force_no_merge)?; end_states.extend(self.accept_states.iter()); } self.accept_states.extend(end_states); @@ -129,6 +165,11 @@ impl FastAutomaton { return 0; } + // 1b. r⁰ = {""} (a single state); see `repeat_mut`. + if max_opt == Some(0) { + return 1; + } + let v_original = self.get_number_of_states(); if v_original == 0 { return 0; @@ -163,40 +204,56 @@ impl FastAutomaton { // 5. Infinite repetition (max_opt is None) if max_opt.is_none() { - let mut v_modified = v_original; - let mut mod_start_in_deg_gt_0 = in_deg_start; - let acc_out_gt_0 = self.accept_states.iter().any(|&s| self.out_degree(s) > 0); - - // Check if it triggers the start-state removal optimization block - if self.accept_states.len() == 1 { - let accept_state = *self.accept_states.iter().next().unwrap(); - if self.out_degree(accept_state) == 0 && !in_deg_start { - // The old start state is removed in the cloned automaton - v_modified -= 1; - mod_start_in_deg_gt_0 = self.in_degree(accept_state) > 0; - } - } - if min == 0 { + // In-place looped r*: a single accept state with no outgoing + // edges and an incoming-edge-free start drops the old start + // state (`v_original - 1`); otherwise the state count is + // unchanged (the `min == 0 && in_deg_start` case already + // returned in step 2). 
+ let mut v_modified = v_original; + if self.accept_states.len() == 1 { + let accept_state = *self.accept_states.iter().next().unwrap(); + if self.out_degree(accept_state) == 0 && !in_deg_start { + v_modified -= 1; + } + } return v_modified; } else { - // Calculate the final virtual concatenation cost manually since - // we can't pass a "virtually modified" automaton to concat_state_count_heuristic - let final_concat_cost = if mod_start_in_deg_gt_0 && acc_out_gt_0 { - v_modified - } else { - v_modified.saturating_sub(1) - }; - return current_states + final_concat_cost; + // r{min,} = rᵐⁱⁿ · r*. `current_states` already accounts for the + // rᵐⁱⁿ part. The star r* = repeat(0, None) is independent of + // `min` and small, so build it to obtain its exact contribution + // under the merging concatenation onto rᵐⁱⁿ (whose accept states + // carry outgoing edges iff `acc_out_gt_0`). + let acc_out_gt_0 = self.accept_states.iter().any(|&s| self.out_degree(s) > 0); + match self.repeat(0, None) { + Ok(star) => { + let star_states = star.get_number_of_states(); + let not_mergeable = + star.in_degree(star.start_state) > 0 && acc_out_gt_0; + let final_concat_cost = if not_mergeable { + star_states + } else { + star_states.saturating_sub(1) + }; + return current_states + final_concat_cost; + } + Err(_) => return current_states, + } } } // 6. Finite maximum repetition loop + // + // The mandatory copies (handled above) merge as plain `r`. Each + // optional tail copy merges as well (`v - 1` new states), except when + // the start state has an incoming edge: the non-merging concatenation + // then introduces a fresh start state, costing `v` per copy. 
let max = max_opt.unwrap(); let loop_start = if min > 1 { min } else { 1 }; let max_iters = max.saturating_sub(loop_start); - current_states += max_iters as usize * concat_cost; + let optional_states = v_original + if in_deg_start { 1 } else { 0 }; + current_states += max_iters as usize * (optional_states - 1); current_states } @@ -207,17 +264,10 @@ mod tests { use crate::fast_automaton::FastAutomaton; use crate::regex::RegularExpression; - // BUG: `repeat(0, Some(0))` on a non-empty language returns L ∪ {""} - // instead of just {""}. After the (effectively no-op) main loop, the - // code unconditionally calls `accept(start_state)` when min == 0, - // adding "" to the language WITHOUT first reducing the automaton to - // {""}. The result is the union of the original language and the - // empty string. - // - // Repro: "abc".repeat(0, Some(0)) should match "" only; it currently - // also matches "abc". + // Regression: `repeat(0, Some(0))` on a non-empty language used to return + // L ∪ {""} instead of just {""} — the general path left the original + // language reachable and only made the start accepting. r⁰ must be {""}. #[test] - #[ignore = "known bug: repeat(0, 0) returns L ∪ {\"\"} instead of {\"\"}"] fn bug_repeat_zero_zero_on_non_empty() { let a = RegularExpression::parse("abc", false) .unwrap() @@ -231,10 +281,31 @@ mod tests { ); } + // Regression: repeating the empty-string automaton ({""}) used to reach the + // unbounded "tight loop" branch, which removed the single state while it was + // still the start state and panicked. {""} is a fixpoint of repetition, so + // every bound must return {""} without panicking. 
+ #[test] + fn repeat_of_empty_string_is_fixpoint() { + let empty_string = FastAutomaton::new_empty_string(); + for (min, max) in [ + (0, None), + (1, None), + (3, None), + (0, Some(1)), + (2, Some(5)), + (0, Some(0)), + ] { + let r = empty_string.repeat(min, max).unwrap(); + assert!(r.is_match(""), "{{\"\"}}{{{min},{max:?}}} must match \"\""); + assert!(!r.is_match("a"), "{{\"\"}}{{{min},{max:?}}} must match only \"\""); + } + } + // Regression: empty.repeat(_, None) used to panic on - // `accept_states.iter().next().unwrap()` at repeat.rs:63 because the - // unbounded-repeat branch assumed at least one accept state. Language - // theory: ∅* = {""} and ∅⁺ = ∅; both must be returnable without panic. + // `accept_states.iter().next().unwrap()` because the unbounded-repeat + // branch assumed at least one accept state. Language theory: ∅* = {""} + // and ∅⁺ = ∅; both must be returnable without panic. #[test] fn empty_repeat_unbounded_does_not_panic() { let empty = FastAutomaton::new_empty(); @@ -266,6 +337,9 @@ mod tests { #[test] fn test_heuristic() -> Result<(), String> { + assert_heuristic("b*a"); + assert_heuristic("a*b"); + assert_heuristic("ba*"); assert_heuristic(".{900}"); assert_heuristic("[a-z]+"); assert_heuristic("[a-z]+@"); diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 4fed407..e3162d7 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -61,9 +61,11 @@ impl FastAutomaton { condition_converter: &ConditionConverter, ) -> Result, EngineError> { let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - if other.is_accepted(other.start_state) { - self.accept(self.start_state); - } + // If `other` accepts the empty string we must make the union's *entry* + // state accepting — but only after the start state is finalized below. + // Marking the current start eagerly is wrong when it has incoming edges + // (e.g. 
a self-loop) and is about to be demoted behind a fresh start: + // the demoted state would then wrongly accept the strings on its loop. let self_start_state_in_degree = self.in_degree(self.start_state); let other_start_state_in_degree = other.in_degree(other.start_state); if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { @@ -103,6 +105,13 @@ impl FastAutomaton { } } } + // Now that `self.start_state` is the final entry state, record `other`'s + // empty-string acceptance there. `self`'s own empty-string acceptance is + // preserved by the start handling above (a freshly created start + // inherits it through the epsilon transition). + if other.is_accepted(other.start_state) { + self.accept(self.start_state); + } Ok(imcomplete_states) } @@ -114,7 +123,14 @@ impl FastAutomaton { ) { let mut self_accept_states_without_outgoing_edges = vec![]; for &state in &self.accept_states { - if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { + // The start state must never be a merge candidate: the n > 1 + // branch below removes the merged states, and removing the start + // state panics (e.g. an accepting start with no outgoing edges, + // unioned with an operand whose start has incoming edges). + if self.out_degree(state) == 0 + && !imcomplete_states.contains(&state) + && state != self.start_state + { self_accept_states_without_outgoing_edges.push(state); } } @@ -137,18 +153,26 @@ impl FastAutomaton { }; for &state in &other.accept_states { - match accept_state_without_outgoing_edges { + // Resolve the self-state that represents `state`, allocating one if + // it is not mapped yet, then mark it accepting. The accept flag must + // be applied even when `state` was already mapped during + // `prepare_start_states` (e.g. a start state with incoming edges + // whose outgoing edges reach this accept state); otherwise the + // union would silently drop `other`'s acceptance. 
+ let mapped = match accept_state_without_outgoing_edges { Some(accept_state) if other.out_degree(state) == 0 => { - new_states.entry(state).or_insert(accept_state); + *new_states.entry(state).or_insert(accept_state) } - _ => { - if new_states.get(&state).is_none() { + _ => match new_states.get(&state) { + Some(&mapped) => mapped, + None => { let new_accept_state = self.new_state(); - self.accept(new_accept_state); new_states.insert(state, new_accept_state); + new_accept_state } - } - } + }, + }; + self.accept(mapped); } } @@ -205,7 +229,6 @@ impl FastAutomaton { self.add_transition(new_from_state, new_to_state, &new_condition); } } - self.cyclic = self.cyclic || other.cyclic; self.minimal = false; Ok(()) } @@ -255,12 +278,15 @@ impl FastAutomaton { self_accepts.insert(self.start_state); } - let case_a = self_in == 0 && other_in == 0; let mut n = 0; for &state in &self_accepts { - let is_incomplete = case_a && state == self.start_state; - if self.out_degree(state) == 0 && !is_incomplete { + // Mirror `prepare_accept_states`: a state that is (still) the + // start after the start-state phase is never a merge candidate. + // When `self_in != 0` the original start gets demoted behind a + // fresh start, so it *does* participate. + let is_excluded = self_in == 0 && state == self.start_state; + if self.out_degree(state) == 0 && !is_excluded { n += 1; } } @@ -287,7 +313,96 @@ impl FastAutomaton { #[cfg(test)] mod tests { - use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; + use crate::{Term, fast_automaton::FastAutomaton, regex::RegularExpression}; + + // Regression: unioning with the empty-string language used to drop the + // other operand's acceptance. 
When `other`'s start state has incoming edges + // its outgoing edges (and the accept states they reach) are mapped during + // `prepare_start_states`; `prepare_accept_states` then failed to mark those + // already-mapped images accepting, so `union({""}, "a+")` matched only "" + // instead of "" and "a", "aa", ... + #[test] + fn union_with_empty_string_keeps_other_accepts() { + let empty_string = RegularExpression::parse("", false) + .unwrap() + .to_automaton() + .unwrap(); + let a_plus = RegularExpression::parse("a+", false) + .unwrap() + .to_automaton() + .unwrap(); + + let u = empty_string.union(&a_plus).unwrap(); + assert!(u.is_match(""), "union must keep \"\""); + assert!(u.is_match("a"), "union dropped the other operand's language"); + assert!(u.is_match("aaa")); + + // It must be equivalent regardless of operand order. + let u2 = a_plus.union(&empty_string).unwrap(); + assert!(Term::from_automaton(u).equivalent(&Term::from_automaton(u2)).unwrap()); + } + + // Regression: `prepare_accept_states` merges accept states without + // outgoing edges and removes the originals. When `self`'s accepting start + // (no outgoing edges) met an operand whose start has incoming edges, the + // start landed in the merge list and `remove_state(start)` panicked. + #[test] + fn union_does_not_remove_accepting_start() { + use crate::CharRange; + use crate::fast_automaton::condition::Condition; + use crate::fast_automaton::spanning_set::SpanningSet; + use regex_charclass::char::Char; + + let rng = |c: char| { + let c = Char::new(c); + CharRange::new_from_range(c..=c) + }; + let ss = SpanningSet::compute_spanning_set(&[rng('a'), rng('b')]); + + // a: two accepting states without outgoing edges, one being the start. + let mut a = FastAutomaton::new_empty(); + a.apply_new_spanning_set(&ss).unwrap(); + a.new_state(); + a.accept(0); + a.accept(1); + + // b: start has an incoming edge (1 -a-> 0) but no outgoing edges. 
+ let mut b = FastAutomaton::new_empty(); + b.apply_new_spanning_set(&ss).unwrap(); + b.new_state(); + b.add_transition(1, 0, &Condition::from_range(&rng('a'), &ss).unwrap()); + b.accept(0); + + let u = a.union(&b).unwrap(); // used to panic + assert!(u.is_match(""), "union must keep the empty string"); + } + + // Regression: unioning a language whose start state has a self-loop with the + // empty string used to mark that looping start accepting, so `a*b | ""` + // wrongly matched "a", "aa", ... The empty-string acceptance must land on + // the union's entry state, not on a demoted looping state. + #[test] + fn union_with_empty_string_does_not_over_accept() { + let a_star_b = RegularExpression::parse("a*b", false) + .unwrap() + .to_automaton() + .unwrap(); + let empty_string = RegularExpression::parse("", false) + .unwrap() + .to_automaton() + .unwrap(); + + let u = a_star_b.union(&empty_string).unwrap(); + assert!(u.is_match(""), "(a*b)? must match \"\""); + assert!(u.is_match("b")); + assert!(u.is_match("ab")); + assert!(u.is_match("aab")); + assert!( + !u.is_match("a"), + "union wrongly accepted 'a' (looping start marked accepting)" + ); + assert!(!u.is_match("aa")); + } #[test] fn test_simple_alternation_regex_1() -> Result<(), String> { diff --git a/src/lib.rs b/src/lib.rs index 123daa4..2ccfe0c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -295,8 +295,7 @@ impl Term { pub fn difference(&self, other: &Term) -> Result { let minuend_automaton = self.to_automaton()?; let subtrahend_automaton = other.to_automaton()?; - let subtrahend_automaton = - Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; + // `FastAutomaton::difference` determinizes the subtrahend itself. 
let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; Ok(Term::Automaton(return_automaton)) @@ -317,8 +316,8 @@ impl Term { /// assert!(term.union(&[complement]).unwrap().is_total().unwrap()); /// ``` pub fn complement(&self) -> Result { - let automaton = self.to_automaton()?; - let mut automaton = automaton.determinize()?.into_owned(); + // `FastAutomaton::complement` determinizes `self` itself. + let mut automaton = self.to_automaton()?.into_owned(); automaton.complement()?; Ok(Term::Automaton(automaton)) @@ -399,11 +398,9 @@ impl Term { if !return_stable_term || automaton.is_deterministic() { Ok((None, self.to_automaton()?.generate_strings(limit, offset)?)) } else { + // `minimize` determinizes first, yielding the deterministic, + // minimal "stable" automaton. let mut automaton = automaton.into_owned(); - if !automaton.is_deterministic() { - automaton = automaton.determinize()?.into_owned(); - } - if !automaton.is_minimal() { automaton.minimize()?; } @@ -530,24 +527,6 @@ impl Term { self.to_regex().to_string() } - fn determinize_subtrahend<'a>( - minuend: &FastAutomaton, - subtrahend: &'a FastAutomaton, - ) -> Result, EngineError> { - if subtrahend.is_deterministic() { - Ok(Cow::Borrowed(subtrahend)) - } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned( - minuend - .intersection(subtrahend)? - .determinize()? - .into_owned(), - )) - } else { - Ok(subtrahend.determinize()?) 
- } - } - fn get_automata<'a>( &'a self, terms: &'a [Term], diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index f5d1975..57593a9 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -224,13 +224,16 @@ mod tests { let cardinality = regex.get_cardinality(); - let mut automaton = regex.to_automaton().unwrap(); - - if !automaton.is_cyclic() { - automaton = automaton.determinize().unwrap().into_owned(); - } - - //automaton.to_dot(); + let automaton = regex.to_automaton().unwrap(); + // `get_cardinality` needs a DFA for an exact finite count, but returns + // `Infinite` for cyclic automata without requiring determinism. Only + // determinize the finite (bounded-length) ones — determinizing a large + // cyclic automaton can blow up. + let automaton = if automaton.get_length().1.is_some() { + automaton.determinize().unwrap().into_owned() + } else { + automaton + }; let expected = automaton.get_cardinality(); diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 8325456..36ebff2 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -17,7 +17,7 @@ impl AbstractStateMetadata { } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct AbstractNFAMetadata { start: AbstractStateMetadata, accepted: Vec, @@ -69,6 +69,23 @@ impl AbstractNFAMetadata { } pub(crate) fn repeat(&self, min: u32, max_opt: &Option) -> Self { + // r⁰ = {""} (the empty-string automaton, a single state). + if max_opt == &Some(0) { + return Self::new_empty_string(); + } + + // Unbounded with min >= 1: `repeat_mut` builds r{min,} = rᵐⁱⁿ · r*. + // Mirror that here (mandatory copies via merging concatenation, then a + // recursively-built star) so the predicted count stays consistent with + // the construction even when the start state has incoming edges. 
+ if max_opt.is_none() && min >= 1 { + let mut acc = self.clone(); + for _ in 1..min { + acc = acc.concat(self); + } + return acc.concat(&self.repeat(0, &None)); + } + let start_state_not_mergeable = self.start.has_incoming_edges; let accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); let start_state_or_accept_states_not_mergeable = @@ -105,13 +122,30 @@ impl AbstractNFAMetadata { } let return_number_of_states = if let Some(max) = max_opt { - let mult = if start_state_not_mergeable && (accepted_not_mergeable || min == 0) { + // Mirror `repeat_mut`: rᵐⁱⁿ mandatory copies built by merging + // concatenation, then `max - max(min,1)` optional tail copies. A + // tail copy whose start has incoming edges is concatenated without + // merging (a fresh start state, so +`number_of_states`); otherwise + // it merges (+`number_of_states - 1`). + let max = *max as usize; + let merge_cost = if start_state_not_mergeable && accepted_not_mergeable { + self.number_of_states + } else { + self.number_of_states - 1 + }; + let tail_cost = if start_state_not_mergeable { self.number_of_states } else { self.number_of_states - 1 }; - *max as usize * mult + 1 + if min == 0 { + let base = self.number_of_states + if start_state_not_mergeable { 1 } else { 0 }; + base + max.saturating_sub(1) * tail_cost + } else { + let mandatory = self.number_of_states + (min as usize - 1) * merge_cost; + mandatory + max.saturating_sub(min as usize) * tail_cost + } } else { let mult = if start_state_not_mergeable { self.number_of_states @@ -222,6 +256,13 @@ mod tests { assert_number_of_states_in_nfa("(b*a){5,26}"); assert_number_of_states_in_nfa("(ba*){5,26}"); + // Unbounded with min >= 1 over a self-looping start (r{min,} = rᵐⁱⁿ·r*). 
+ assert_number_of_states_in_nfa("(b*a){1,}"); + assert_number_of_states_in_nfa("(b*a){2,}"); + assert_number_of_states_in_nfa("(b*a){5,}"); + assert_number_of_states_in_nfa("(a*b){1,}"); + assert_number_of_states_in_nfa("(a*b){3,}"); + assert_number_of_states_in_nfa(""); assert_number_of_states_in_nfa("toto"); assert_number_of_states_in_nfa("A+B*"); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index 5156ce8..bb3f73c 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -6,28 +6,12 @@ impl RegularExpression { match self { RegularExpression::Character(_) => self.clone(), RegularExpression::Repetition(regex, min, max_opt) => { - let regex = regex.simplify(); - match regex { - RegularExpression::Repetition( - simplified_regex, - simplified_min, - simplified_max_opt, - ) => { - let new_max = if let (Some(max), Some(simplified_max)) = - (max_opt, simplified_max_opt) - { - Some(max * simplified_max) - } else { - None - }; - RegularExpression::Repetition( - simplified_regex, - min * simplified_min, - new_max, - ) - } - _ => RegularExpression::Repetition(Box::new(regex), *min, *max_opt), - } + // Delegate to `repeat`, which guards the nested-repetition + // collapse with `can_simplify_nested_repetition`. Collapsing + // `(r{a,b}){c,d}` to `r{a*c,b*d}` unconditionally is unsound + // when the step lengths leave a gap (e.g. `(a{3,4}){1,2}` + // would wrongly widen to `a{3,8}`). + regex.simplify().repeat(*min, *max_opt) } RegularExpression::Concat(elements) => { let elements: VecDeque<_> = diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index ee6abee..fe37ae6 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -279,7 +279,20 @@ impl RegularExpression { ); } } else { - return this_regex.repeat(cmp::min(*this_min, *that_min), None); + // At least one side is unbounded. 
The union collapses to + // r{min(m1,m2),} only when the ranges overlap or are + // adjacent — i.e. the unbounded side starts no later than + // one past the bounded side's end. Otherwise there is a + // gap (e.g. a? ∪ a{3,} must NOT become a*). + let mergeable = match (this_max_opt, that_max_opt) { + (None, None) => true, + (Some(this_max), None) => *that_min <= this_max.saturating_add(1), + (None, Some(that_max)) => *this_min <= that_max.saturating_add(1), + (Some(_), Some(_)) => unreachable!("handled above"), + }; + if mergeable { + return this_regex.repeat(cmp::min(*this_min, *that_min), None); + } } } @@ -353,6 +366,31 @@ impl RegularExpression { mod tests { use super::*; + // Regression: with an unbounded side, the repetition merge used to fire + // unconditionally, so `a? ∪ a{3,}` collapsed to `a*` even though `a{2}` is + // in neither operand. The merge is only sound when the unbounded range + // starts no later than one past the bounded range's end. + #[test] + fn union_does_not_merge_gapped_repetitions() { + let union = |x: &str, y: &str| { + RegularExpression::parse(x, false) + .unwrap() + .union(&RegularExpression::parse(y, false).unwrap()) + .to_string() + }; + + // Gapped: must stay alternations. + assert_eq!("(a?|a{3,})", union("a?", "a{3,}")); + assert_eq!("(a?|a{3,})", union("a{3,}", "a?")); + assert_eq!("(a{2}|a{5,})", union("a{2}", "a{5,}")); + + // Overlapping or adjacent: still merge. + assert_eq!("a*", union("a?", "a{2,}")); + assert_eq!("a{2,}", union("a{2}", "a{3,}")); + assert_eq!("a*", union("a*", "a{3,}")); + assert_eq!("a{3,}", union("a{3,}", "a{5,}")); + } + #[test] fn test_union() -> Result<(), String> { assert_union("(a+|a+b)", "a+b?"); diff --git a/tests/proptest_strategies.rs b/tests/proptest_strategies.rs new file mode 100644 index 0000000..dbf9bd7 --- /dev/null +++ b/tests/proptest_strategies.rs @@ -0,0 +1,566 @@ +//! Property-based tests built on `proptest` strategies that generate random +//! DFAs, NFAs and regular expressions. 
+//! +//! # Coverage guarantee +//! +//! The strategies are parameterized by a fixed finite alphabet ([`ALPHABET`]) +//! and a maximum number of states ([`MAX_STATES`]). Within those bounds every +//! structure has a strictly positive probability of being generated: +//! +//! * [`arb_dfa`] — every deterministic automaton over the alphabet with +//! `1..=MAX_STATES` states (start state fixed to `0`, any accepting subset, +//! any total transition function) can be produced. A transition function maps +//! each `(state, letter)` to at most one target, which is exactly the +//! definition of a DFA, so the whole DFA space is covered. +//! * [`arb_nfa`] — every nondeterministic automaton is reachable: each ordered +//! `(from, to)` pair may carry any subset of the alphabet letters as its +//! label, plus optional epsilon transitions. Since labels may overlap, this +//! spans all NFAs (and, as a subset, all DFAs). +//! * [`arb_regex`] — every regular expression built from the four +//! [`RegularExpression`] variants up to the configured recursion depth and +//! bound sizes is reachable, including the empty language (`[]`) and `.`. +//! +//! The alphabet is small on purpose so the spaces are finite and the +//! determinization / set operations under test stay cheap. +//! +//! While *coverage* is uniform-in-support, the *distribution* is deliberately +//! shaped: per-automaton edge/epsilon/accept densities are sampled and the +//! character-class strategy favors single letters, so that degenerate +//! languages (∅, {""}) and near-complete blobs are occasional rather than +//! dominant. All weights stay strictly inside (0, 1), preserving the +//! non-null-probability guarantee. 
+ +use proptest::prelude::*; +use regex_charclass::char::Char; +use regexsolver::CharRange; +use regexsolver::error::EngineError; +use regexsolver::execution_profile::ExecutionProfileBuilder; +use regexsolver::fast_automaton::FastAutomaton; +use regexsolver::fast_automaton::condition::Condition; +use regexsolver::fast_automaton::spanning_set::SpanningSet; +use regexsolver::regex::RegularExpression; + +/// Fixed alphabet the strategies draw transition labels from. +pub const ALPHABET: &[char] = &['a', 'b']; + +/// Maximum number of states a generated automaton can have. +pub const MAX_STATES: usize = 4; + +/// The single-character range for the `i`-th alphabet letter. +fn letter(i: usize) -> CharRange { + let c = Char::new(ALPHABET[i]); + CharRange::new_from_range(c..=c) +} + +/// The spanning set induced by the alphabet (one base per letter + a "rest"). +fn spanning_set() -> SpanningSet { + let ranges: Vec = (0..ALPHABET.len()).map(letter).collect(); + SpanningSet::compute_spanning_set(&ranges) +} + +/// The transition-label bases: one [`CharRange`] per alphabet letter. +/// +/// We deliberately exclude the spanning set's "rest" range. The spanning set's +/// contract is that the rest holds exactly the characters that **no** transition +/// uses, so a label may only be a subset of the alphabet letters; that is also +/// precisely the standard "automaton over Σ" model. +fn bases(ss: &SpanningSet) -> Vec { + let _ = ss; + (0..ALPHABET.len()).map(letter).collect() +} + +/// Number of transition-label bases (one per alphabet letter). +fn num_bases() -> usize { + ALPHABET.len() +} + +/// Number of states, biased toward larger automata (the maximum of two uniform +/// draws). Tiny automata are still generated, but the n = 1 space is almost +/// entirely degenerate, so uniform sampling would waste a quarter of all cases +/// on it. 
+fn arb_num_states() -> impl Strategy { + (1usize..=MAX_STATES, 1usize..=MAX_STATES).prop_map(|(a, b)| a.max(b)) +} + +/// Builds an automaton from a structural description using only the public API. +/// +/// `accepts[s]` marks state `s` accepting; each `(from, to, mask)` adds a +/// transition whose label is the union of the bases selected by `mask`; each +/// `(from, to)` in `eps` adds an epsilon transition. The start state is `0`. +fn build( + n: usize, + accepts: &[bool], + char_edges: &[(usize, usize, Vec)], + eps: &[(usize, usize)], +) -> FastAutomaton { + let ss = spanning_set(); + let bs = bases(&ss); + + let mut a = FastAutomaton::new_empty(); + // `new_empty` already owns state 0; the spanning set is applied before any + // transition so the conditions we add line up with it. + a.apply_new_spanning_set(&ss) + .expect("applying a spanning set to an empty automaton never fails"); + for _ in 1..n { + a.new_state(); + } + + for (s, &acc) in accepts.iter().enumerate() { + if acc { + a.accept(s); + } + } + + for (from, to, mask) in char_edges { + let mut range = CharRange::empty(); + for (i, &on) in mask.iter().enumerate() { + if on { + range = range.union(&bs[i]); + } + } + if range.is_empty() { + continue; + } + let cond = Condition::from_range(&range, &ss) + .expect("a union of full spanning bases is always a valid condition"); + a.add_transition(*from, *to, &cond); + } + + // Epsilon transitions are added last: `add_epsilon_transition` eagerly + // folds the target's current transitions into the source. + for (from, to) in eps { + a.add_epsilon_transition(*from, *to); + } + + a +} + +/// Strategy producing every DFA over the alphabet with `1..=MAX_STATES` states. +/// +/// Determinism is structural: for each state and each letter we choose at most +/// one target, so transitions leaving a state always carry disjoint labels. 
+/// +/// An edge density and an accept density are sampled per automaton (both +/// bounded away from 0 and 1, so every DFA keeps a positive probability). +/// Mostly-total transition functions keep the states connected, which makes +/// degenerate (∅ / {""}) languages the exception rather than the rule. +pub fn arb_dfa() -> impl Strategy { + (arb_num_states(), 0.6f64..0.97, 0.4f64..0.9) + .prop_flat_map(|(n, edge_density, accept_density)| { + let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // transition function: tf[state][base] = optional target state + let tf = prop::collection::vec( + prop::collection::vec( + prop::option::weighted(edge_density, 0usize..n), + num_bases(), + ), + n, + ); + (Just(n), accepts, tf) + }) + .prop_map(|(n, accepts, tf)| { + let nb = num_bases(); + let mut edges: Vec<(usize, usize, Vec)> = Vec::new(); + for (from, row) in tf.iter().enumerate() { + // Group the bases by their chosen target so each (from, to) + // edge gets a single, disjoint-from-its-siblings label. + let mut by_target: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); + for (base, target) in row.iter().enumerate() { + if let Some(t) = target { + by_target.entry(*t).or_insert_with(|| vec![false; nb])[base] = true; + } + } + for (to, mask) in by_target { + edges.push((from, to, mask)); + } + } + build(n, &accepts, &edges, &[]) + }) +} + +/// Strategy producing every NFA over the alphabet with `1..=MAX_STATES` states +/// (overlapping labels allowed, plus optional epsilon transitions). +/// +/// Instead of a fixed per-bit probability (which blobs large automata and +/// starves small ones), a per-state branching target is sampled and converted +/// into a bit density of `target / (n · |Σ|)`, so the *local* structure is +/// comparable across sizes. Epsilon and accept densities are sampled too. All +/// densities stay strictly inside (0, 1), so every NFA keeps a positive +/// probability. 
+pub fn arb_nfa() -> impl Strategy<Value = FastAutomaton> { + ( + arb_num_states(), + 1.0f64..2.8, + 0.02f64..0.18, + 0.4f64..0.9, + ) + .prop_flat_map(|(n, target_out_degree, eps_density, accept_density)| { + let label_density = + (target_out_degree / (n as f64 * num_bases() as f64)).clamp(0.02, 0.95); + let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // labels[from][to] = mask over the alphabet letters + let labels = prop::collection::vec( + prop::collection::vec( + prop::collection::vec(prop::bool::weighted(label_density), num_bases()), + n, + ), + n, + ); + // eps[from][to] = whether an epsilon transition is present + let eps = prop::collection::vec( + prop::collection::vec(prop::bool::weighted(eps_density), n), + n, + ); + (Just(n), accepts, labels, eps) + }) + .prop_map(|(n, accepts, labels, eps)| { + let mut char_edges = Vec::new(); + for (from, row) in labels.iter().enumerate() { + for (to, mask) in row.iter().enumerate() { + if mask.iter().any(|&b| b) { + char_edges.push((from, to, mask.clone())); + } + } + } + let mut eps_edges = Vec::new(); + for (from, row) in eps.iter().enumerate() { + for (to, &on) in row.iter().enumerate() { + if on && from != to { + eps_edges.push((from, to)); + } + } + } + build(n, &accepts, &char_edges, &eps_edges) + }) +} + +/// Strategy for a character class: any subset of the alphabet (including the +/// empty language) and, occasionally, the total range `.`. +/// +/// Single letters dominate: the unbiased subset mask would produce the empty +/// class `[]` a quarter of the time, and a single `[]` anywhere in a +/// concatenation collapses the whole expression to the empty language. The +/// mask branch keeps every subset (including `[]`) at a positive probability. 
+fn arb_charrange() -> impl Strategy<Value = CharRange> { + prop_oneof![ + 8 => (0..ALPHABET.len()).prop_map(letter), + 3 => prop::collection::vec(any::<bool>(), ALPHABET.len()).prop_map(|mask| { + let mut r = CharRange::empty(); + for (i, &on) in mask.iter().enumerate() { + if on { + r = r.union(&letter(i)); + } + } + r + }), + 1 => Just(CharRange::total()), + ] +} + +/// Strategy producing regular expressions over the four [`RegularExpression`] +/// variants up to a bounded recursion depth. +pub fn arb_regex() -> impl Strategy<Value = RegularExpression> { + let leaf = arb_charrange().prop_map(RegularExpression::Character); + leaf.prop_recursive(3, 24, 3, |inner| { + prop_oneof![ + (inner.clone(), 0u32..=2, 0u32..=2, any::<bool>()).prop_map( + |(r, min, extra, has_max)| { + let max = if has_max { Some(min + extra) } else { None }; + RegularExpression::Repetition(Box::new(r), min, max) + } + ), + prop::collection::vec(inner.clone(), 1..=3) + .prop_map(|v| RegularExpression::Concat(v.into_iter().collect())), + prop::collection::vec(inner, 1..=3).prop_map(RegularExpression::Alternation), + ] + }) +} + +/// Runs `f` under a bounded execution profile, returning `None` when the +/// operation legitimately exceeds the state/time budget (which is not a bug). +fn bounded<T, F: FnOnce() -> Result<T, EngineError>>(f: F) -> Option<T> { + ExecutionProfileBuilder::new() + .max_number_of_states(8192) + .execution_timeout(3000) + .build() + .run(|| match f() { + Ok(v) => Some(v), + Err(EngineError::AutomatonHasTooManyStates) + | Err(EngineError::OperationTimeOutError) => None, + Err(e) => panic!("unexpected engine error: {e:?}"), + }) +} + +fn determinized(a: &FastAutomaton) -> Option<FastAutomaton> { + bounded(|| a.determinize().map(|c| c.into_owned())) +} + +fn complemented(a: &FastAutomaton) -> Option<FastAutomaton> { + bounded(|| { + let mut c = a.clone(); + c.complement()?; + Ok(c) + }) +} + +/// All strings up to length 4 over the alphabet (plus the empty string). 
+fn probes() -> Vec<String> { + let mut all = vec![String::new()]; + let mut frontier = vec![String::new()]; + for _ in 0..4 { + let mut next = Vec::new(); + for w in &frontier { + for &c in ALPHABET { + let mut s = w.clone(); + s.push(c); + next.push(s); + } + } + all.extend(next.iter().cloned()); + frontier = next; + } + all +} + +proptest! { + #![proptest_config(ProptestConfig::with_cases(192))] + + /// The DFA strategy really does produce deterministic automata. + #[test] + fn dfa_strategy_is_deterministic(a in arb_dfa()) { + prop_assert!(a.is_deterministic(), "arb_dfa produced a non-deterministic automaton"); + } + + /// `a` and `determinize(a)` accept the same language. + #[test] + fn determinize_preserves_language(a in arb_nfa()) { + if let Some(d) = determinized(&a) { + prop_assert!(d.is_deterministic()); + if let Some(eq) = bounded(|| a.equivalent(&d)) { + prop_assert!(eq, "determinize changed the language"); + } + } + } + + /// Minimizing a DFA preserves its language. + #[test] + fn minimize_preserves_language(a in arb_nfa()) { + if let Some(d) = determinized(&a) { + let mut m = d.clone(); + if bounded(|| m.minimize()).is_some() + && let Some(eq) = bounded(|| d.equivalent(&m)) + { + prop_assert!(eq, "minimize changed the language"); + } + } + } + + /// Complement laws: membership flips, `a ∩ ¬a = ∅`, `a ∪ ¬a = Σ*`. 
+ #[test] + fn complement_laws(a in arb_dfa()) { + let d = match determinized(&a) { Some(d) => d, None => return Ok(()) }; + let c = match complemented(&d) { Some(c) => c, None => return Ok(()) }; + + for s in probes() { + prop_assert_eq!(d.is_match(&s), !c.is_match(&s), "complement membership for {:?}", s); + } + + if let Some(inter) = bounded(|| d.intersection(&c)) + && let Some(empty) = bounded(|| inter.equivalent(&FastAutomaton::new_empty())) + { + prop_assert!(empty, "a ∩ ¬a is not empty"); + } + if let Some(union) = bounded(|| d.union(&c)) + && let Some(total) = bounded(|| union.equivalent(&FastAutomaton::new_total())) + { + prop_assert!(total, "a ∪ ¬a is not total"); + } + } + + /// Membership of intersection / union / difference matches the boolean + /// combination of the operands on every probe string. + #[test] + fn set_ops_membership(a in arb_nfa(), b in arb_nfa()) { + let ps = probes(); + + if let Some(inter) = bounded(|| a.intersection(&b)) { + for s in &ps { + prop_assert_eq!( + inter.is_match(s), a.is_match(s) && b.is_match(s), + "intersection membership for {:?}", s + ); + } + } + if let Some(union) = bounded(|| a.union(&b)) { + for s in &ps { + prop_assert_eq!( + union.is_match(s), a.is_match(s) || b.is_match(s), + "union membership for {:?}", s + ); + } + } + // `difference` determinizes the subtrahend itself. + if let Some(diff) = bounded(|| a.difference(&b)) { + for s in &ps { + prop_assert_eq!( + diff.is_match(s), a.is_match(s) && !b.is_match(s), + "difference membership for {:?}", s + ); + } + } + } + + /// `subset` and `equivalent` agree: mutual subset iff equivalent; both are + /// reflexive. 
+ #[test] + fn subset_equivalent_consistency(a in arb_nfa(), b in arb_nfa()) { + if let Some(refl) = bounded(|| a.equivalent(&a)) { + prop_assert!(refl, "equivalent is not reflexive"); + } + if let Some(refl) = bounded(|| a.subset(&a)) { + prop_assert!(refl, "subset is not reflexive"); + } + if let (Some(ab), Some(ba), Some(eq)) = ( + bounded(|| a.subset(&b)), + bounded(|| b.subset(&a)), + bounded(|| a.equivalent(&b)), + ) { + prop_assert_eq!(ab && ba, eq, "mutual subset disagrees with equivalent"); + } + } + + /// `a -> regex -> a` round-trips: the regular expression extracted from an + /// automaton compiles back to an equivalent automaton. + #[test] + fn automaton_to_regex_roundtrip(a in arb_nfa()) { + let r = a.to_regex(); + if let Some(a2) = bounded(|| r.to_automaton()) + && let Some(eq) = bounded(|| a.equivalent(&a2)) + { + prop_assert!(eq, "automaton -> regex -> automaton changed the language: {}", r); + } + } + + /// Regular expressions round-trip through an automaton and agree with the + /// reference `regex` crate on every probe string. + #[test] + fn regex_roundtrip_and_oracle(r in arb_regex()) { + let a = match bounded(|| r.to_automaton()) { Some(a) => a, None => return Ok(()) }; + + // regex -> automaton -> regex -> automaton preserves the language. + let r2 = a.to_regex(); + if let Some(a2) = bounded(|| r2.to_automaton()) + && let Some(eq) = bounded(|| a.equivalent(&a2)) + { + prop_assert!(eq, "regex round-trip changed the language: {} -> {}", r, r2); + } + + // Cross-check membership against the standard regex engine (anchored, + // dot-matches-newline). Patterns denoting the empty language ("[]") are + // rejected by the `regex` crate, so we only compare when it accepts the + // pattern. 
+ let pattern = r.to_string(); + if let Ok(re) = regex::Regex::new(&format!("(?s)^(?:{})$", pattern)) { + for s in probes() { + prop_assert_eq!( + a.is_match(&s), re.is_match(&s), + "pattern {:?} disagrees with reference engine on {:?}", pattern, s + ); + } + } + } +} + +#[cfg(test)] +mod inspect { + use super::*; + use proptest::strategy::{Strategy, ValueTree}; + use proptest::test_runner::TestRunner; + + fn samples<S: Strategy>(strat: S, n: usize) -> Vec<S::Value> { + let mut runner = TestRunner::deterministic(); + (0..n) + .map(|_| strat.new_tree(&mut runner).unwrap().current()) + .collect() + } + + #[test] + fn show_regex() { + for (i, r) in samples(arb_regex(), 30).into_iter().enumerate() { + println!("regex[{i:02}] = {r}"); + } + } + + #[test] + fn show_dfa() { + for (i, a) in samples(arb_dfa(), 30).into_iter().enumerate() { + println!("dfa[{i:02}] det={} Graphviz={}", a.is_deterministic(), a); + } + } + + #[test] + fn show_nfa() { + for (i, a) in samples(arb_nfa(), 30).into_iter().enumerate() { + println!("nfa[{i:02}] det={} Graphviz={}", a.is_deterministic(), a); + } + } + + /// Classifies the language of an automaton for the quality summary. 
+ fn classify(a: &FastAutomaton) -> &'static str { + if a.is_empty() { + "empty" + } else if a.is_empty_string() { + "{\"\"}" + } else if a + .determinize() + .map(|d| d.is_total()) + .unwrap_or(false) + { + "total" + } else { + "interesting" + } + } + + fn automaton_stats(name: &str, autos: &[FastAutomaton]) { + let n = autos.len() as f64; + let mut counts = std::collections::BTreeMap::new(); + let mut states = 0usize; + let mut edges = 0usize; + for a in autos { + *counts.entry(classify(a)).or_insert(0usize) += 1; + states += a.get_number_of_states(); + edges += a + .states_vec() + .iter() + .map(|&s| a.transitions_from_vec(s).len()) + .sum::<usize>(); + } + let pct = |k: &str| 100.0 * *counts.get(k).unwrap_or(&0) as f64 / n; + println!( + "{name}: empty {:>5.1}% | {{\"\"}} {:>5.1}% | total {:>5.1}% | interesting {:>5.1}% | avg states {:.2} | avg edges {:.2}", + pct("empty"), pct("{\"\"}"), pct("total"), pct("interesting"), + states as f64 / n, edges as f64 / n, + ); + } + + /// Quantitative quality summary over a larger sample. 
+ #[test] + fn stats() { + const N: usize = 300; + + let regexes = samples(arb_regex(), N); + let regex_autos: Vec<FastAutomaton> = regexes + .iter() + .map(|r| r.to_automaton().expect("small regexes always convert")) + .collect(); + let avg_len = + regexes.iter().map(|r| r.to_string().len()).sum::<usize>() as f64 / N as f64; + automaton_stats("regex", &regex_autos); + println!("regex: avg pattern length {avg_len:.1}"); + + automaton_stats("dfa ", &samples(arb_dfa(), N)); + automaton_stats("nfa ", &samples(arb_nfa(), N)); + } +} From d37ac9a0950b3e4b9d9e57fae49cc7617a75bdb3 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:59:08 +0200 Subject: [PATCH 58/62] Big refactoring --- .github/workflows/rust.yml | 4 + Cargo.toml | 16 +- README.md | 419 ++++----- assets/automaton.svg | 70 ++ benches/my_benchmark.rs | 89 -- benches/operations.rs | 289 ++++++ examples/generate.rs | 25 + examples/relate.rs | 51 + src/error/mod.rs | 20 + src/execution_profile.rs | 330 ++++++- src/fast_automaton/analyze/cardinality.rs | 129 ++- src/fast_automaton/analyze/equivalence.rs | 13 +- src/fast_automaton/analyze/length.rs | 166 +++- src/fast_automaton/analyze/subset.rs | 18 +- src/fast_automaton/builder.rs | 180 ++++ src/fast_automaton/condition/converter.rs | 28 +- .../condition/fast_bit_vec/mod.rs | 16 + src/fast_automaton/condition/mod.rs | 47 + .../to_regex/state_elimination/builder.rs | 6 +- .../to_regex/state_elimination/eliminate.rs | 85 +- src/fast_automaton/generate.rs | 12 +- src/fast_automaton/mod.rs | 33 +- src/fast_automaton/operation/determinize.rs | 20 +- src/fast_automaton/operation/difference.rs | 13 +- src/fast_automaton/operation/intersection.rs | 32 + src/fast_automaton/operation/minimize.rs | 23 +- src/fast_automaton/operation/repeat.rs | 121 ++- src/fast_automaton/operation/union.rs | 15 +- src/lib.rs | 156 +-- src/regex/analyze/affixes.rs | 22 +- src/regex/analyze/mod.rs | 24 +- src/regex/analyze/number_of_states.rs | 
55 +- src/regex/builder.rs | 113 ++- src/regex/mod.rs | 72 +- src/regex/operation/concat.rs | 106 ++- src/regex/operation/mod.rs | 2 +- src/regex/operation/repeat.rs | 66 +- src/regex/operation/simplify.rs | 2 +- src/regex/operation/union.rs | 58 +- src/regex/serializer.rs | 25 - tests/proptest_strategies.rs | 887 +++++++++++++++--- tests/readme_examples.rs | 77 ++ 42 files changed, 3093 insertions(+), 842 deletions(-) create mode 100644 assets/automaton.svg delete mode 100644 benches/my_benchmark.rs create mode 100644 benches/operations.rs create mode 100644 examples/generate.rs create mode 100644 examples/relate.rs delete mode 100644 src/regex/serializer.rs create mode 100644 tests/readme_examples.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0666f63..8373213 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -21,3 +21,7 @@ jobs: run: | cargo test cargo clippy + - name: Test & Lint (no default features) + run: | + cargo test --no-default-features + cargo clippy --no-default-features diff --git a/Cargo.toml b/Cargo.toml index 25c0888..6bc045a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,28 +6,28 @@ authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" keywords = ["automaton", "intersection", "union", "difference", "regex"] +categories = ["text-processing", "mathematics", "algorithms"] description = "High-performance Rust library for building, combining, and analyzing regular expressions and finite automata" readme = "README.md" [dependencies] nohash-hasher = "0.2" ahash = "0.8.11" -log = "0.4.21" -rand = "0.8.5" -lazy_static = "1.4.0" -regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } -rayon = "1.10.0" +rayon = { version = "1.10.0", optional = true } bit-set = "0.8.0" indexmap = "2.13.0" +[features] +default = ["parallel"] +parallel = ["dep:rayon"] + [dev-dependencies] criterion = { version = "0.5", features = 
["html_reports"] } -env_logger = "0.11.3" -serde_json = "1.0.114" proptest = "1" +regex = "1.10.3" [[bench]] -name = "my_benchmark" +name = "operations" harness = false diff --git a/README.md b/README.md index d5a2885..0cfb51a 100644 --- a/README.md +++ b/README.md @@ -1,265 +1,178 @@ # RegexSolver + [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) +[![docs.rs](https://img.shields.io/docsrs/regexsolver)](https://docs.rs/regexsolver) +[![CI](https://github.com/RegexSolver/regexsolver/actions/workflows/rust.yml/badge.svg)](https://github.com/RegexSolver/regexsolver/actions/workflows/rust.yml) +[![License: MIT](https://img.shields.io/crates/l/regexsolver)](LICENSE) + +The `regex` crate tells you whether a *string* matches a pattern. **RegexSolver treats patterns as the sets of strings they match** — so you can intersect, subtract, compare, complement, and enumerate them, and get the result back as a regex. + +```rust +use regexsolver::Term; + +let a = Term::from_pattern("(ab|xy){2}")?; +let b = Term::from_pattern(".*xy")?; -**RegexSolver** is a Rust library for building, combining, and analyzing regular expressions and finite automata. It is designed for constraint solvers, test generators, and other systems that need advanced regex and automaton operations. +// Which strings match BOTH patterns? Get the answer as a regex: +let both = a.intersection(&[b])?; +assert_eq!(both.to_pattern(), "(ab|xy)xy"); -## Table of Contents +// ...and sample them: +assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); +``` - - [Installation](#installation) - - [Example](#example) - - [Key Concepts & Limitations](#key-concepts--limitations) - - [API](#api) - - [Term](#term) - - [FastAutomaton](#fastautomaton) - - [RegularExpression](#regularexpression) - - [Bound Execution](#bound-execution) - - [Cross-Language Support](#cross-language-support) - - [License](#license) +## What would you use this for? 
-## Installation +- **Safe migrations** — `old_rule.subset(&new_rule)?`: does the new validation pattern accept *everything* the old one did? +- **Test-data generation** — `term.generate_strings(100, 0)?`: produce strings matching any pattern, with pagination. +- **Rule analysis** — find shadowed or overlapping routes, firewall rules, and validators with `intersection` / `difference`. +- **Equivalence proofs** — `a.equivalent(&b)?`: show that two differently-written patterns match exactly the same strings. +- **Pattern simplification** — every operation returns a `Term` you can turn back into a clean pattern with `to_pattern()`. -Add to your `Cargo.toml`: +Under the hood, every pattern compiles to a finite automaton: -```toml -[dependencies] -regexsolver = "1" +

the minimal automaton of (ab|cd)*

+

(ab|cd)* compiled to its minimal automaton — generated with this library's as_dot()

+ +## Try it + +```bash +git clone https://github.com/RegexSolver/regexsolver && cd regexsolver + +# How do two patterns relate? (equivalence, subsets, intersection, differences) +cargo run --example relate -- "(ab|xy){2}" ".*xy" + +# Sample strings matching a pattern +cargo run --example generate -- "[a-z]{2}[0-9]" 20 ``` -## Example +```text +a = (ab|xy){2} +b = .*xy -```rust -use regexsolver::Term; -use regexsolver::error::EngineError; +equivalent: no +a subset of b: false +b subset of a: false -fn main() -> Result<(), EngineError> { - // Create terms from regex - let t1 = Term::from_pattern("abc.*")?; - let t2 = Term::from_pattern(".*xyz")?; - - // Concatenate - let concat = t1.concat(&[t2])?; - assert_eq!(concat.to_pattern(), "abc.*xyz"); - - // Union - let union = t1.union(&[Term::from_pattern("fgh")?])?; - assert_eq!(union.to_pattern(), "(abc.*|fgh)"); - - // Intersection - let inter = Term::from_pattern("(ab|xy){2}")? - .intersection(&[Term::from_pattern(".*xy")?])?; - assert_eq!(inter.to_pattern(), "(ab|xy)xy"); - - // Difference - let diff = Term::from_pattern("a*")? - .difference(&Term::from_pattern("")?)?; - assert_eq!(diff.to_pattern(), "a+"); - - // Repetition - let rep = Term::from_pattern("abc")? - .repeat(2, Some(4))?; - assert_eq!(rep.to_pattern(), "(abc){2,4}"); - - // Analyze - assert_eq!(rep.get_length(), (Some(6), Some(12))); - assert!(!rep.is_empty()); - - // Generate examples - let samples = Term::from_pattern("(x|y){1,3}")? - .generate_strings(5, 0)?; - println!("Some matches: {:?}", samples); - - // Equivalence & subset - let a = Term::from_pattern("a+")?; - let b = Term::from_pattern("a*")?; - assert!(!a.equivalent(&b)?); - assert!(a.subset(&b)?); - - Ok(()) -} +a ∩ b = (ab|xy)xy + e.g. 
["xyxy", "abxy"] +a - b = (ab|xy)ab +b - a = (x{1,2}|ax|([^ax]|a[^b]|x[^y]).*x|(ab|xy)(x{2}|ax|([^ax]|a[^b]|x[^y]|(ab|xy).).*x|(ab|xy)x))y ``` -## Key Concepts & Limitations +Or in your own project: -RegexSolver supports a subset of regular expressions that adhere to the principles of regular languages. Here are the key characteristics and limitations of the regular expressions supported by RegexSolver: -- **Anchored Expressions:** All regular expressions in RegexSolver are anchored. This means that the expressions are treated as if they start and end at the boundaries of the input text. For example, the expression `abc` will match the string "abc" but not "xabc" or "abcx". -- **Lookahead/Lookbehind:** RegexSolver does not support lookahead (`(?=...)`) or lookbehind (`(?<=...)`) assertions. Using them returns an error. -- **Pure Regular Expressions:** RegexSolver focuses on pure regular expressions as defined in regular language theory. This means features that extend beyond regular languages, such as backreferences (`\1`, `\2`, etc.), are not supported. Any use of backreference would return an error. -- **Greedy/Ungreedy Quantifiers:** The concept of ungreedy (`*?`, `+?`, `??`) quantifiers is not supported. All quantifiers are treated as greedy. For example, `a*` or `a*?` will match the longest possible sequence of "a"s. -- **Line Feed and Dot:** RegexSolver handles all characters the same way. The dot `.` matches any Unicode character including line feed (`\n`). -- **Empty Regular Expressions:** The empty language (matches no string) is represented by constructs like `[]` (empty character class). This is distinct from the empty string. +```bash +cargo add regexsolver +``` + +By default the `parallel` feature is enabled: unions/intersections of more than 3 operands and parts of the automaton-to-regex conversion run on [rayon](https://crates.io/crates/rayon). 
Disable it for a leaner dependency tree on single-threaded workloads: + +```toml +regexsolver = { version = "1", default-features = false } +``` + +## Semantics in 30 seconds + +RegexSolver implements **pure regular languages**, which differs from typical regex engines in two ways that surprise people: + +- **Everything is anchored**: `abc` matches the string "abc" — not "xabc" or "abcx". Patterns describe *whole strings*. +- **`.` matches any character**, including line feed (`\n`). + +The rest follows from regular-language theory: + +- **Backreferences** (`\1`, `\2`, ...) go beyond regular languages and return an error, as do **lookahead/lookbehind** assertions (`(?=...)`, `(?<=...)`). +- **All quantifiers are greedy**: ungreedy markers (`*?`, `+?`, `??`) are ignored — as *sets of strings*, `a*` and `a*?` are the same language. +- **The empty language** (matches no string at all) is written `[]` (empty character class). This is distinct from the empty string `""`. RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. -## API - -### Term - -`Term` is an enum designed to represent either a regular expression or an automaton. Used when working with both regular expressions and automata, allowing operations to be performed transparently regardless of the underlying representation. - -#### Build -| Method | Return | Description | -| -------- | ------- | ------- | -| `from_automaton(automaton: FastAutomaton)` | `Term` | Creates a new `Term` holding the provided `FastAutomaton`. 
| -| `from_pattern(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns a new `Term` holding the resulting `RegularExpression`. | -| `from_regex(regex: RegularExpression)` | `Term` | Creates a new `Term` holding the provided `RegularExpression`. | -| `new_empty()` | `Term` | Creates a term that matches the empty language. | -| `new_empty_string()` | `Term` | Creates a term that only matches the empty string `""`. | -| `new_total()` | `Term` | Creates a term that matches all possible strings. | - -#### Manipulate -| Method | Return | Description | -| -------- | ------- | ------- | -| `concat(&self, terms: &[Term])` | `Result` | Computes the concatenation of the given terms. | -| `difference(&self, other: &Term)` | `Result` | Computes the difference between `self` and `other`. | -| `intersection(&self, terms: &[Term])` | `Result` | Computes the intersection of the given terms. | -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `union(&self, terms: &[Term])` | `Result` | Computes the union of the given terms. | - -#### Analyze -| Method | Return | Description | -| -------- | ------- | ------- | -| `equivalent(&self, term: &Term)` | `Result` | Returns `true` if both terms accept the same language. | -| `generate_strings(&self, count: usize, offset: usize)` | `Result, EngineError>` | Generates `count` strings matched by the term, skipping the first `offset` strings. | -| `get_cardinality(&self)` | `Result, EngineError>` | Returns the cardinality of the term (i.e., the number of possible matched strings). | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `is_empty(&self)` | `bool` | Checks if the term matches the empty language. | -| `is_empty_string(&self)` | `bool` | Checks if the term matches only the empty string `""`. 
| -| `is_total(&self)` | `bool` | Checks if the term matches all possible strings. | -| `subset(&self, term: &Term)` | `Result` | Returns `true` if all strings matched by the current term are also matched by the given term. | -| `to_automaton(&self)` | `Result, EngineError>` | Converts the term to a `FastAutomaton`. | -| `to_pattern(&self)` | `String` | Converts the term to a regular expression pattern. | -| `to_regex(&self)` | `Cow` | Converts the term to a `RegularExpression`. | - -### FastAutomaton +## A tour of the API + +[`Term`](https://docs.rs/regexsolver/latest/regexsolver/enum.Term.html) is the type you'll interact with: it wraps either a regular expression or an automaton and picks the best representation for each operation. The essentials: + +| Method | Description | +| -------- | ------- | +| `Term::from_pattern(pattern)` | Parses a pattern into a term. | +| `intersection(&self, terms)` / `union(&self, terms)` | Set operations over any number of terms. | +| `difference(&self, other)` / `complement(&self)` | What `self` matches and `other` doesn't / everything `self` doesn't match. | +| `concat(&self, terms)` / `repeat(&self, min, max)` | Sequence and repeat languages. | +| `equivalent(&self, other)` / `subset(&self, other)` | Compare languages. | +| `is_empty()` / `is_total()` / `get_length()` / `get_cardinality()` | Analyze a language: matches nothing? everything? string lengths? how many strings? | +| `generate_strings(limit, offset)` | Enumerate matching strings (call `minimize()` once first when paginating). | +| `to_pattern()` / `to_automaton()` / `to_regex()` | Convert back out. | + +All fallible operations return `Result<_, EngineError>` — nothing panics on adversarial input. + +### Building automata by hand `FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. 
-When building or modifying an automaton you might come to use the method `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)`. This method accepts a `Condition` rather than a raw character set. To build a `Condition`, call: -```rust -Condition::from_range(&range, &spanning_set); -``` -where `spanning_set` is the automaton's current `SpanningSet`. The `CharRange` you pass must be fully covered by that spanning set. If it isn't, you have two options: +States are created with `new_state()` and transitions with `add_transition_from_range`, which labels the transition with a plain `CharRange`: -1. Merge an existing spanning set with another: ```rust -let new_set = SpanningSet::merge(&old_set, &other_set); +use regexsolver::CharRange; +use regexsolver::fast_automaton::FastAutomaton; +use regex_charclass::char::Char; + +// Build an automaton matching "[a-c][0-9]*" by hand: +let mut automaton = FastAutomaton::new_empty(); +let s1 = automaton.new_state(); +automaton.accept(s1); + +let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c')); +let digits = CharRange::new_from_range(Char::new('0')..=Char::new('9')); +automaton.add_transition_from_range(0, s1, &a_to_c)?; +automaton.add_transition_from_range(s1, s1, &digits)?; + +assert!(automaton.is_match("b42")); +assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*"); ``` -2. Recompute from a list of ranges: -```rust -let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2, …]); -``` +Internally, transition labels are bitvector `Condition`s over the automaton's `SpanningSet` of disjoint character ranges — that is what makes label union/intersection/complement O(1) ([article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation)). 
`add_transition_from_range` maintains that representation for you; for full manual control over conditions and spanning sets, see the [`add_transition` documentation](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html#method.add_transition). + +Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html) — `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex` — plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `as_dot`, ...). + +### Working with patterns as ASTs + +`RegularExpression` is the parsed pattern itself: a plain AST enum (`Character` / `Repetition` / `Concat` / `Alternation`) you can analyze and walk directly. Set operations like intersection and difference live on `FastAutomaton` (or, more conveniently, on `Term`); convert with `to_automaton()`. -After constructing `new_set`, apply it to the automaton: ```rust -fast_automaton.apply_new_spanning_set(&new_set); +use regexsolver::cardinality::Cardinality; +use regexsolver::regex::RegularExpression; + +// A validation pattern for an order id, e.g. "ORD-2024-12345". +let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?; + +// How long can matching ids get? Size your database column accordingly. +assert_eq!(pattern.get_length(), (Some(13), Some(15))); + +// How many distinct ids does the pattern allow? +assert_eq!(pattern.get_cardinality(), Cardinality::Integer(111_000_000)); + +// The AST is a plain enum: walk it to lint patterns, e.g. reject +// validation rules that accept unboundedly long input. 
+fn has_unbounded_repetition(regex: &RegularExpression) -> bool { + match regex { + RegularExpression::Character(_) => false, + RegularExpression::Repetition(inner, _, max) => { + max.is_none() || has_unbounded_repetition(inner) + } + RegularExpression::Concat(parts) => parts.iter().any(has_unbounded_repetition), + RegularExpression::Alternation(parts) => parts.iter().any(has_unbounded_repetition), + } +} +assert!(!has_unbounded_repetition(&pattern)); +assert!(has_unbounded_repetition(&RegularExpression::new(".*@example\\.com")?)); ``` -This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). - -#### Build -| Method | Return | Description | -| -------- | ------- | ------- | -| `accept(&mut self, state: State)` | `()` | Marks the provided state as an accepting (final) state. | -| `add_epsilon_transition(&mut self, from_state: State, to_state: State)` | `()` | Creates a new epsilon transition between the two states. | -| `add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition)` | `()` | Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. | -| `apply_new_spanning_set(&mut self, new_spanning_set: &SpanningSet)` | `Result<(), EngineError>` | Applies the provided spanning set and projects all existing conditions onto it. | -| `new_empty()` | `FastAutomaton` | Creates an automaton that matches the empty language. | -| `new_empty_string()` | `FastAutomaton` | Creates an automaton that only matches the empty string `""`. | -| `new_from_range(range: &CharRange)` | `FastAutomaton` | Creates an automaton that matches one of the characters in the given `CharRange`. 
| -| `new_state(&mut self)` | `State` | Creates a new state and returns its identifier. | -| `new_total()` | `FastAutomaton` | Creates an automaton that matches all possible strings. | -| `remove_state(&mut self, state: State)` | `()` | Removes the state and its connected transitions; panics if it's a start state. | -| `remove_states(&mut self, states: &IntSet)` | `()` | Removes the given states and their connected transitions; panics if any is a start state. | -| `remove_transition(&mut self, from_state: State, to_state: State)` | `()` | Removes the transition between the two provided states if it exists. | - -#### Manipulate -| Method | Return | Description | -| -------- | ------- | ------- | -| `complement(&mut self)` | `Result<(), EngineError>` | Complements the automaton; it must be deterministic. | -| `concat(&self, other: &FastAutomaton)` | `Result` | Computes the concatenation between `self` and `other`. | -| `concat_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the concatenation of all automata in the given iterator. | -| `determinize(&self)` | `Result, EngineError>` | Determinizes the automaton and returns the result. | -| `difference(&self, other: &FastAutomaton)` | `Result` | Computes the difference between `self` and `other`. | -| `has_intersection(&self, other: &FastAutomaton)` | `Result` | Returns `true` if the two automata have a non-empty intersection. | -| `intersection(&self, other: &FastAutomaton)` | `Result` | Computes the intersection between `self` and `other`. | -| `intersection_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the intersection of all automata in the given iterator. | -| `intersection_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the intersection of all automata in the given iterator. 
| -| `repeat(&self, min: u32, max_opt: Option)` | `Result` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. | -| `union(&self, other: &FastAutomaton)` | `Result` | Computes the union between `self` and `other`. | -| `union_all<'a, I: IntoIterator>(automata: I)` | `Result` | Computes the union of all automata in the given iterator. | -| `union_all_par<'a, I: IntoParallelIterator>(automata: I)` | `Result` | Computes in parallel the union of all automata in the given iterator. | - -#### Analyze -| Method | Return | Description | -| -------- | ------- | ------- | -| `as_dot(&self)` | `String` | Returns the automaton's DOT representation. | -| `direct_states(&self, state: State)` | `impl Iterator` | Returns an iterator over states directly reachable from the given state in one transition. | -| `direct_states_vec(&self, state: State)` | `Vec` | Returns a vector of states directly reachable from the given state in one transition. | -| `equivalent(&self, other: &FastAutomaton)` | `Result` | Returns `true` if both automata accept the same language. | -| `generate_strings(&self, count: usize, offset: usize)` | `Result, EngineError>` | Generates `count` strings matched by the automaton, skipping the first `offset` strings. | -| `get_accept_states(&self)` | `&IntSet` | Returns a reference to the set of accept (final) states. | -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the automaton (i.e., the number of possible matched strings). | -| `get_condition(&self, from_state: State, to_state: State)` | `Option<&Condition>` | Returns a reference to the condition of the directed transition between the two states, if any. | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of matched strings. | -| `get_number_of_states(&self)` | `usize` | Returns the number of states in the automaton. 
| -| `get_live_states(&self)` | `IntSet` | Returns the set of "live" states: those that can reach an accept state. | -| `get_spanning_set(&self)` | `&SpanningSet` | Returns a reference to the automaton's spanning set. | -| `get_start_state(&self)` | `State` | Returns the start state. | -| `has_state(&self, state: State)` | `bool` | Returns `true` if the automaton contains the given state. | -| `has_transition(&self, from_state: State, to_state: State)` | `bool` | Returns `true` if there is a directed transition from `from_state` to `to_state`. | -| `in_degree(&self, state: State)` | `usize` | Returns the number of transitions to the provided state. | -| `is_accepted(&self, state: State)` | `bool` | Returns `true` if the given state is one of the accept states. | -| `is_deterministic(&self)` | `bool` | Returns `true` if the automaton is deterministic. | -| `is_empty(&self)` | `bool` | Checks if the automaton matches the empty language. | -| `is_empty_string(&self)` | `bool` | Checks if the automaton only matches the empty string `""`. | -| `is_match(&self, string: &str)` | `bool` | Returns `true` if the automaton matches the given string. | -| `is_total(&self)` | `bool` | Checks if the automaton matches all possible strings. | -| `out_degree(&self, state: State)` | `usize` | Returns the number of transitions from the provided state. | -| `print_dot(&self)` | `()` | Prints the automaton's DOT representation. | -| `states(&self)` | `impl Iterator` | Returns an iterator over the automaton’s states. | -| `states_vec(&self)` | `Vec` | Returns a vector containing the automaton’s states. | -| `subset(&self, other: &FastAutomaton)` | `Result` | Returns `true` if all strings accepted by `self` are also accepted by `other`. | -| `to_regex(&self)` | `RegularExpression` | Converts the term to a `RegularExpression`. | -| `transitions_from(&self, state: State)` | `impl Iterator` | Returns an iterator over transitions from the given state. 
| -| `transitions_from_vec(&self, state: State)` | `Vec<(Condition, State)>` | Returns a vector of transitions from the given state. | -| `transitions_to_vec(&self, state: State)` | `Vec<(State, Condition)>` | Returns a vector of transitions to the given state. | - - -### RegularExpression - -`RegularExpression` is used to directly build, manipulate and analyze regular expression patterns. Not all the set operations are available, for more advanced operation such as intersection, subtraction/difference and complement it is necessary to convert into a `FastAutomaton` with the method `to_automaton()`. - -#### Build/Manipulate -| Method | Return | Description | -| -------- | ------- | ------- | -| `concat(&self, other: &RegularExpression, append_back: bool)` | `RegularExpression` | Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. | -| `concat_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the concatenation of all expressions in `patterns`. | -| `new(pattern: &str)` | `Result` | Parses and simplifies the provided pattern and returns the resulting `RegularExpression`. | -| `new_empty()` | `RegularExpression` | Creates a regular expression that matches the empty language. | -| `new_empty_string()` | `RegularExpression` | Creates a regular expression that matches only the empty string `""`. | -| `new_total()` | `RegularExpression` | Creates a regular expression that matches all possible strings. | -| `parse(pattern: &str, simplify: bool)` | `Result` | Parses the provided pattern and returns the resulting `RegularExpression`. If `simplify` is `true`, the expression is simplified during parsing. | -| `repeat(&self, min: u32, max_opt: Option)` | `RegularExpression` | Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
| -| `simplify(&self)` | `RegularExpression` | Returns a simplified version by eliminating redundant constructs and applying canonical reductions. | -| `union(&self, other: &RegularExpression)` | `RegularExpression` | Returns a regular expression matching the union of `self` and `other`. | -| `union_all<'a, I: IntoIterator>(patterns: I)` | `RegularExpression` | Returns a regular expression that is the union of all expressions in `patterns`. | - -#### Analyze -| Method | Return | Description | -| -------- | ------- | ------- | -| `evaluate_complexity(&self)` | `f64` | Returns a heuristic score for the readability of the pattern. | -| `get_cardinality(&self)` | `Cardinality` | Returns the cardinality of the regular expression (i.e., the number of possible matched strings). | -| `get_length(&self)` | `(Option, Option)` | Returns the minimum and maximum length of possible matched strings. | -| `is_empty(&self)` | `bool` | Checks if the regular expression matches the empty language. | -| `is_empty_string(&self)` | `bool` | Checks if the regular expression only matches the empty string `""`. | -| `is_total(&self)` | `bool` | Checks if the regular expression matches all possible strings. | -| `to_automaton(&self)` | `Result` | Converts the regular expression to an equivalent `FastAutomaton`. | +The variants are freely constructible too; a hand-built repetition whose maximum is below its minimum denotes no valid language and is rejected with `EngineError::InvalidRepetitionBounds` when converted by `to_automaton()`. +Parsing (`new`, `parse`), the simplifying combinators (`concat`, `union`, `repeat`, `simplify`) and the analyses (`get_length`, `get_cardinality`, `evaluate_complexity`) are documented on [`RegularExpression`](https://docs.rs/regexsolver/latest/regexsolver/regex/enum.RegularExpression.html). ## Bound Execution -Use a thread-local `ExecutionProfile` to cap runtime or state explosion; hitting a limit returns a specific `EngineError`. 
+Automaton operations can blow up on adversarial inputs, so the engine is built to run untrusted patterns safely: a thread-local `ExecutionProfile` caps runtime and state explosion, and controls when the engine may determinize or minimize on its own. Hitting a limit returns a specific `EngineError` instead of hanging or panicking. ### Time-Bounded Execution @@ -274,7 +187,7 @@ let execution_profile = ExecutionProfileBuilder::new() // We run the operation with the defined limitation execution_profile.run(|| { - assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000).unwrap_err()); + assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000).unwrap_err()); }); ``` @@ -296,6 +209,52 @@ execution_profile.run(|| { }); ``` +### Disabling Implicit Determinization + +`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `get_cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working — that layer manages the underlying representation itself, so its determinizations count as explicit. + +```rust +use regexsolver::execution_profile::ExecutionProfileBuilder; +use regexsolver::error::EngineError; + +let execution_profile = ExecutionProfileBuilder::new() + .implicit_determinization(false) // default is true + .build(); + +// `nfa` is any non-deterministic FastAutomaton +execution_profile.run(|| { + assert_eq!(EngineError::DeterministicAutomatonRequired, nfa.clone().minimize().unwrap_err()); + + // Determinizing explicitly is always allowed. 
+ let mut dfa = nfa.determinize().unwrap().into_owned(); + assert!(dfa.minimize().is_ok()); +}); +``` + +### Minimizing After Determinization + +Every determinization can be followed automatically by a minimization of the result (off by default: it costs an extra Hopcroft pass, but keeps downstream operations working on the smallest possible automata). Inputs that are already deterministic are returned untouched. + +```rust +use regexsolver::execution_profile::ExecutionProfileBuilder; + +let execution_profile = ExecutionProfileBuilder::new() + .minimize_after_determinization(true) // default is false + .build(); + +// `nfa` is any non-deterministic FastAutomaton +execution_profile.run(|| { + let dfa = nfa.determinize().unwrap(); + assert!(dfa.is_minimal()); +}); +``` + +## How it works + +- Patterns are parsed with [regex-syntax](https://docs.rs/regex-syntax/latest/regex_syntax/) and simplified into a small regular-expression AST; set operations run on finite automata; results convert back to patterns via state elimination. +- Transition labels are bitvectors over a per-automaton "spanning set" of disjoint character ranges, making label union/intersection/complement O(1): see [Optimizing Automaton Representation with Transition Conditions](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). +- Correctness is cross-validated against the `regex` crate and exercised by property-based tests over randomly generated automata and expressions, with brute-force oracles for the analyses. 
+ ## Cross-Language Support If you want to use this library with other programming languages, we provide a wide range of wrappers: diff --git a/assets/automaton.svg b/assets/automaton.svg new file mode 100644 index 0000000..dabacdd --- /dev/null +++ b/assets/automaton.svg @@ -0,0 +1,70 @@ + + + + + + +Automaton + + + +1 + +1 + + + +2 + + +2 + + + +1->2 + + +b + + + +2->1 + + +a + + + +3 + +3 + + + +2->3 + + +c + + + +initial + + + +initial->2 + + + + + +3->2 + + +d + + + diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs deleted file mode 100644 index 2488acf..0000000 --- a/benches/my_benchmark.rs +++ /dev/null @@ -1,89 +0,0 @@ -use criterion::{Criterion, black_box, criterion_group, criterion_main}; -use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; - -fn parse_regex(regex: &str) -> RegularExpression { - RegularExpression::new(regex).unwrap() -} - -fn to_regex(automaton: &FastAutomaton) -> RegularExpression { - automaton.to_regex() -} - -fn determinize(automaton: &FastAutomaton) -> FastAutomaton { - automaton.determinize().unwrap().into_owned() -} - -fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> FastAutomaton { - automaton_1.intersection(automaton_2).unwrap() -} - -fn generate_strings(automaton: &FastAutomaton) -> Vec { - automaton.generate_strings(2000, 1000).unwrap() -} - -fn criterion_benchmark(c: &mut Criterion) { - { - c.bench_function("parse_regex", |b| { - b.iter(|| parse_regex(black_box("a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}"))) - }); - } - - { - let input_regex = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){2,3}").unwrap(); - let input_automaton = input_regex.to_automaton().unwrap(); - - c.bench_function("to_regex", |b| { - b.iter(|| to_regex(black_box(&input_automaton))) - }); - } - - { - let input_regex = RegularExpression::new( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - ) - .unwrap(); - let input_automaton = 
input_regex.to_automaton().unwrap(); - - c.bench_function("determinize", |b| { - b.iter(|| determinize(black_box(&input_automaton))) - }); - } - - /*{ - let input_regex = RegularExpression::new("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5}").unwrap(); - let input_automaton = input_regex.to_automaton().unwrap(); - - c.bench_function("test_determinize", |b| { - b.iter(|| determinize(black_box(&input_automaton))) - }); - }*/ - - { - let automaton1 = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") - .unwrap() - .to_automaton() - .unwrap(); - let automaton2 = RegularExpression::new("a(bcfe|mkv|opr)*(abc){2,4}") - .unwrap() - .to_automaton() - .unwrap(); - - c.bench_function("intersection", |b| { - b.iter(|| intersection(black_box(&automaton1), black_box(&automaton2))) - }); - } - - { - let automaton = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") - .unwrap() - .to_automaton() - .unwrap(); - - c.bench_function("generate_strings", |b| { - b.iter(|| generate_strings(black_box(&automaton))) - }); - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/benches/operations.rs b/benches/operations.rs new file mode 100644 index 0000000..77ee226 --- /dev/null +++ b/benches/operations.rs @@ -0,0 +1,289 @@ +//! Benchmarks covering the main operation families of the library. +//! +//! Inputs come in named sizes so numbers stay comparable across versions: +//! +//! * `small` / `medium` / `large` — realistic patterns of increasing size. +//! * `blowup_N` — the classic `(a|b)*a(a|b){N}` family whose minimal DFA has +//! 2^(N+1) states: the worst case of subset construction. +//! +//! Mutating operations (`minimize`, `complement`) are measured with +//! `iter_batched` on a fresh clone per iteration, so flag short-circuits +//! (e.g. `minimize` early-returning on an already-minimal automaton) don't +//! skew the numbers.
+ +use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main}; +use regex_charclass::char::Char; +use regexsolver::fast_automaton::FastAutomaton; +use regexsolver::regex::RegularExpression; +use regexsolver::{CharRange, Term}; +use std::hint::black_box; + +const SMALL: (&str, &str) = ("small", "(abc|de){2}"); +const MEDIUM: (&str, &str) = ("medium", "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}"); +const LARGE: (&str, &str) = ( + "large", + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", +); + +fn automaton(pattern: &str) -> FastAutomaton { + RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap() +} + +fn dfa(pattern: &str) -> FastAutomaton { + automaton(pattern).determinize().unwrap().into_owned() +} + +/// `(a|b)*a(a|b){n}`: an n+2-state NFA whose minimal DFA has 2^(n+1) states. +fn blowup_pattern(n: usize) -> String { + format!("(a|b)*a(a|b){{{n}}}") +} + +fn bench_parse(c: &mut Criterion) { + let mut group = c.benchmark_group("parse"); + for (name, pattern) in [SMALL, MEDIUM, LARGE] { + group.bench_with_input(BenchmarkId::from_parameter(name), pattern, |b, pattern| { + b.iter(|| RegularExpression::new(black_box(pattern)).unwrap()) + }); + } + group.finish(); +} + +fn bench_to_automaton(c: &mut Criterion) { + let mut group = c.benchmark_group("to_automaton"); + for (name, pattern) in [SMALL, MEDIUM, LARGE] { + let regex = RegularExpression::new(pattern).unwrap(); + group.bench_with_input(BenchmarkId::from_parameter(name), &regex, |b, regex| { + b.iter(|| black_box(regex).to_automaton().unwrap()) + }); + } + group.finish(); +} + +fn bench_determinize(c: &mut Criterion) { + let mut group = c.benchmark_group("determinize"); + for n in [5, 10] { + let nfa = automaton(&blowup_pattern(n)); + group.bench_with_input(BenchmarkId::new("blowup", n), &nfa, |b, nfa| { + b.iter(|| black_box(nfa).determinize().unwrap().into_owned()) + }); + } + let nfa = automaton(LARGE.1); +
group.bench_with_input(BenchmarkId::from_parameter("large"), &nfa, |b, nfa| { + b.iter(|| black_box(nfa).determinize().unwrap().into_owned()) + }); + group.finish(); +} + +fn bench_minimize(c: &mut Criterion) { + let mut group = c.benchmark_group("minimize"); + for n in [5, 10] { + let blowup_dfa = dfa(&blowup_pattern(n)); + group.bench_with_input(BenchmarkId::new("blowup", n), &blowup_dfa, |b, dfa| { + b.iter_batched( + || dfa.clone(), + |mut automaton| { + automaton.minimize().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }); + } + let large_dfa = dfa(LARGE.1); + group.bench_with_input( + BenchmarkId::from_parameter("large"), + &large_dfa, + |b, dfa| { + b.iter_batched( + || dfa.clone(), + |mut automaton| { + automaton.minimize().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }, + ); + group.finish(); +} + +fn bench_set_operations(c: &mut Criterion) { + let mut group = c.benchmark_group("set_operations"); + + let a = automaton("a(bcfe|bcdg|mkv)*(abc){1,3}"); + let b_op = automaton("a(bcfe|mkv|opr)*(abc){2,4}"); + group.bench_function("intersection", |b| { + b.iter(|| black_box(&a).intersection(black_box(&b_op)).unwrap()) + }); + group.bench_function("union", |b| { + b.iter(|| black_box(&a).union(black_box(&b_op)).unwrap()) + }); + + let minuend = automaton(".*abc.*"); + let subtrahend = automaton(".*def.*"); + group.bench_function("difference", |b| { + b.iter(|| { + black_box(&minuend) + .difference(black_box(&subtrahend)) + .unwrap() + }) + }); + + let complement_input = dfa(".*abc.*"); + group.bench_function("complement", |b| { + b.iter_batched( + || complement_input.clone(), + |mut automaton| { + automaton.complement().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }); + + group.finish(); +} + +fn bench_decision(c: &mut Criterion) { + let mut group = c.benchmark_group("decision"); + + // Same language, structurally different automata: the `self == other` + // shortcut cannot fire, forcing the full check in both directions. 
+ let left_form = automaton("(a|b)*abc(a|b)*"); + let right_form = automaton("(a*b*)*abc(b*a*)*"); + assert_ne!(left_form, right_form); + assert!(left_form.equivalent(&right_form).unwrap()); + group.bench_function("equivalent", |b| { + b.iter(|| { + black_box(&left_form) + .equivalent(black_box(&right_form)) + .unwrap() + }) + }); + + let smaller = automaton("abc(de|fg){1,3}"); + let bigger = automaton("abc.*"); + group.bench_function("subset", |b| { + b.iter(|| black_box(&smaller).subset(black_box(&bigger)).unwrap()) + }); + + let left = automaton(".*abc.*"); + let right = automaton(".*cba.*"); + group.bench_function("has_intersection", |b| { + b.iter(|| { + black_box(&left) + .has_intersection(black_box(&right)) + .unwrap() + }) + }); + + group.finish(); +} + +fn bench_analyze(c: &mut Criterion) { + let mut group = c.benchmark_group("analyze"); + + let finite = dfa("[a-z]{1,6}"); + group.bench_function("get_length/finite", |b| { + b.iter(|| black_box(&finite).get_length()) + }); + group.bench_function("get_cardinality/finite", |b| { + b.iter(|| black_box(&finite).get_cardinality().unwrap()) + }); + + let infinite = automaton(LARGE.1); + group.bench_function("get_length/large", |b| { + b.iter(|| black_box(&infinite).get_length()) + }); + + group.finish(); +} + +fn bench_to_regex(c: &mut Criterion) { + let mut group = c.benchmark_group("to_regex"); + + let nfa = automaton(MEDIUM.1); + group.bench_function("nfa", |b| b.iter(|| black_box(&nfa).to_regex())); + + let medium_dfa = dfa(MEDIUM.1); + group.bench_function("dfa", |b| b.iter(|| black_box(&medium_dfa).to_regex())); + + group.finish(); +} + +fn bench_generate_strings(c: &mut Criterion) { + let mut group = c.benchmark_group("generate_strings"); + + let automaton = dfa("[a-z]{1,4}"); + group.bench_function("first_2000", |b| { + b.iter(|| black_box(&automaton).generate_strings(2000, 0).unwrap()) + }); + + // The offset fast-skips whole subtrees by counting paths. 
+ let deep = dfa("[a-z]{1,10}"); + group.bench_function("deep_offset", |b| { + b.iter(|| black_box(&deep).generate_strings(100, 1_000_000).unwrap()) + }); + + group.finish(); +} + +fn bench_construction(c: &mut Criterion) { + let mut group = c.benchmark_group("construction"); + + // A 64-transition chain over a growing alphabet: every few transitions + // extend the spanning set and re-project the existing conditions. + group.bench_function("add_transition_from_range/chain_64", |b| { + b.iter(|| { + let mut automaton = FastAutomaton::new_empty(); + let mut previous = 0; + for i in 0..64u8 { + let next = automaton.new_state(); + let character = Char::new(char::from(b'a' + (i % 26))); + let range = CharRange::new_from_range(character..=character); + automaton + .add_transition_from_range(previous, next, &range) + .unwrap(); + previous = next; + } + automaton.accept(previous); + automaton + }) + }); + + group.finish(); +} + +fn bench_end_to_end(c: &mut Criterion) { + let mut group = c.benchmark_group("end_to_end"); + + // The front-page scenario: parse two patterns, intersect, print back. + group.bench_function("intersection_to_pattern", |b| { + b.iter(|| { + let a = Term::from_pattern(black_box("(ab|xy){2}")).unwrap(); + let b_term = Term::from_pattern(black_box(".*xy")).unwrap(); + a.intersection(&[b_term]).unwrap().to_pattern() + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_parse, + bench_to_automaton, + bench_determinize, + bench_minimize, + bench_set_operations, + bench_decision, + bench_analyze, + bench_to_regex, + bench_generate_strings, + bench_construction, + bench_end_to_end, +); +criterion_main!(benches); diff --git a/examples/generate.rs b/examples/generate.rs new file mode 100644 index 0000000..9c9fa6e --- /dev/null +++ b/examples/generate.rs @@ -0,0 +1,25 @@ +//! Generate strings matching a regex pattern. +//! +//! ```text +//! cargo run --example generate -- "[a-z]{2}[0-9]" 20 +//! 
``` + +use regexsolver::Term; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut args = std::env::args().skip(1); + let Some(pattern) = args.next() else { + eprintln!("Usage: cargo run --example generate -- <pattern> [count]"); + std::process::exit(2); + }; + let count: usize = args.next().map(|c| c.parse()).transpose()?.unwrap_or(10); + + // Minimize once: pagination over the same minimized term yields + // disjoint, consistent pages (see `Term::generate_strings`). + let term = Term::from_pattern(&pattern)?.minimize()?; + for string in term.generate_strings(count, 0)? { + println!("{string:?}"); + } + + Ok(()) +} diff --git a/examples/relate.rs b/examples/relate.rs new file mode 100644 index 0000000..25096de --- /dev/null +++ b/examples/relate.rs @@ -0,0 +1,51 @@ +//! Explore how two regex patterns relate as languages. +//! +//! ```text +//! cargo run --example relate -- "(abc|de){2}" ".*xy" +//! ``` + +use regexsolver::Term; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut args = std::env::args().skip(1); + let (Some(a), Some(b)) = (args.next(), args.next()) else { + eprintln!("Usage: cargo run --example relate -- <a> <b>"); + std::process::exit(2); + }; + + let a_term = Term::from_pattern(&a)?; + let b_term = Term::from_pattern(&b)?; + + println!("a = {a}"); + println!("b = {b}"); + println!(); + + if a_term.equivalent(&b_term)? { + println!("a and b match exactly the same strings."); + return Ok(()); + } + println!("equivalent: no"); + println!("a subset of b: {}", a_term.subset(&b_term)?); + println!("b subset of a: {}", b_term.subset(&a_term)?); + println!(); + + let intersection = a_term.intersection(std::slice::from_ref(&b_term))?; + if intersection.is_empty()? { + println!("a ∩ b = [] (no string matches both)"); + } else { + println!("a ∩ b = {}", intersection.to_pattern()); + println!(" e.g. {:?}", intersection.generate_strings(5, 0)?); + } + + let pattern_or_empty = |term: Term| -> Result<String, Box<dyn std::error::Error>> { + Ok(if term.is_empty()?
{ + "[]".to_string() + } else { + term.to_pattern() + }) + }; + println!("a - b = {}", pattern_or_empty(a_term.difference(&b_term)?)?); + println!("b - a = {}", pattern_or_empty(b_term.difference(&a_term)?)?); + + Ok(()) +} diff --git a/src/error/mod.rs b/src/error/mod.rs index acaa9ef..d543728 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -2,6 +2,7 @@ use std::fmt::{self}; /// An error thrown by the engine. #[derive(Debug, PartialEq, Eq)] +#[non_exhaustive] pub enum EngineError { /// Invalid character used in regex. InvalidCharacterInRegex, @@ -13,6 +14,13 @@ pub enum EngineError { RegexSyntaxError(String), /// The provided range can not be built from the spanning set. ConditionInvalidRange, + /// The repetition bounds are invalid: the maximum is below the minimum. + InvalidRepetitionBounds(u32, u32), + /// The condition does not match the spanning set it is evaluated against. + IncompatibleSpanningSet, + /// The operation requires a deterministic automaton, and implicit + /// determinization is disabled by the execution profile. + DeterministicAutomatonRequired, } impl fmt::Display for EngineError { @@ -28,6 +36,18 @@ impl fmt::Display for EngineError { f, "The provided range can not be built from the spanning set." ), + EngineError::InvalidRepetitionBounds(min, max) => write!( + f, + "The repetition maximum ({max}) is below its minimum ({min})." + ), + EngineError::IncompatibleSpanningSet => write!( + f, + "The condition does not match the spanning set it is evaluated against." + ), + EngineError::DeterministicAutomatonRequired => write!( + f, + "The operation requires a deterministic automaton, and implicit determinization is disabled by the execution profile." 
+ ), } } } diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 58ec9cf..2850341 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -37,7 +37,50 @@ use crate::error::EngineError; /// .build(); /// /// execution_profile.run(|| { -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000, false).unwrap_err()); +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000).unwrap_err()); +/// }); +/// ``` +/// +/// ## Disabling implicit determinization +/// +/// [`FastAutomaton`](crate::fast_automaton::FastAutomaton) operations that +/// require a deterministic automaton (`minimize`, `complement`, +/// `difference`, `equivalent`, `subset`, `get_cardinality`, ...) +/// determinize a non-deterministic input on their own by default. Since +/// subset construction can blow up exponentially, this can be disabled; +/// those operations then fail fast and determinization only happens through +/// an explicit `determinize()` call. [`Term`](crate::Term) methods are not +/// affected: that layer manages the underlying representation itself, so +/// its determinizations count as explicit. +/// +/// ``` +/// use regexsolver::CharRange; +/// use regexsolver::fast_automaton::FastAutomaton; +/// use regexsolver::execution_profile::ExecutionProfileBuilder; +/// use regexsolver::error::EngineError; +/// +/// // Two overlapping transitions from the start state: non-deterministic. +/// let mut nfa = FastAutomaton::new_empty(); +/// let s1 = nfa.new_state(); +/// let s2 = nfa.new_state(); +/// nfa.add_transition_from_range(0, s1, &CharRange::total()).unwrap(); +/// nfa.add_transition_from_range(0, s2, &CharRange::total()).unwrap(); +/// nfa.accept(s1); +/// +/// let execution_profile = ExecutionProfileBuilder::new() +/// .implicit_determinization(false) +/// .build(); +/// +/// execution_profile.run(|| { +/// // `minimize` requires a DFA and refuses to determinize on its own. 
+/// assert_eq!( +/// EngineError::DeterministicAutomatonRequired, +/// nfa.clone().minimize().unwrap_err() +/// ); +/// +/// // Determinizing explicitly is always allowed. +/// let mut dfa = nfa.determinize().unwrap().into_owned(); +/// assert!(dfa.minimize().is_ok()); /// }); /// ``` #[derive(Clone, Debug)] @@ -48,12 +91,28 @@ pub struct ExecutionProfile { execution_timeout: Option, /// The time after when a [`EngineError::OperationTimeOutError`] should be thrown. execution_deadline: Option, + /// Whether [`FastAutomaton`](crate::fast_automaton::FastAutomaton) + /// operations that require a deterministic automaton may determinize a + /// non-deterministic input on their own (the default). When `false`, + /// those operations return + /// [`EngineError::DeterministicAutomatonRequired`] instead, so that the + /// potentially exponential subset construction only ever happens through + /// an explicit `determinize()` call. [`Term`](crate::Term) methods + /// always work: that layer manages the representation itself. + implicit_determinization: bool, + /// Whether every determinization is followed by a minimization of the + /// resulting automaton. Off by default: minimization costs an extra + /// Hopcroft pass, but keeps downstream operations working on the + /// smallest possible automata. + minimize_after_determinization: bool, } impl PartialEq for ExecutionProfile { fn eq(&self, other: &ExecutionProfile) -> bool { self.max_number_of_states == other.max_number_of_states && self.execution_timeout == other.execution_timeout + && self.implicit_determinization == other.implicit_determinization + && self.minimize_after_determinization == other.minimize_after_determinization } } @@ -97,6 +156,19 @@ impl ExecutionProfile { Ok(()) } + /// Assert that implicit determinization is allowed. + /// + /// Return empty if it is. + /// + /// Return [`EngineError::DeterministicAutomatonRequired`] otherwise. 
+ pub(crate) fn assert_implicit_determinization_allowed(&self) -> Result<(), EngineError> { + if self.implicit_determinization { + Ok(()) + } else { + Err(EngineError::DeterministicAutomatonRequired) + } + } + pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self @@ -107,6 +179,22 @@ impl ExecutionProfile { self } + pub fn with_implicit_determinization(mut self, allowed: bool) -> Self { + self.implicit_determinization = allowed; + self + } + + pub fn with_minimize_after_determinization(mut self, enabled: bool) -> Self { + self.minimize_after_determinization = enabled; + self + } + + /// Whether every determinization should be followed by a minimization of + /// the result. + pub(crate) fn should_minimize_after_determinization(&self) -> bool { + self.minimize_after_determinization + } + pub fn set(&self) -> &Self { self } @@ -149,6 +237,12 @@ pub struct ExecutionProfileBuilder { max_number_of_states: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. execution_timeout: Option, + /// Whether operations requiring a deterministic automaton may determinize + /// a non-deterministic input on their own. Defaults to `true`. + implicit_determinization: bool, + /// Whether every determinization is followed by a minimization of the + /// result. Defaults to `false`. 
+ minimize_after_determinization: bool, } impl Default for ExecutionProfileBuilder { fn default() -> Self { @@ -161,6 +255,8 @@ impl ExecutionProfileBuilder { Self { max_number_of_states: None, execution_timeout: None, + implicit_determinization: true, + minimize_after_determinization: false, } } @@ -174,11 +270,35 @@ impl ExecutionProfileBuilder { self } + /// Whether [`FastAutomaton`](crate::fast_automaton::FastAutomaton) + /// operations that require a deterministic automaton may determinize a + /// non-deterministic input on their own (the default). When set to + /// `false`, those operations return + /// [`EngineError::DeterministicAutomatonRequired`] instead; explicit + /// `determinize()` calls — and [`Term`](crate::Term) methods, which + /// manage the representation themselves — are always allowed. + pub fn implicit_determinization(mut self, allowed: bool) -> Self { + self.implicit_determinization = allowed; + self + } + + /// Whether every determinization is followed by a minimization of the + /// resulting automaton. Off by default: minimization costs an extra + /// Hopcroft pass, but keeps downstream operations working on the + /// smallest possible automata. Inputs that are already deterministic are + /// not touched. 
+ pub fn minimize_after_determinization(mut self, enabled: bool) -> Self { + self.minimize_after_determinization = enabled; + self + } + pub fn build(self) -> ExecutionProfile { ExecutionProfile { max_number_of_states: self.max_number_of_states, execution_timeout: self.execution_timeout, execution_deadline: None, + implicit_determinization: self.implicit_determinization, + minimize_after_determinization: self.minimize_after_determinization, } } } @@ -189,6 +309,8 @@ impl ThreadLocalParams { static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; + static IMPLICIT_DETERMINIZATION: RefCell = const { RefCell::new(true) }; + static MINIMIZE_AFTER_DETERMINIZATION: RefCell = const { RefCell::new(false) }; } /// Store on the current thread [`ExecutionProfile`]. @@ -204,6 +326,14 @@ impl ThreadLocalParams { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { *cell.borrow_mut() = profile.execution_timeout; }); + + ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| { + *cell.borrow_mut() = profile.implicit_determinization; + }); + + ThreadLocalParams::MINIMIZE_AFTER_DETERMINIZATION.with(|cell| { + *cell.borrow_mut() = profile.minimize_after_determinization; + }); } fn get_max_number_of_states() -> Option { @@ -218,12 +348,22 @@ impl ThreadLocalParams { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } + fn get_implicit_determinization() -> bool { + ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| *cell.borrow()) + } + + fn get_minimize_after_determinization() -> bool { + ThreadLocalParams::MINIMIZE_AFTER_DETERMINIZATION.with(|cell| *cell.borrow()) + } + /// Return the [`ExecutionProfile`] stored on the current thread. 
fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), + implicit_determinization: Self::get_implicit_determinization(), + minimize_after_determinization: Self::get_minimize_after_determinization(), } } } @@ -277,6 +417,192 @@ mod tests { Ok(()) } + /// A two-way acyclic automaton with overlapping transitions: the + /// smallest shape that is non-deterministic and reaches the + /// determinization paths of every DFA-requiring operation. + fn nondeterministic_automaton() -> crate::fast_automaton::FastAutomaton { + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + assert!(!a.is_deterministic()); + a + } + + #[test] + fn test_implicit_determinization_disabled() { + let nfa = nondeterministic_automaton(); + let dfa = nfa.determinize().unwrap().into_owned(); + + ExecutionProfileBuilder::new() + .implicit_determinization(false) + .build() + .run(|| { + let err = EngineError::DeterministicAutomatonRequired; + + // Every DFA-requiring operation refuses to determinize a + // non-deterministic input on its own... + assert_eq!(nfa.clone().minimize().unwrap_err(), err); + assert_eq!(nfa.clone().complement().unwrap_err(), err); + assert_eq!(dfa.difference(&nfa).unwrap_err(), err); + assert_eq!(nfa.equivalent(&dfa).unwrap_err(), err); + assert_eq!(dfa.subset(&nfa).unwrap_err(), err); + assert_eq!(nfa.get_cardinality().unwrap_err(), err); + + // ...but operations that work on NFAs directly are unaffected + // (difference only determinizes the subtrahend)... 
+ assert!(nfa.difference(&dfa).is_ok()); + + // ...deterministic inputs keep working... + assert!(dfa.clone().minimize().is_ok()); + assert!(dfa.clone().complement().is_ok()); + assert!(dfa.get_cardinality().is_ok()); + assert!(dfa.equivalent(&dfa).is_ok()); + + // ...and explicit determinization is always allowed. + assert!(nfa.determinize().is_ok()); + }); + } + + #[test] + fn test_minimize_after_determinization() { + use crate::CharRange; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + use crate::fast_automaton::spanning_set::SpanningSet; + use regex_charclass::char::Char; + + // NFA over base 'a' + rest whose subset construction yields two + // distinct but language-equivalent accept states ({f1, f3} on 'a', + // {f2} on [^a]) — 3 determinized states, 2 after minimization. + let range_a = CharRange::new_from_range(Char::new('a')..=Char::new('a')); + let ss = SpanningSet::compute_spanning_set(std::slice::from_ref(&range_a)); + let mut nfa = FastAutomaton::new_empty(); + nfa.apply_new_spanning_set(&ss).unwrap(); + let f1 = nfa.new_state(); + let f2 = nfa.new_state(); + let f3 = nfa.new_state(); + let cond_a = Condition::from_range(&range_a, &ss).unwrap(); + let cond_rest = cond_a.complement(); + nfa.add_transition(0, f1, &cond_a); + nfa.add_transition(0, f3, &cond_a); // overlaps with f1: non-deterministic + nfa.add_transition(0, f2, &cond_rest); + nfa.accept(f1); + nfa.accept(f2); + nfa.accept(f3); + assert!(!nfa.is_deterministic()); + + // Default: determinize alone does not minimize. 
+ let plain = nfa.determinize().unwrap().into_owned(); + assert!(plain.is_deterministic()); + assert!(!plain.is_minimal()); + assert_eq!(plain.get_number_of_states(), 3); + + ExecutionProfileBuilder::new() + .minimize_after_determinization(true) + .build() + .run(|| { + let minimized = nfa.determinize().unwrap().into_owned(); + assert!(minimized.is_deterministic()); + assert!(minimized.is_minimal()); + assert_eq!(minimized.get_number_of_states(), 2); + assert!(minimized.equivalent(&plain).unwrap()); + + // Already-deterministic inputs are returned untouched: the + // flag only applies when a determinization actually happens. + let same = plain.determinize().unwrap(); + assert!(!same.is_minimal()); + }); + } + + /// The `implicit_determinization` knob targets direct `FastAutomaton` + /// usage; `Term` manages the underlying representation itself, so its + /// whole public API must keep working when the knob is off. + #[test] + fn test_term_api_works_without_implicit_determinization() { + let term = Term::from_automaton(nondeterministic_automaton()); + let other = Term::from_pattern("a*").unwrap(); + + ExecutionProfileBuilder::new() + .implicit_determinization(false) + .build() + .run(|| { + // Methods that need a DFA internally determinize on Term's + // behalf (an explicit choice of the Term layer)... + assert!(term.difference(&other).is_ok()); + assert!(other.difference(&term).is_ok()); + assert!(term.complement().is_ok()); + assert!(term.equivalent(&other).is_ok()); + assert!(term.subset(&other).is_ok()); + assert!(other.subset(&term).is_ok()); + assert!(term.is_total().is_ok()); + assert!(term.get_cardinality().is_ok()); + assert!(term.minimize().is_ok()); + assert!(term.generate_strings(5, 0).is_ok()); + + // ...and the rest of the API never needed one. 
+ assert!(term.concat(std::slice::from_ref(&other)).is_ok()); + assert!(term.union(std::slice::from_ref(&other)).is_ok()); + assert!(term.intersection(std::slice::from_ref(&other)).is_ok()); + assert!(term.repeat(0, Some(2)).is_ok()); + assert!(term.is_empty().is_ok()); + assert!(term.is_empty_string().is_ok()); + let _ = term.get_length(); + let _ = term.to_regex(); + let _ = term.to_pattern(); + assert!(term.to_automaton().is_ok()); + + // The override is scoped: direct FastAutomaton usage stays + // gated afterwards. + assert_eq!( + nondeterministic_automaton().minimize().unwrap_err(), + EngineError::DeterministicAutomatonRequired + ); + }); + } + + /// The two determinization knobs compose: implicit determinization + /// stays gated, while an explicit `determinize()` both works and + /// minimizes its result. + #[test] + fn test_minimize_after_determinization_with_implicit_disabled() { + let nfa = nondeterministic_automaton(); + + ExecutionProfileBuilder::new() + .implicit_determinization(false) + .minimize_after_determinization(true) + .build() + .run(|| { + assert_eq!( + nfa.clone().minimize().unwrap_err(), + EngineError::DeterministicAutomatonRequired + ); + + let dfa = nfa.determinize().unwrap(); + assert!(dfa.is_deterministic()); + assert!(dfa.is_minimal()); + assert!(dfa.equivalent(&nfa.determinize().unwrap()).unwrap()); + }); + } + + #[test] + fn test_implicit_determinization_default() { + let nfa = nondeterministic_automaton(); + + // Without the profile knob the historical behavior is unchanged. 
+ assert!(nfa.clone().minimize().is_ok()); + assert!(nfa.clone().complement().is_ok()); + assert!(nfa.get_cardinality().is_ok()); + assert!(nfa.equivalent(&nfa.clone()).is_ok()); + } + #[test] fn test_execution_timeout_generate_strings() -> Result<(), String> { let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); @@ -289,7 +615,7 @@ mod tests { .run(|| { assert_eq!( EngineError::OperationTimeOutError, - term.generate_strings(100, 1_000_000, false).unwrap_err() + term.generate_strings(100, 1_000_000).unwrap_err() ); let run_duration = Instant::now().duration_since(start_time).as_millis(); diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 7c36bb3..20dd6c4 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -4,26 +4,49 @@ use super::*; impl FastAutomaton { /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). - pub fn get_cardinality(&self) -> Cardinality { + /// + /// Works on non-deterministic automata too: acyclic NFAs are determinized + /// internally (the only fallible step, subject to the + /// [`crate::execution_profile::ExecutionProfile`] budget — and rejected + /// with [`EngineError::DeterministicAutomatonRequired`] when the profile + /// disables implicit determinization). + /// + /// As in [`get_length`](Self::get_length), only cycles **on accepting + /// paths** make the count infinite: cycles among dead or unreachable + /// states don't add a single matched string. + pub fn get_cardinality(&self) -> Result, EngineError> { if self.is_empty() { - return Cardinality::Integer(0); + return Ok(Cardinality::Integer(0)); } else if self.is_total() { - return Cardinality::Infinite; + return Ok(Cardinality::Infinite); } - // A cycle means infinitely many strings. 
`topological_sorted_states` - // returns `None` exactly when the transition graph is cyclic and needs - // no determinism, so this also covers cyclic non-deterministic inputs. - let topologically_sorted_states = match self.topological_sorted_states() { - None => return Cardinality::Infinite, + // Only states on an accepting path (reachable from the start AND + // able to reach an accept) contribute strings; everything else is + // excluded from both the cycle check and the count. + let live = self.get_live_states(); + let relevant: IntSet = self + .forward_reachable_states() + .intersection(&live) + .copied() + .collect(); + + // A cycle among relevant states means infinitely many strings. + // `topological_sorted_states` returns `None` exactly when that + // subgraph is cyclic and needs no determinism, so this also covers + // cyclic non-deterministic inputs. + let topologically_sorted_states = match self.topological_sorted_states(&relevant) { + None => return Ok(Cardinality::Infinite), Some(states) => states, }; - // The finite count below assumes deterministic (single-path) transitions. - assert!( - self.is_deterministic(), - "The automaton should be deterministic." - ); + // The finite count below assumes deterministic (single-path) + // transitions. Determinizing an automaton with a finite language + // yields one whose relevant subgraph is acyclic too, so the + // recursion takes the deterministic path on the second call. 
+ if !self.is_deterministic() { + return self.determinize_implicit()?.get_cardinality(); + } let len = self.transitions.len(); let mut distances: IntMap = @@ -34,6 +57,9 @@ impl FastAutomaton { let current_distance = *distances.entry(state).or_insert(0); if let Some(to_states) = self.transitions.get(state) { for (to_state, condition) in to_states { + if !relevant.contains(to_state) { + continue; + } if let Some(distance) = current_distance.checked_mul( condition .get_cardinality(&self.spanning_set) @@ -45,7 +71,7 @@ impl FastAutomaton { continue; } - return Cardinality::BigInteger; + return Ok(Cardinality::BigInteger); } } } @@ -57,22 +83,33 @@ impl FastAutomaton { temp_cardinality = add; continue; } - return Cardinality::BigInteger; + return Ok(Cardinality::BigInteger); } } - Cardinality::Integer(temp_cardinality) + Ok(Cardinality::Integer(temp_cardinality)) } - fn topological_sorted_states(&self) -> Option> { - let len = self.get_number_of_states(); + /// Kahn's algorithm restricted to the `relevant` subgraph (transitions + /// with empty conditions can't be taken and are ignored). Returns `None` + /// when that subgraph contains a cycle. 
+ fn topological_sorted_states(&self, relevant: &IntSet) -> Option> { + let len = relevant.len(); let mut in_degree: IntMap = IntMap::with_capacity_and_hasher(len, BuildHasherDefault::default()); let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for &from_state in &self.states_vec() { + let successors = |from_state: State| { + self.transitions_from(from_state) + .filter(|(condition, to_state)| { + !condition.is_empty() && relevant.contains(to_state) + }) + .map(|(_, to_state)| *to_state) + }; + + for &from_state in relevant { in_degree.entry(from_state).or_insert(0); - for to_state in self.direct_states(from_state) { + for to_state in successors(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -85,7 +122,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.direct_states(from_state) { + for to_state in successors(from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { @@ -101,3 +138,53 @@ impl FastAutomaton { } } } + +#[cfg(test)] +mod tests { + use crate::cardinality::Cardinality; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + // Regression (found by the brute-force enumeration proptest): the cycle + // check used to run over ALL states, so a cycle among dead states made + // the cardinality of a finite language Infinite. Only cycles on + // accepting paths count. + #[test] + fn get_cardinality_ignores_dead_cycles() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.accept(0); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.add_transition(s2, s1, &cond); + // s1, s2 can't reach an accept → language is {""} only. 
+ + assert_eq!(a.get_cardinality().unwrap(), Cardinality::Integer(1)); + } + + // Regression: `get_cardinality` used to `assert!` determinism and panic + // on acyclic NFAs (the only nondeterministic inputs that reach the finite + // count; cyclic ones return Infinite earlier). It now determinizes + // internally. + #[test] + fn get_cardinality_determinizes_acyclic_nfas() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + // Two overlapping transitions from the start: nondeterministic, but + // both lead to accepting states after exactly one character. + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + assert!(!a.is_deterministic()); + + let cardinality = a.get_cardinality().unwrap(); + let expected = a.determinize().unwrap().get_cardinality().unwrap(); + assert_eq!(cardinality, expected); + assert!(matches!(cardinality, Cardinality::Integer(n) if n > 0)); + } +} diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 070e206..857131b 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -4,6 +4,10 @@ use super::*; impl FastAutomaton { /// Returns `true` if both automata accept the same language. + /// + /// Non-deterministic operands are determinized internally — unless the + /// execution profile disables implicit determinization, in which case + /// [`EngineError::DeterministicAutomatonRequired`] is returned. 
pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); @@ -11,14 +15,14 @@ impl FastAutomaton { return Ok(true); } - let mut other_complement = other.determinize()?.into_owned(); + let mut other_complement = other.determinize_implicit()?.into_owned(); other_complement.complement()?; if self.has_intersection(&other_complement)? { return Ok(false); } - let mut self_complement = self.determinize()?.into_owned(); + let mut self_complement = self.determinize_implicit()?.into_owned(); self_complement.complement()?; Ok(!self_complement.has_intersection(other)?) @@ -77,9 +81,6 @@ mod tests { let automaton_2 = regex_2.to_automaton().unwrap(); assert!(automaton_2.equivalent(&automaton_2).unwrap()); - assert_eq!( - expected, - automaton_1.equivalent(&automaton_2).unwrap() - ); + assert_eq!(expected, automaton_1.equivalent(&automaton_2).unwrap()); } } diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 5b1fb7c..00997ac 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -7,80 +7,98 @@ impl FastAutomaton { /// accepting path. Cycles among dead states (states that can't reach any /// accept) don't extend the language and therefore don't make the max /// infinite. + /// + /// Runs in O(V + E): the minimum is a BFS distance; the maximum is a + /// longest path over the subgraph of states lying on accepting paths, + /// which is unbounded exactly when that subgraph has a cycle (any such + /// cycle can be pumped). + #[must_use] pub fn get_length(&self) -> (Option, Option) { - if self.is_empty() { - return (None, None); - } else if self.is_total() { - return (Some(0), None); - } - - // States that lie on some accepting path. Walking only these prunes - // dead branches whose cycles cannot extend the language. + // States that can reach an accept state. 
If the start state can't, + // the language is empty. let live = self.get_live_states(); if !live.contains(&self.start_state) { return (None, None); } + // BFS from the start over live states only — every state on an + // accepting path is live, so this loses no accepting path. BFS visits + // in non-decreasing depth, hence the first accept hit is the minimum. + // The visited set (reachable ∩ live) is exactly the subgraph relevant + // for the maximum. let mut min = None; - let mut is_infinite = false; - + let mut visited = IntSet::default(); let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); - worklist.push_back((self.start_state, 0, IntSet::default())); - - while let Some(element) = worklist.pop_front() { - let state = element.0; - let length = element.1; - let mut seen = element.2; - if min.is_some() && length > min.unwrap() { - continue; - } - if self.accept_states.contains(&state) && (min.is_none() || length < min.unwrap()) { + visited.insert(self.start_state); + worklist.push_back((self.start_state, 0u32)); + while let Some((state, length)) = worklist.pop_front() { + if min.is_none() && self.accept_states.contains(&state) { min = Some(length); } - seen.insert(state); - - for to_state in self.direct_states(state) { - if !live.contains(&to_state) { + for (condition, to_state) in self.transitions_from(state) { + if condition.is_empty() || !live.contains(to_state) { continue; } - if to_state == state || seen.contains(&to_state) { - is_infinite = true; - continue; + if visited.insert(*to_state) { + worklist.push_back((*to_state, length + 1)); } - worklist.push_back((to_state, length + 1, seen.clone())); } } - if is_infinite || min.is_none() { - return (min, None); + // Longest path via Kahn's algorithm on the visited subgraph. In the + // acyclic case the topological order covers all visited states and + // every state's longest distance is final when it is dequeued. 
+ let mut in_degree: IntMap = IntMap::default(); + for &from in &visited { + in_degree.entry(from).or_insert(0); + for (condition, to_state) in self.transitions_from(from) { + if condition.is_empty() || !visited.contains(to_state) { + continue; + } + *in_degree.entry(*to_state).or_insert(0) += 1; + } } - let mut max = None; - - worklist.clear(); - worklist.push_back((self.start_state, 0, IntSet::default())); + let mut queue: VecDeque = in_degree + .iter() + .filter(|&(_, &degree)| degree == 0) + .map(|(&state, _)| state) + .collect(); - while let Some(element) = worklist.pop_back() { - let state = element.0; - let length = element.1; - let mut seen = element.2; - if self.accept_states.contains(&state) && (max.is_none() || length > max.unwrap()) { - max = Some(length); + let mut longest: IntMap = IntMap::default(); + longest.insert(self.start_state, 0); + let mut max = None; + let mut processed = 0usize; + while let Some(from) = queue.pop_front() { + processed += 1; + let length = *longest.get(&from).unwrap_or(&0); + if self.accept_states.contains(&from) { + max = Some(max.map_or(length, |m: u32| m.max(length))); } - seen.insert(state); - - for to_state in self.direct_states(state) { - if !live.contains(&to_state) { + for (condition, to_state) in self.transitions_from(from) { + if condition.is_empty() || !visited.contains(to_state) { continue; } - if to_state == state || seen.contains(&to_state) { - max = None; - break; + longest + .entry(*to_state) + .and_modify(|l| *l = (*l).max(length + 1)) + .or_insert(length + 1); + let degree = in_degree + .get_mut(to_state) + .expect("every visited target was counted above"); + *degree -= 1; + if *degree == 0 { + queue.push_back(*to_state); } - worklist.push_back((to_state, length + 1, seen.clone())); } } + if processed != visited.len() { + // A cycle on an accepting path: matched strings can be pumped + // arbitrarily, the maximum is unbounded.
+ return (min, None); + } + (min, max) } } @@ -115,4 +133,54 @@ mod tests { "max length of {{\"\"}} is 0; got {max:?} (cycle is dead, shouldn't extend the language)" ); } -} \ No newline at end of file + + #[test] + fn get_length_finite_and_infinite() { + // Chain 0 -> 1 -> 2, accepts {0, 2}: min 0, max 2. + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.accept(0); + a.accept(s2); + assert_eq!(a.get_length(), (Some(0), Some(2))); + + // Live cycle 0 <-> 1, accept {1}: min 1, max unbounded. + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let cond = Condition::total(a.get_spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(s1, 0, &cond); + a.accept(s1); + assert_eq!(a.get_length(), (Some(1), None)); + } + + // Regression: `get_length` used to enumerate paths with a cloned `seen` + // set per branch — exponential time and memory on branching DAGs. A chain + // of diamonds has 2^k paths; the linear algorithm must handle it + // instantly. 
+ #[test] + fn get_length_linear_on_branching_dag() { + const DIAMONDS: usize = 24; + + let mut a = FastAutomaton::new_empty(); + let cond = Condition::total(a.get_spanning_set()); + let mut current = 0; + for _ in 0..DIAMONDS { + let upper = a.new_state(); + let lower = a.new_state(); + let next = a.new_state(); + a.add_transition(current, upper, &cond); + a.add_transition(current, lower, &cond); + a.add_transition(upper, next, &cond); + a.add_transition(lower, next, &cond); + current = next; + } + a.accept(current); + + let expected = 2 * DIAMONDS as u32; + assert_eq!(a.get_length(), (Some(expected), Some(expected))); + } +} diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 9dd124c..e3eef3a 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -4,6 +4,10 @@ use super::*; impl FastAutomaton { /// Returns `true` if all strings accepted by `self` are also accepted by `other`. + /// + /// A non-deterministic `other` is determinized internally — unless the + /// execution profile disables implicit determinization, in which case + /// [`EngineError::DeterministicAutomatonRequired`] is returned. pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); @@ -13,10 +17,10 @@ impl FastAutomaton { // self ⊆ other iff Σ* ⊆ other iff other = Σ*. We already failed // the cheap `other.is_total()` check above; that check is sound // but conservative on NFAs, so retry on the determinized form. - return Ok(other.determinize()?.is_total()); + return Ok(other.determinize_implicit()?.is_total()); } - let mut other = other.determinize()?.into_owned(); + let mut other = other.determinize_implicit()?.into_owned(); other.complement()?; Ok(!self.has_intersection(&other)?) 
@@ -90,13 +94,7 @@ mod tests { let automaton_2 = regex_2.to_automaton().unwrap(); assert!(automaton_2.subset(&automaton_2).unwrap()); - assert_eq!( - expected_1_2, - automaton_1.subset(&automaton_2).unwrap() - ); - assert_eq!( - expected_2_1, - automaton_2.subset(&automaton_1).unwrap() - ); + assert_eq!(expected_1_2, automaton_1.subset(&automaton_2).unwrap()); + assert_eq!(expected_2_1, automaton_2.subset(&automaton_1).unwrap()); } } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 615c146..488bef1 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -89,6 +89,10 @@ impl FastAutomaton { /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. /// + /// If you don't want to deal with conditions and spanning sets, use + /// [`add_transition_from_range`](Self::add_transition_from_range), which + /// handles the bookkeeping for you. + /// /// This method accepts a [`Condition`] rather than a raw character set. To build a [`Condition`], call: /// ```rust /// # use regexsolver::CharRange; @@ -163,6 +167,69 @@ impl FastAutomaton { }; } + /// Adds a transition labeled with the given character range, taking care + /// of the spanning-set bookkeeping. + /// + /// This is the convenient counterpart to + /// [`add_transition`](Self::add_transition): the range is converted to a + /// [`Condition`] automatically, and when it is not exactly expressible + /// in the automaton's current spanning set, the spanning set is extended + /// and every existing condition is re-projected first. + /// + /// An empty range matches no character, so no transition is added. 
+ /// + /// # Example + /// + /// ``` + /// use regexsolver::CharRange; + /// use regexsolver::fast_automaton::FastAutomaton; + /// use regex_charclass::char::Char; + /// + /// let mut automaton = FastAutomaton::new_empty(); + /// let s1 = automaton.new_state(); + /// automaton.accept(s1); + /// + /// let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c')); + /// automaton.add_transition_from_range(0, s1, &a_to_c).unwrap(); + /// + /// assert!(automaton.is_match("b")); + /// assert!(!automaton.is_match("d")); + /// ``` + pub fn add_transition_from_range( + &mut self, + from_state: State, + to_state: State, + range: &CharRange, + ) -> Result<(), EngineError> { + if range.is_empty() { + return Ok(()); + } + + // Fast path: the range is exactly expressible in the current + // spanning set. `Condition::from_range` alone cannot tell us that — + // it silently drops partially-covered bases — so round-trip the + // condition to check exactness. + if let Ok(condition) = Condition::from_range(range, &self.spanning_set) + && condition.to_range(&self.spanning_set)? == *range + { + self.add_transition(from_state, to_state, &condition); + return Ok(()); + } + + // The range is not (fully) covered: extend the spanning set, + // re-project the existing conditions, then add. + let new_spanning_set = + self.spanning_set + .merge(&SpanningSet::compute_spanning_set(std::slice::from_ref( + range, + ))); + self.apply_new_spanning_set(&new_spanning_set)?; + + let condition = Condition::from_range(range, &self.spanning_set)?; + self.add_transition(from_state, to_state, &condition); + Ok(()) + } + /// Adds a transition, but refuses if it would turn a DFA into an NFA. /// /// On `Err(DeterminismLost)` the automaton is left untouched; on `Ok`, @@ -197,6 +264,12 @@ impl FastAutomaton { } /// Creates a new epsilon transition between the two states. + /// Adds an epsilon transition by eagerly folding `to_state`'s **current** + /// transitions (and acceptance) into `from_state`. 
+ /// + /// This is a snapshot: transitions added to `to_state` *afterwards* are + /// not propagated retroactively. When building automata incrementally, + /// add epsilon transitions last. pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; @@ -414,6 +487,113 @@ mod tests { use crate::fast_automaton::condition::Condition; use crate::regex::RegularExpression; + fn rng(a: char, b: char) -> crate::CharRange { + use regex_charclass::char::Char; + crate::CharRange::new_from_range(Char::new(a)..=Char::new(b)) + } + + #[test] + fn small_mutators_and_queries() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + a.add_transition_from_range(0, s1, &rng('a', 'a')).unwrap(); + a.accept(s1); + + assert!(a.is_accepted(s1)); + assert!(a.has_transition(0, s1)); + assert!(a.get_condition(0, s1).is_some()); + assert_eq!(a.in_degree(s1), 1); + assert_eq!(a.out_degree(0), 1); + assert!(a.is_match("a")); + + // try_add_transition: refuses determinism-breaking additions and + // leaves the automaton untouched on Err. + let condition_a = Condition::from_range(&rng('a', 'a'), a.get_spanning_set()).unwrap(); + assert!(a.is_deterministic()); + assert!(a.try_add_transition(0, s2, &condition_a).is_err()); + assert!(a.is_deterministic()); + assert!(!a.has_transition(0, s2)); + // ...but accepts disjoint conditions. + let condition_not_a = condition_a.complement(); + a.try_add_transition(0, s2, &condition_not_a).unwrap(); + assert!(a.is_deterministic()); + assert!(a.has_transition(0, s2)); + + // unaccept flips membership and the language. + a.unaccept(s1); + assert!(!a.is_accepted(s1)); + assert!(!a.is_match("a")); + a.accept(s1); + assert!(a.is_match("a")); + + // remove_transition removes the edge and updates queries. 
+ a.remove_transition(0, s1); + assert!(!a.has_transition(0, s1)); + assert!(a.get_condition(0, s1).is_none()); + assert_eq!(a.in_degree(s1), 0); + assert!(!a.is_match("a")); + } + + #[test] + fn add_transition_from_range_extends_the_spanning_set() { + let mut automaton = FastAutomaton::new_empty(); + let s1 = automaton.new_state(); + let s2 = automaton.new_state(); + automaton.accept(s2); + + // Both ranges extend the (initially empty) spanning set. + automaton + .add_transition_from_range(0, s1, &rng('a', 'c')) + .unwrap(); + automaton + .add_transition_from_range(s1, s2, &rng('x', 'z')) + .unwrap(); + + assert!(automaton.is_match("ax")); + assert!(automaton.is_match("cz")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("x")); + + // An exactly-covered range takes the fast path: same spanning set. + let before = automaton.get_spanning_set().clone(); + automaton + .add_transition_from_range(0, s1, &rng('x', 'z')) + .unwrap(); + assert_eq!(&before, automaton.get_spanning_set()); + assert!(automaton.is_match("zx")); + + // An empty range adds nothing. + automaton + .add_transition_from_range(0, s2, &crate::CharRange::empty()) + .unwrap(); + assert!(!automaton.is_match("a")); + } + + // Regression guard: `Condition::from_range` silently drops + // partially-covered bases, so a naive "convert, merge only on error" + // implementation would truncate [a-e] to the existing [a-c] base. The + // exactness round-trip must force a spanning-set refinement instead. + #[test] + fn add_transition_from_range_is_exact_on_partial_coverage() { + let mut automaton = FastAutomaton::new_empty(); + let s1 = automaton.new_state(); + automaton.accept(s1); + + automaton + .add_transition_from_range(0, s1, &rng('a', 'c')) + .unwrap(); + // Contains the whole [a-c] base but only part of the rest. 
+ automaton + .add_transition_from_range(0, s1, &rng('a', 'e')) + .unwrap(); + + for accepted in ["a", "b", "c", "d", "e"] { + assert!(automaton.is_match(accepted), "{accepted:?} must match"); + } + assert!(!automaton.is_match("f")); + } + // Regression: `remove_states` used to skip the `transitions_in` cleanup // that the single-state variant `remove_state` performs (drop entries // keyed by removed states; purge them from surviving predecessor sets). diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 9fabd11..43beb02 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -15,7 +15,12 @@ pub struct ConditionConverter<'a, 'b> { impl<'a, 'b> ConditionConverter<'a, 'b> { /// Build a converter to project [`Condition`] from `from_spanning_set` to `to_spanning_set`. /// - /// Currently this method does not check that the provided [`SpanningSet`] are actually convertible. + /// Two directions are legitimate: refinement (a merged spanning set + /// before a binary operation) and coarsening (a recomputed minimal + /// spanning set, where bases no transition uses fold into the rest). The + /// pair is therefore not validated here; instead [`convert`](Self::convert) + /// asserts in debug builds that each projection preserves the + /// condition's character range. pub fn new( from_spanning_set: &'a SpanningSet, to_spanning_set: &'b SpanningSet, @@ -54,9 +59,12 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { /// Project the given [`Condition`] from `from_spanning_set` to `to_spanning_set`. /// - /// If `from_spanning_set` is not convertible to `to_spanning_set` or if the given [`Condition`] is not based on `from_spanning_set`, - /// the resulting [`Condition`] will not have any relevance. + /// Returns [`EngineError::IncompatibleSpanningSet`] if the given + /// [`Condition`] was not built over `from_spanning_set`. 
pub fn convert(&self, condition: &Condition) -> Result { + if condition.0.len() != self.from_spanning_set.spanning_ranges_with_rest_len() { + return Err(EngineError::IncompatibleSpanningSet); + } let mut new_condition = Condition::empty(self.to_spanning_set); for (from_index, to_indexes) in self.equivalence_map.iter().enumerate() { if condition.0.get(from_index) && !to_indexes.is_empty() { @@ -66,6 +74,20 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { } } + // The one invariant every legitimate use (refining and coarsening + // alike) must uphold: the projection denotes the same character set. + // A violation means a condition referenced a base the target spanning + // set cannot express — a silent language corruption in release. + debug_assert_eq!( + condition + .to_range(self.from_spanning_set) + .expect("the length was checked above"), + new_condition + .to_range(self.to_spanning_set) + .expect("the condition was built over the target spanning set"), + "the projection changed the condition's character range" + ); + Ok(new_condition) } diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index c02c522..8caf9b6 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -80,8 +80,22 @@ impl FastBitVec { self.fix_last_block(); } + /// The binary operations combine blocks pairwise with `zip`, which would + /// silently truncate to the shorter operand if two bitvectors built over + /// different spanning sets were ever combined — producing a wrong + /// language instead of a loud failure. Catch that in debug builds (and + /// therefore in every test run). 
+ #[inline] + fn assert_same_len(&self, other: &Self) { + debug_assert_eq!( + self.n, other.n, + "conditions built over different spanning sets cannot be combined" + ); + } + #[inline] pub fn union(&mut self, other: &Self) { + self.assert_same_len(other); for (a, b) in self.bits.iter_mut().zip(&other.bits) { let w = *a | b; *a = w; @@ -90,6 +104,7 @@ impl FastBitVec { #[inline] pub fn intersection(&mut self, other: &Self) { + self.assert_same_len(other); for (a, b) in self.bits.iter_mut().zip(&other.bits) { let w = *a & b; *a = w; @@ -98,6 +113,7 @@ impl FastBitVec { #[inline] pub fn has_intersection(&self, other: &Self) -> bool { + self.assert_same_len(other); for (a, b) in self.bits.iter().zip(&other.bits) { if *a & b != 0 { return true; diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 3cbeea5..644a05d 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -69,6 +69,13 @@ impl Condition { } pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { + // A condition only carries meaning relative to the spanning set it + // was built from. Evaluating it against a differently-sized one used + // to panic (too short) or silently drop bits (too long). + if self.0.len() != spanning_set.spanning_ranges_with_rest_len() { + return Err(EngineError::IncompatibleSpanningSet); + } + let mut range = CharRange::empty(); for (i, base) in spanning_set @@ -183,6 +190,46 @@ mod tests { ] } + // Regression: a condition evaluated against a spanning set it was not + // built from used to panic (when too short) or silently drop bits (when + // too long); it now reports the incompatibility. 
+ #[test] + fn to_range_rejects_incompatible_spanning_set() { + let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )]); + let large = get_spanning_set(); + + let condition = Condition::total(&small); + assert_eq!( + condition.to_range(&large), + Err(EngineError::IncompatibleSpanningSet) + ); + + let condition = Condition::total(&large); + assert_eq!( + condition.to_range(&small), + Err(EngineError::IncompatibleSpanningSet) + ); + } + + // Regression: `ConditionConverter::convert` used to panic on a condition + // that was not built over its source spanning set. + #[test] + fn convert_rejects_incompatible_condition() { + let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )]); + let merged = small.merge(&get_spanning_set()); + let converter = ConditionConverter::new(&small, &merged).unwrap(); + + let foreign = Condition::total(&merged); + assert_eq!( + converter.convert(&foreign), + Err(EngineError::IncompatibleSpanningSet) + ); + } + #[test] fn test_empty_total() -> Result<(), String> { let spanning_set = get_spanning_set(); diff --git a/src/fast_automaton/convert/to_regex/state_elimination/builder.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs index 54cec00..a8f4054 100644 --- a/src/fast_automaton/convert/to_regex/state_elimination/builder.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -10,7 +10,7 @@ impl Gnfa { transitions: Vec::with_capacity(automaton.get_number_of_states()), transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), removed_states: IntSet::with_capacity(automaton.get_number_of_states()), - empty: false + empty: false, }; if automaton.is_empty() { @@ -121,8 +121,8 @@ impl Gnfa { .insert(from_state); match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { - //o.insert(RegularExpression::Alternation(vec![transition, o.get().clone()])); - 
o.insert(transition.union(o.get())); + let merged = transition.union(o.get()); + *o.get_mut() = merged; } Entry::Vacant(v) => { v.insert(transition); diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs index c587a99..8d5f587 100644 --- a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -21,45 +21,54 @@ impl Gnfa { .filter(|&s| s != self.start_state && s != self.accept_state) .collect(); - states + let score_state = |state: usize| -> Option<(u128, usize)> { + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = (state as u128) & 0xFF; + return Some((score, state)); + } + + let mut score: u128 = in_deg * out_deg; + + if self.has_self_loop(state) { + score = score + (score >> 1); + } + + let mut label_cost: u128 = 0; + + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } + + score = score.saturating_add(label_cost); + + let tie = (state as u128) & 0xFFFF; + Some((score.saturating_add(tie), state)) + }; + + #[cfg(feature = "parallel")] + let best = states .into_par_iter() - .filter_map(|state| { - let preds = self.transitions_to_vec(state); - let succs = self.transitions_from_vec(state); - - let in_deg = preds.len() as u128; - let out_deg = succs.len() as u128; - - if in_deg == 0 || out_deg == 0 { - let score = (state as u128) & 0xFF; - return Some((score, state)); - } - - let mut score: u128 = in_deg * out_deg; - - if self.has_self_loop(state) { - score = score + (score >> 1); - } - - let 
mut label_cost: u128 = 0; - - for (_, regex) in &preds { - label_cost += regex.evaluate_complexity() as u128; - } - for (regex, _) in &succs { - label_cost += regex.evaluate_complexity() as u128; - } - if let Some(re) = self.get_transition(state, state) { - label_cost += (re.evaluate_complexity() as u128) * 2; - } - - score = score.saturating_add(label_cost); - - let tie = (state as u128) & 0xFFFF; - Some((score.saturating_add(tie), state)) - }) - .reduce_with(|a, b| if a.0 < b.0 { a } else { b }) - .map(|(_, state)| state) + .filter_map(score_state) + .reduce_with(|a, b| if a.0 < b.0 { a } else { b }); + #[cfg(not(feature = "parallel"))] + let best = states + .into_iter() + .filter_map(score_state) + .reduce(|a, b| if a.0 < b.0 { a } else { b }); + + best.map(|(_, state)| state) } fn eliminate_state(&mut self, k: usize) { diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 7eb57f0..a7e3413 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -33,8 +33,14 @@ impl PartialOrd for QueueItem { } impl FastAutomaton { - /// Generates `count` strings matched by the automaton, skipping the first `offset` strings. - /// If the provided automaton is not deterministic, it is possible to get multiple time the same strings over multiple call with different offset. + /// Generates up to `limit` distinct strings matched by the automaton, skipping the first `offset` strings. + /// + /// Strings are only guaranteed to be distinct **within a single call**: + /// the offset fast-skips by counting paths, and in a non-deterministic + /// automaton the same string can be reached through several paths, so + /// calls with different offsets may repeat strings (or skip some). + /// [`determinize`](Self::determinize) (and ideally + /// [`minimize`](Self::minimize)) first to make pages disjoint. 
pub fn generate_strings( &self, limit: usize, @@ -433,7 +439,7 @@ mod tests { "Chunked generation did not match bulk generation" ); - let cardinality = automaton.get_cardinality(); + let cardinality = automaton.get_cardinality().unwrap(); if let Cardinality::Integer(count) = cardinality { let empty_chunk = automaton.generate_strings(10, count as usize).unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index e8b4a60..79a1e36 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -24,6 +24,7 @@ pub mod spanning_set; /// Represent a finite state automaton. #[derive(Clone, Debug, PartialEq, Eq)] +#[must_use = "non-`_mut` operations return a new automaton"] pub struct FastAutomaton { transitions: Vec, transitions_in: IntMap>, @@ -68,15 +69,13 @@ impl Display for FastAutomaton { writeln!(sb, "\tinitial -> {from_state}")?; } for (cond, to_state) in self.transitions_from(from_state) { - writeln!( - sb, - "\t{from_state} -> {to_state} [label=\"{}\"]", - cond.to_range(&self.spanning_set) - .expect("Cannot convert condition to range.") - .to_regex() - .replace('\\', "\\\\") - .replace('"', "\\\"") - )?; + // The automata most worth printing are the broken ones: + // never panic mid-format, label desynced conditions instead. + let label = match cond.to_range(&self.spanning_set) { + Ok(range) => range.to_regex().replace('\\', "\\\\").replace('"', "\\\""), + Err(_) => String::from(""), + }; + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{label}\"]")?; } } write!(sb, "}}") @@ -141,13 +140,19 @@ impl FastAutomaton { /// Returns a vector of transitions to the given state. pub fn transitions_to_vec(&self, state: State) -> Vec<(State, Condition)> { + // Direct `(from, state)` lookups: scanning each predecessor's whole + // out-list made this O(predecessors × out-degree), and `minimize` + // builds its inverse-transition table through here. 
+ if !self.has_state(state) { + return vec![]; + } let mut in_transitions = vec![]; for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { - for (condition, to_state) in self.transitions_from_vec(*from_state) { - if to_state == state { - in_transitions.push((*from_state, condition)); - break; - } + if !self.has_state(*from_state) { + continue; + } + if let Some(condition) = self.get_condition(*from_state, state) { + in_transitions.push((*from_state, condition.clone())); } } in_transitions diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 8060b8c..a241ed4 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -5,6 +5,18 @@ use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { + /// [`determinize`](Self::determinize) on behalf of an operation that + /// requires a deterministic automaton: when the execution profile + /// disables implicit determinization, a non-deterministic input is + /// rejected with [`EngineError::DeterministicAutomatonRequired`] instead + /// of being converted. Already-deterministic automata always pass. + pub(crate) fn determinize_implicit(&self) -> Result, EngineError> { + if !self.deterministic { + ExecutionProfile::get().assert_implicit_determinization_allowed()?; + } + self.determinize() + } + /// Determinizes the automaton and returns the result. 
pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { @@ -39,7 +51,7 @@ impl FastAutomaton { execution_profile.assert_max_number_of_states(new_states.len())?; if !states.is_disjoint(&accept_states) { - new_automaton.accept_states.insert(r); + new_automaton.accept(r); } for base in &bases { @@ -72,6 +84,12 @@ impl FastAutomaton { } } } + + // Optionally fold the freshly built DFA down to its minimal form + if execution_profile.should_minimize_after_determinization() { + new_automaton.minimize()?; + } + Ok(Cow::Owned(new_automaton)) } } diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 8d4f8cf..740a68e 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -47,10 +47,13 @@ impl FastAutomaton { /// Complements the automaton. /// - /// If `self` is non-deterministic, it is determinized in place first. + /// If `self` is non-deterministic, it is determinized in place first — + /// unless the execution profile disables implicit determinization, in + /// which case [`EngineError::DeterministicAutomatonRequired`] is + /// returned. pub fn complement(&mut self) -> Result<(), EngineError> { if !self.deterministic { - *self = self.determinize()?.into_owned(); + *self = self.determinize_implicit()?.into_owned(); } self.totalize()?; @@ -68,9 +71,11 @@ impl FastAutomaton { /// Computes the difference between `self` and `other`. /// - /// If `other` is non-deterministic, it is determinized first. + /// If `other` is non-deterministic, it is determinized first — unless + /// the execution profile disables implicit determinization, in which + /// case [`EngineError::DeterministicAutomatonRequired`] is returned. 
pub fn difference(&self, other: &FastAutomaton) -> Result { - let mut complement = other.determinize()?.into_owned(); + let mut complement = other.determinize_implicit()?.into_owned(); complement.complement()?; self.intersection(&complement) } diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index c4335e9..8bf8bb3 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +#[cfg(feature = "parallel")] use rayon::prelude::*; use condition::converter::ConditionConverter; @@ -32,6 +33,9 @@ impl FastAutomaton { } /// Computes in parallel the intersection of all automata in the given iterator. + /// + /// Only available with the `parallel` feature (enabled by default). + #[cfg(feature = "parallel")] pub fn intersection_all_par<'a, I: IntoParallelIterator>( automata: I, ) -> Result { @@ -160,6 +164,7 @@ impl FastAutomaton { while let Some(p) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; if self.accept_states.contains(&p.1) && other.accept_states.contains(&p.2) { return Ok(true); } @@ -213,6 +218,33 @@ impl FastAutomaton { mod tests { use crate::regex::RegularExpression; + // Regression: `has_intersection` enforced the timeout but not the state + // budget, unlike `intersection` — the product pair map could grow + // unchecked. 
+ #[test] + fn has_intersection_respects_state_budget() { + use crate::error::EngineError; + use crate::execution_profile::ExecutionProfileBuilder; + + let a = RegularExpression::parse("abcd", false) + .unwrap() + .to_automaton() + .unwrap(); + let b = RegularExpression::parse("abcd", false) + .unwrap() + .to_automaton() + .unwrap(); + + let result = ExecutionProfileBuilder::new() + .max_number_of_states(2) + .build() + .run(|| a.has_intersection(&b)); + assert!(matches!( + result, + Err(EngineError::AutomatonHasTooManyStates) + )); + } + // a* ∩ a* = a*: the intersection keeps the (infinite) looping language. #[test] fn intersection_keeps_infinite_language() { diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs index b676864..9385930 100644 --- a/src/fast_automaton/operation/minimize.rs +++ b/src/fast_automaton/operation/minimize.rs @@ -1,23 +1,33 @@ +use crate::execution_profile::ExecutionProfile; + use super::*; impl FastAutomaton { /// Minimizes the automaton using Hopcroft's Algorithm. /// - /// If `self` is non-deterministic, it is determinized in place first. + /// If `self` is non-deterministic, it is determinized in place first — + /// unless the [`ExecutionProfile`] disables implicit determinization, in + /// which case [`EngineError::DeterministicAutomatonRequired`] is + /// returned. pub fn minimize(&mut self) -> Result<(), EngineError> { + // The `minimal` flag is conservatively cleared on every mutation, so + // it can be trusted here; this also keeps the + // `minimize_after_determinization` profile from paying a second + // Hopcroft pass when callers minimize an already-minimized result. + if self.minimal { + return Ok(()); + } if !self.deterministic { - *self = self.determinize()?.into_owned(); + *self = self.determinize_implicit()?.into_owned(); } + let execution_profile = ExecutionProfile::get(); // Drop states unreachable from the start. 
A minimal automaton has none, // and downstream invariants rely on it — in particular `is_empty`'s // fast path treats any minimal automaton with an accept state as // non-empty, which only holds if every accept state is reachable. let reachable = self.forward_reachable_states(); - let unreachable: IntSet = self - .states() - .filter(|s| !reachable.contains(s)) - .collect(); + let unreachable: IntSet = self.states().filter(|s| !reachable.contains(s)).collect(); if !unreachable.is_empty() { self.remove_states(&unreachable); } @@ -57,6 +67,7 @@ impl FastAutomaton { let mut touched_partitions: Vec = Vec::with_capacity(max_states); while let Some(a_idx) = worklist.pop() { + execution_profile.assert_not_timed_out()?; in_worklist[a_idx] = false; let a = partitions[a_idx].clone(); diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 6045ef0..fdf8afb 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -40,11 +40,20 @@ impl FastAutomaton { } // Empty language: ∅⁰ = {""}, ∅ⁿ = ∅ for n ≥ 1. The general algorithm - // below assumes at least one accept state when installing loop-backs - // for unbounded repeats; bail out here before it can panic. - if self.accept_states.is_empty() { + // below assumes a non-empty language; bail out before it can panic. + // This must be the semantic `is_empty()` check, not just + // `accept_states.is_empty()`: an automaton whose accept states are + // all unreachable is the empty language too, and the construction + // below breaks on it (concatenation prunes the dead accepts, leaving + // stale state ids in the accept frontier). + if self.is_empty() { if min == 0 { - self.accept(self.start_state); + // ∅⁰ is exactly {""} — replace the whole automaton instead + // of marking the start accepting: a dead automaton can still + // have reachable transitions (e.g. 
a self-loop on a + // non-accepting start), and an accepting start would wrongly + // revive them into (label)*. + self.make_empty_string(); } return Ok(()); } @@ -74,7 +83,9 @@ impl FastAutomaton { && max == 1 { if min == 0 { - self.accept_states.insert(self.start_state); + // Through `accept()`, not a direct insert: the language + // changes (it gains ""), so the `minimal` flag must clear. + self.accept(self.start_state); } return Ok(()); } @@ -141,14 +152,15 @@ impl FastAutomaton { // copies (e.g. `(a*b){1,3}` matching "ba"). In that case we force a // non-merging concatenation so each boundary is a clean accept state // reached by an epsilon transition. - let force_no_merge = - automaton_to_repeat.in_degree(automaton_to_repeat.start_state) > 0; + let force_no_merge = automaton_to_repeat.in_degree(automaton_to_repeat.start_state) > 0; let mut end_states = self.accept_states.iter().cloned().collect::>(); for _ in cmp::max(min, 1)..max_opt.unwrap() { self.concat_mut_with(&automaton_to_repeat, force_no_merge)?; end_states.extend(self.accept_states.iter()); } - self.accept_states.extend(end_states); + for end_state in end_states { + self.accept(end_state); + } if min == 0 { self.accept(self.start_state); } @@ -179,8 +191,14 @@ impl FastAutomaton { let in_deg_start = self.in_degree(self.start_state) > 0; // --- REUSE CONCAT HEURISTIC HERE --- - // Calculate the state delta for a single concatenation. - let concat_cost = self.concat_state_count_heuristic(self) - v_original; + // Calculate the state delta for a single concatenation. The concat + // heuristic short-circuits to a *smaller* value than `v_original` + // for degenerate languages (∅ → 1, {""} → the operand size), so the + // delta must saturate: `repeat_mut` early-returns for those inputs + // right after this estimate anyway. + let concat_cost = self + .concat_state_count_heuristic(self) + .saturating_sub(v_original); // 2. 
Early state allocation for 0-minimum repeats with incoming start edges if min == 0 && in_deg_start { @@ -228,8 +246,7 @@ impl FastAutomaton { match self.repeat(0, None) { Ok(star) => { let star_states = star.get_number_of_states(); - let not_mergeable = - star.in_degree(star.start_state) > 0 && acc_out_gt_0; + let not_mergeable = star.in_degree(star.start_state) > 0 && acc_out_gt_0; let final_concat_cost = if not_mergeable { star_states } else { @@ -261,6 +278,81 @@ impl FastAutomaton { #[cfg(test)] mod tests { + // Regression: the r{0,1} fast path used to insert into `accept_states` + // directly, leaving a stale `minimal = true` on a mutated automaton — + // `minimize()` (which trusts the flag) then silently refused to + // minimize it. + // Regression (found by the repeat decomposition-oracle proptest): the + // empty-language guard checked `accept_states.is_empty()` only, so an + // automaton whose accepts are all unreachable (language ∅ too) fell + // through to the general construction, which panicked on the stale + // accept ids after concatenation pruned them. + #[test] + fn repeat_of_unreachable_accept_empty_language() { + let mut a = crate::fast_automaton::FastAutomaton::new_empty(); + let s1 = a.new_state(); + a.accept(s1); // unreachable accept: the language is ∅ + assert!(a.is_empty()); + + let star = a.repeat(0, None).unwrap(); // ∅* = {""} + assert!(star.is_match("")); + assert!(!star.is_match("a")); + + assert!(a.repeat(1, Some(2)).unwrap().is_empty()); // ∅{1,2} = ∅ + assert!(a.repeat(2, None).unwrap().is_empty()); // ∅{2,} = ∅ + + // A dead automaton with REACHABLE transitions: ∅* must still be + // exactly {""} — marking the start accepting used to revive the + // dead self-loop into b*. 
+ let range_b = crate::CharRange::new_from_range( + regex_charclass::char::Char::new('b')..=regex_charclass::char::Char::new('b'), + ); + let mut dead_loop = crate::fast_automaton::FastAutomaton::new_empty(); + dead_loop.add_transition_from_range(0, 0, &range_b).unwrap(); + assert!(dead_loop.is_empty()); + + let star = dead_loop.repeat(0, None).unwrap(); + assert!(star.is_match("")); + assert!(!star.is_match("b"), "∅* must not contain \"b\""); + assert!(dead_loop.repeat(1, None).unwrap().is_empty()); + } + + // state-count heuristic underflowed on empty-language automata with + // more than one state, because the concat heuristic short-circuits ∅ + // to 1 — panicking in the public `repeat` before the empty-language + // early-return could run. + #[test] + fn repeat_of_multi_state_empty_language_does_not_underflow() { + let mut a = crate::fast_automaton::FastAutomaton::new_empty(); + a.new_state(); // ≥ 2 states, no accept states: the empty language + + let star = a.repeat(0, None).unwrap(); // ∅* = {""} + assert!(star.is_match("")); + assert!(!star.is_match("a")); + + let plus = a.repeat(1, None).unwrap(); // ∅⁺ = ∅ + assert!(plus.is_empty()); + + let bounded = a.repeat(2, Some(3)).unwrap(); // ∅{2,3} = ∅ + assert!(bounded.is_empty()); + } + + #[test] + fn repeat_zero_or_one_clears_the_minimal_flag() { + let mut a = crate::regex::RegularExpression::new("ab") + .unwrap() + .to_automaton() + .unwrap(); + a.minimize().unwrap(); + assert!(a.is_minimal()); + assert!(!a.is_match("")); + + a.repeat_mut(0, Some(1)).unwrap(); + assert!(a.is_match("")); + assert!(a.is_match("ab")); + assert!(!a.is_minimal(), "the language changed: the flag must clear"); + } + use crate::fast_automaton::FastAutomaton; use crate::regex::RegularExpression; @@ -298,7 +390,10 @@ mod tests { ] { let r = empty_string.repeat(min, max).unwrap(); assert!(r.is_match(""), "{{\"\"}}{{{min},{max:?}}} must match \"\""); - assert!(!r.is_match("a"), "{{\"\"}}{{{min},{max:?}}} must match only \"\""); + 
assert!( + !r.is_match("a"), + "{{\"\"}}{{{min},{max:?}}} must match only \"\"" + ); } } diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index e3162d7..8792c59 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -1,6 +1,7 @@ use std::hash::BuildHasherDefault; use condition::converter::ConditionConverter; +#[cfg(feature = "parallel")] use rayon::prelude::*; use crate::{error::EngineError, execution_profile::ExecutionProfile}; @@ -25,6 +26,9 @@ impl FastAutomaton { } /// Computes in parallel the union of all automata in the given iterator. + /// + /// Only available with the `parallel` feature (enabled by default). + #[cfg(feature = "parallel")] pub fn union_all_par<'a, I: IntoParallelIterator>( automata: I, ) -> Result { @@ -334,12 +338,19 @@ mod tests { let u = empty_string.union(&a_plus).unwrap(); assert!(u.is_match(""), "union must keep \"\""); - assert!(u.is_match("a"), "union dropped the other operand's language"); + assert!( + u.is_match("a"), + "union dropped the other operand's language" + ); assert!(u.is_match("aaa")); // It must be equivalent regardless of operand order. 
let u2 = a_plus.union(&empty_string).unwrap(); - assert!(Term::from_automaton(u).equivalent(&Term::from_automaton(u2)).unwrap()); + assert!( + Term::from_automaton(u) + .equivalent(&Term::from_automaton(u2)) + .unwrap() + ); } // Regression: `prepare_accept_states` merges accept states without diff --git a/src/lib.rs b/src/lib.rs index 2ccfe0c..3813b12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ use cardinality::Cardinality; use error::EngineError; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; +#[cfg(feature = "parallel")] use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; @@ -62,11 +63,11 @@ pub type CharRange = RangeSet; /// /// // Analyze /// assert_eq!(rep.get_length(), (Some(6), Some(12))); -/// assert!(!rep.is_empty().unwrap()); +/// assert!(!rep.is_empty()?); /// /// // Generate examples /// let samples = Term::from_pattern("(x|y){1,3}")? -/// .generate_strings(5, 0, false)?; +/// .generate_strings(5, 0)?; /// println!("Some matches: {:?}", samples); /// /// // Equivalence & subset @@ -82,6 +83,7 @@ pub type CharRange = RangeSet; /// /// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. #[derive(Clone, PartialEq, Eq, Debug)] +#[must_use = "terms are immutable; operations return a new term"] pub enum Term { RegularExpression(RegularExpression), Automaton(FastAutomaton), @@ -97,6 +99,17 @@ impl Display for Term { } impl Term { + /// `Term` operations manage the underlying representation themselves, so + /// the determinizations they perform are by definition explicit: + /// they run with the profile's `implicit_determinization` setting + /// re-enabled (that knob targets direct [`FastAutomaton`] usage). The + /// rest of the profile — deadline, state budget — is preserved. 
+ fn run_with_implicit_determinization(f: impl FnOnce() -> R) -> R { + ExecutionProfile::get() + .with_implicit_determinization(true) + .apply(f) + } + /// Creates a term that matches the empty language. pub fn new_empty() -> Self { Term::RegularExpression(RegularExpression::new_empty()) @@ -217,17 +230,20 @@ impl Term { } if has_automaton { - let parallel = terms.len() > 3; + let parallel = cfg!(feature = "parallel") && terms.len() > 3; let automaton_list = self.get_automata(terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + #[cfg(feature = "parallel")] let return_automaton = if parallel { FastAutomaton::union_all_par(automaton_list) } else { FastAutomaton::union_all(automaton_list) }?; + #[cfg(not(feature = "parallel"))] + let return_automaton = FastAutomaton::union_all(automaton_list)?; Ok(Term::Automaton(return_automaton)) } else { @@ -261,23 +277,32 @@ impl Term { /// } /// ``` pub fn intersection(&self, terms: &[Term]) -> Result { - let parallel = terms.len() > 3; + let parallel = cfg!(feature = "parallel") && terms.len() > 3; let automaton_list = self.get_automata(terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + #[cfg(feature = "parallel")] let return_automaton = if parallel { FastAutomaton::intersection_all_par(automaton_list) } else { FastAutomaton::intersection_all(automaton_list) }?; + #[cfg(not(feature = "parallel"))] + let return_automaton = FastAutomaton::intersection_all(automaton_list)?; Ok(Term::Automaton(return_automaton)) } /// Computes the difference between `self` and `other`. /// + /// Unlike [`union`](Self::union) and [`intersection`](Self::intersection) + /// this deliberately takes a single operand: difference is neither + /// associative nor commutative, so a variadic form would be ambiguous + /// (`a - b - c` could mean `(a - b) - c` or `a - (b - c)`). Chain calls — + /// or subtract a union — to remove several languages. 
+ /// /// # Example: /// /// ``` @@ -293,12 +318,14 @@ impl Term { /// } /// ``` pub fn difference(&self, other: &Term) -> Result { - let minuend_automaton = self.to_automaton()?; - let subtrahend_automaton = other.to_automaton()?; - // `FastAutomaton::difference` determinizes the subtrahend itself. - let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; + Self::run_with_implicit_determinization(|| { + let minuend_automaton = self.to_automaton()?; + let subtrahend_automaton = other.to_automaton()?; + // `FastAutomaton::difference` determinizes the subtrahend itself. + let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; - Ok(Term::Automaton(return_automaton)) + Ok(Term::Automaton(return_automaton)) + }) } /// Computes the complement of `self`. @@ -316,11 +343,13 @@ impl Term { /// assert!(term.union(&[complement]).unwrap().is_total().unwrap()); /// ``` pub fn complement(&self) -> Result { - // `FastAutomaton::complement` determinizes `self` itself. - let mut automaton = self.to_automaton()?.into_owned(); - automaton.complement()?; + Self::run_with_implicit_determinization(|| { + // `FastAutomaton::complement` determinizes `self` itself. + let mut automaton = self.to_automaton()?.into_owned(); + automaton.complement()?; - Ok(Term::Automaton(automaton)) + Ok(Term::Automaton(automaton)) + }) } /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. @@ -358,56 +387,53 @@ impl Term { /// Generates up to `limit` distinct strings matched by the term, skipping the first `offset` strings. /// - /// When paginating through a large set of generated strings, you should set `return_stable_term` - /// to `true` on the initial call. This instructs the engine to compile the term into a deterministic - /// and minimized state (a "stable term"). 
- /// - /// Replacing your current term with this returned stable term for subsequent calls guarantees - /// that no strings are repeated. + /// Strings are only guaranteed to be distinct **within a single call**: + /// the offset fast-skips by counting paths, and in a non-deterministic + /// automaton the same string can be reached through several paths, so + /// calls with different offsets may repeat strings (or skip some). The + /// enumeration order also depends on the automaton's structure, so + /// offsets are only consistent across calls made on the same term. /// - /// The stable term is returned as the first element of the tuple (`Some(Term)`). If the term is - /// already stable, or if `return_stable_term` is `false`, it returns `None` to save resources. + /// For reliable pagination, call [`minimize`](Self::minimize) once and + /// generate from the minimized term: it is deterministic — paths and + /// strings are then one-to-one, making pages disjoint — and its fixed + /// structure keeps offsets consistent, without re-converting the term on + /// every page. /// /// # Example: /// /// ``` /// use regexsolver::Term; /// - /// let mut term = Term::from_pattern("(abc|de){2}").unwrap(); + /// // Minimize once, then paginate with consistent offsets. 
+ /// let term = Term::from_pattern("(abc|de){2}").unwrap().minimize().unwrap(); /// - /// // Generate the first 2 matched strings and request a stable term - /// let (stable_term, batch) = term.generate_strings(2, 0, true).unwrap(); + /// let batch = term.generate_strings(2, 0).unwrap(); /// assert_eq!(2, batch.len()); // ["dede", "deabc"] /// - /// // Update the term if a newly compiled stable term was returned - /// if let Some(t) = stable_term { - /// term = t; - /// } - /// - /// // Generate the next 2 matched strings by setting the offset using the stable term - /// let (_, batch) = term.generate_strings(2, 2, false).unwrap(); + /// let batch = term.generate_strings(2, 2).unwrap(); /// assert_eq!(2, batch.len()); // ["abcde", "abcabc"] /// ``` pub fn generate_strings( &self, limit: usize, offset: usize, - return_stable_term: bool, - ) -> Result<(Option, Vec), EngineError> { - let automaton = self.to_automaton()?; - if !return_stable_term || automaton.is_deterministic() { - Ok((None, self.to_automaton()?.generate_strings(limit, offset)?)) - } else { - // `minimize` determinizes first, yielding the deterministic, - // minimal "stable" automaton. - let mut automaton = automaton.into_owned(); - if !automaton.is_minimal() { - automaton.minimize()?; - } + ) -> Result, EngineError> { + self.to_automaton()?.generate_strings(limit, offset) + } - let generated_strings = automaton.generate_strings(limit, offset)?; - Ok((Some(Term::Automaton(automaton)), generated_strings)) - } + /// Returns an equivalent term backed by the minimal deterministic + /// automaton. + /// + /// Useful before paginating with + /// [`generate_strings`](Self::generate_strings) (see there), or to + /// compact a term after a chain of operations. 
+ pub fn minimize(&self) -> Result { + Self::run_with_implicit_determinization(|| { + let mut automaton = self.to_automaton()?.into_owned(); + automaton.minimize()?; + Ok(Term::Automaton(automaton)) + }) } /// Returns `true` if both terms accept the same language. @@ -427,9 +453,11 @@ impl Term { return Ok(true); } - let automaton_1 = self.to_automaton()?; - let automaton_2 = term.to_automaton()?; - automaton_1.equivalent(&automaton_2) + Self::run_with_implicit_determinization(|| { + let automaton_1 = self.to_automaton()?; + let automaton_2 = term.to_automaton()?; + automaton_1.equivalent(&automaton_2) + }) } /// Returns `true` if all strings matched by the current term are also matched by the given term. @@ -449,9 +477,11 @@ impl Term { return Ok(true); } - let automaton_1 = self.to_automaton()?; - let automaton_2 = term.to_automaton()?; - automaton_1.subset(&automaton_2) + Self::run_with_implicit_determinization(|| { + let automaton_1 = self.to_automaton()?; + let automaton_2 = term.to_automaton()?; + automaton_1.subset(&automaton_2) + }) } /// Checks if the term matches the empty language. @@ -472,6 +502,8 @@ impl Term { } else if automaton.is_deterministic() { Ok(false) } else { + // `Term` manages the representation itself: this is an + // explicit determinization, never gated by the profile. Ok(automaton.determinize()?.is_total()) } } @@ -487,6 +519,7 @@ impl Term { } /// Returns the minimum and maximum length of matched strings. 
+ #[must_use] pub fn get_length(&self) -> (Option, Option) { match self { Term::RegularExpression(regex) => regex.get_length(), @@ -498,11 +531,9 @@ impl Term { pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), - Term::Automaton(automaton) => Ok(if !automaton.is_deterministic() { - automaton.determinize()?.get_cardinality() - } else { - automaton.get_cardinality() - }), + Term::Automaton(automaton) => { + Self::run_with_implicit_determinization(|| automaton.get_cardinality()) + } } } @@ -515,6 +546,7 @@ impl Term { } /// Converts the term to a [`RegularExpression`]. + #[must_use] pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -523,6 +555,7 @@ impl Term { } /// Converts the term to a regular expression pattern. + #[must_use] pub fn to_pattern(&self) -> String { self.to_regex().to_string() } @@ -535,6 +568,7 @@ impl Term { let mut automaton_list = Vec::with_capacity(terms.len() + 1); automaton_list.push(self.to_automaton()?); + #[cfg(feature = "parallel")] let mut terms_automata = if parallel { let execution_profile = ExecutionProfile::get(); terms @@ -547,6 +581,14 @@ impl Term { .map(Term::to_automaton) .collect::, _>>() }?; + #[cfg(not(feature = "parallel"))] + let mut terms_automata = { + let _ = parallel; + terms + .iter() + .map(Term::to_automaton) + .collect::, EngineError>>()? 
+ }; automaton_list.append(&mut terms_automata); Ok(automaton_list) diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 34aa401..bb8a5ee 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -46,27 +46,27 @@ impl RegularExpression { let other_regex; match (self, other) { - (RegularExpression::Concat(_), _) => { + (RegularExpression::Concat(..), _) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_concat_and_other(self, other, is_prefix); } - (_, RegularExpression::Concat(_)) => { + (_, RegularExpression::Concat(..)) => { (common_affix, (other_regex, self_regex)) = Self::opaffix_concat_and_other(other, self, is_prefix); } - (RegularExpression::Character(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Character(..), RegularExpression::Repetition(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_character_and_repetition(self, other); } - (RegularExpression::Repetition(_, _, _), RegularExpression::Character(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Character(..)) => { (common_affix, (other_regex, self_regex)) = Self::opaffix_character_and_repetition(other, self); } - (RegularExpression::Repetition(_, _, _), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Repetition(..), RegularExpression::Repetition(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_repetition_and_repetition(self, other); } - (RegularExpression::Alternation(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Alternation(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_alternation_and_alternation(self, other); } @@ -86,11 +86,17 @@ impl RegularExpression { (RegularExpression, RegularExpression), ) { if let ( - RegularExpression::Character(_), + RegularExpression::Character(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_character, 
that_repetition) { - if this_character == &**that_regex && *that_min == 1 { + // The `max != 0` guard keeps a directly-constructed invalid + // repetition (`r{1,0}`) from underflowing; such trees are + // rejected by `to_automaton`, the simplifier just must not panic. + if this_character == &**that_regex + && *that_min == 1 + && that_max_opt.is_none_or(|that_max| that_max >= 1) + { let new_max = that_max_opt.as_ref().map(|that_max| that_max - 1); ( Some(this_character.clone()), diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 57593a9..9a143df 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -7,6 +7,7 @@ mod number_of_states; impl RegularExpression { /// Returns the minimum and maximum length of possible matched strings. + #[must_use] pub fn get_length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { @@ -171,7 +172,9 @@ mod tests { assert_length("(at?)"); assert_length("(ot){3,4}"); assert_length("(ot?d){1,4}"); - assert_length("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_length( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); assert_eq!( FastAutomaton::new_empty().get_length(), @@ -214,7 +217,9 @@ mod tests { assert_cardinality("(ot){3,4}"); assert_cardinality("(t){1,3}"); assert_cardinality("(ot?d){1,4}"); - assert_cardinality("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_cardinality( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); Ok(()) } @@ -225,17 +230,10 @@ mod tests { let cardinality = regex.get_cardinality(); let automaton = regex.to_automaton().unwrap(); - // `get_cardinality` needs a DFA for an exact finite count, but returns - // `Infinite` for cyclic automata without requiring determinism. 
Only - // determinize the finite (bounded-length) ones — determinizing a large - // cyclic automaton can blow up. - let automaton = if automaton.get_length().1.is_some() { - automaton.determinize().unwrap().into_owned() - } else { - automaton - }; - - let expected = automaton.get_cardinality(); + // `get_cardinality` returns `Infinite` for cyclic automata without + // determinizing and only determinizes the finite (acyclic) + // non-deterministic ones internally. + let expected = automaton.get_cardinality().unwrap(); assert_eq!(expected, cardinality); } diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 36ebff2..e9c62fe 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -105,7 +105,13 @@ impl AbstractNFAMetadata { return_accepted.push(return_start.clone()); if max_opt.is_none() { let return_number_of_states = if !start_state_or_accept_states_not_mergeable { - self.number_of_states - 1 + // An automaton always has at least one state. Degenerate + // sub-expressions denoting {""} (e.g. an unsimplified + // `(a{0,0})*`) reach this point with a single state, and + // the merge discount must not drive the count to zero — + // every later `- 1` in this module relies on counts + // staying >= 1. + (self.number_of_states - 1).max(1) } else { self.number_of_states }; @@ -190,7 +196,10 @@ impl AbstractNFAMetadata { AbstractNFAMetadata { start: return_start, accepted: return_accepted, - number_of_states: return_number_of_states, + // Both merge discounts can apply to two single-state {""} + // operands (e.g. `a{0,0}|b{0,0}`); clamp so the count never + // reaches zero (see `repeat`). + number_of_states: return_number_of_states.max(1), } } } @@ -202,7 +211,7 @@ impl RegularExpression { fn evaluate_number_of_states_in_nfa(&self) -> AbstractNFAMetadata { match self { - RegularExpression::Character(_) => AbstractNFAMetadata::new(), + RegularExpression::Character(..) 
=> AbstractNFAMetadata::new(), RegularExpression::Repetition(regex, min, max_opt) => regex .evaluate_number_of_states_in_nfa() .repeat(*min, max_opt), @@ -302,10 +311,48 @@ mod tests { assert_number_of_states_in_nfa("q(ab|ca|ab|abc)x"); assert_number_of_states_in_nfa("a*(aad|ads|a)abc.*def.*ghi"); - assert_number_of_states_in_nfa("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_number_of_states_in_nfa( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); Ok(()) } + // Regression: directly-constructed (unsimplified) repetitions over {""} + // sub-expressions — shapes the string parser simplifies away but any user + // of the public enum can build — used to drive the abstract state count + // to zero, after which the merge discounts underflowed and panicked. + #[test] + fn degenerate_repetitions_do_not_underflow() { + use std::collections::VecDeque; + + let atom = RegularExpression::new("a").unwrap(); + // a{0,0} denotes {""} without being the canonical empty-string form. 
+ let empty_string = RegularExpression::Repetition(Box::new(atom), 0, Some(0)); + let star_of_alternation = RegularExpression::Repetition( + Box::new(RegularExpression::Alternation(vec![ + empty_string.clone(), + empty_string.clone(), + ])), + 0, + None, + ); + let star_of_concat = RegularExpression::Repetition( + Box::new(RegularExpression::Concat(VecDeque::from([ + empty_string.clone(), + RegularExpression::Repetition(Box::new(empty_string), 0, None), + ]))), + 0, + None, + ); + + for regex in [star_of_alternation, star_of_concat] { + let estimate = regex.get_number_of_states_in_nfa(); + assert!(estimate >= 1, "state estimate of {regex} must be >= 1"); + let automaton = regex.to_automaton().unwrap(); + assert!(automaton.get_number_of_states() >= 1); + } + } + fn assert_number_of_states_in_nfa(regex: &str) { println!("{}", regex); let regex = RegularExpression::new(regex).unwrap(); diff --git a/src/regex/builder.rs b/src/regex/builder.rs index a77871d..6edf4af 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -1,15 +1,8 @@ -use ::regex::Regex; -use lazy_static::lazy_static; use regex_charclass::irange::range::AnyRange; use regex_syntax::ParserBuilder; use super::*; -lazy_static! { - static ref RE_FLAG_DETECTION: Regex = - Regex::new(r"\(\?[imsx]*-?[imsx]*\)").expect("Can not compile flag detection regex."); -} - impl RegularExpression { /// Parses and simplifies the provided pattern and returns the resulting [`RegularExpression`]. pub fn new(pattern: &str) -> Result { @@ -34,8 +27,39 @@ impl RegularExpression { } } + /// Strips inline flag groups like `(?i)`, `(?m-s)` or `(?-s)` from the + /// pattern: the engine treats all characters uniformly, so the flags are + /// meaningless here. Equivalent to deleting every match of + /// `\(\?[imsx]*-?[imsx]*\)`; anything else — including non-capturing + /// groups `(?:...)` — is left untouched. 
fn remove_flags(regex: &str) -> String { - RE_FLAG_DETECTION.replace_all(regex, "").to_string() + let bytes = regex.as_bytes(); + let mut result = String::with_capacity(regex.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'(' && i + 1 < bytes.len() && bytes[i + 1] == b'?' { + let mut j = i + 2; + while j < bytes.len() && matches!(bytes[j], b'i' | b'm' | b's' | b'x') { + j += 1; + } + if j < bytes.len() && bytes[j] == b'-' { + j += 1; + while j < bytes.len() && matches!(bytes[j], b'i' | b'm' | b's' | b'x') { + j += 1; + } + } + if j < bytes.len() && bytes[j] == b')' { + // a flag group: skip it entirely + i = j + 1; + continue; + } + } + // not a flag group: copy the whole character (UTF-8 safe) + let char_len = regex[i..].chars().next().map(|c| c.len_utf8()).unwrap_or(1); + result.push_str(&regex[i..i + char_len]); + i += char_len; + } + result } /// Creates a regular expression that matches all possible strings. @@ -155,6 +179,79 @@ impl RegularExpression { mod tests { use crate::regex::RegularExpression; + // The hand-rolled flag stripper must delete exactly the matches of + // `\(\?[imsx]*-?[imsx]*\)` (the regex it replaced) and nothing else. + #[test] + fn remove_flags_strips_flag_groups_only() { + let strip = RegularExpression::remove_flags; + + assert_eq!(strip("(?i)a"), "a"); + assert_eq!(strip("a(?m-s)b"), "ab"); + assert_eq!(strip("a(?-s)b"), "ab"); + assert_eq!(strip("(?imsx)(?)a(?i-)"), "a"); + + // Non-flag constructs are untouched. + assert_eq!(strip("(?:ab|c)d"), "(?:ab|c)d"); + assert_eq!(strip("(a?)b"), "(a?)b"); + assert_eq!(strip("a(?i-s"), "a(?i-s"); // unterminated: not a flag group + assert_eq!(strip("héllo(?i)é"), "hélloé"); // multi-byte safe + } + + // The variants are freely constructible (open enum); invalid bounds are + // rejected at the conversion boundary instead. 
+ #[test] + fn to_automaton_rejects_invalid_repetition_bounds() { + use crate::error::EngineError; + + let a = RegularExpression::new("a").unwrap(); + let invalid = RegularExpression::Repetition(Box::new(a.clone()), 5, Some(2)); + assert_eq!( + invalid.to_automaton().unwrap_err(), + EngineError::InvalidRepetitionBounds(5, 2) + ); + + // Nested invalid repetitions are caught by the recursion. + let nested = RegularExpression::Concat([a.clone(), invalid].into()); + assert_eq!( + nested.to_automaton().unwrap_err(), + EngineError::InvalidRepetitionBounds(5, 2) + ); + + // The simplifying combinators must not panic on invalid trees either + // (regression: the affix factoring of `r{1,0}` used to underflow). + let degenerate = RegularExpression::Repetition(Box::new(a.clone()), 1, Some(0)); + let _ = a.union(&degenerate); + let _ = a.concat(&degenerate, true); + } + + // Regression (found by the proptest generators): singleton + // Alternation/Concat wrappers print transparently, so quantified + // expressions must be parenthesized by looking through them — + // `((.a))*` used to print as `.a*` instead of `(.a)*`, changing the + // language. + #[test] + fn display_parenthesizes_through_singleton_wrappers() { + use regex_charclass::char::Char; + + let dot = RegularExpression::Character(crate::CharRange::total()); + let a = RegularExpression::Character(crate::CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )); + let wrapped = + RegularExpression::Alternation(vec![RegularExpression::Concat([dot, a].into())]); + let star = RegularExpression::Repetition(Box::new(wrapped), 0, None); + assert_eq!(star.to_string(), "(.a)*"); + + // The printed pattern must denote the same language as the tree. 
+ let reparsed = RegularExpression::parse(&star.to_string(), false).unwrap(); + assert!( + star.to_automaton() + .unwrap() + .equivalent(&reparsed.to_automaton().unwrap()) + .unwrap() + ); + } + #[test] fn test_parse() -> Result<(), String> { assert_parse("abc+"); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index bdc5484..5e1c8b1 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -13,11 +13,38 @@ mod builder; mod operation; /// Represent a regular expression. +/// +/// The variants are public and freely constructible and matchable. Values +/// can also be built with the parser ([`new`](Self::new) / +/// [`parse`](Self::parse)) or the simplifying combinators +/// ([`concat`](Self::concat), [`union`](Self::union), +/// [`repeat`](Self::repeat)). A directly-constructed repetition whose +/// maximum is below its minimum denotes no valid language and is rejected +/// with [`EngineError::InvalidRepetitionBounds`] when converted by +/// [`to_automaton`](Self::to_automaton). +/// +/// ``` +/// use regexsolver::regex::RegularExpression; +/// +/// let regex = RegularExpression::new("a{2,3}").unwrap(); +/// if let RegularExpression::Repetition(inner, min, max) = &regex { +/// assert_eq!((*min, *max), (2, Some(3))); +/// assert_eq!(inner.to_string(), "a"); +/// } +/// ``` #[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] +#[must_use = "regular expressions are immutable; operations return a new expression"] pub enum RegularExpression { + /// A single character drawn from the given range; an empty range denotes + /// the empty language `[]`. Character(CharRange), + /// `r{min,max}`; `None` means unbounded. Expected invariant: `max >= min` + /// when bounded (checked by [`to_automaton`](Self::to_automaton)). Repetition(Box, u32, Option), + /// The concatenation of the parts in order; no parts denotes the empty + /// string `""`. Concat(VecDeque), + /// The union of the parts; no parts denotes the empty language `[]`. 
Alternation(Vec), } @@ -48,14 +75,10 @@ impl Display for RegularExpression { } else { multiplicator_part = format!("{{{min},}}"); } - match **regular_expression { - RegularExpression::Repetition(_, _, _) => { - format!("({regex_part}){multiplicator_part}") - } - RegularExpression::Concat(_) => { - format!("({regex_part}){multiplicator_part}") - } - _ => format!("{regex_part}{multiplicator_part}"), + if RegularExpression::quantifier_needs_parens(regular_expression) { + format!("({regex_part}){multiplicator_part}") + } else { + format!("{regex_part}{multiplicator_part}") } } RegularExpression::Concat(concat) => { @@ -88,6 +111,32 @@ impl Display for RegularExpression { } impl RegularExpression { + /// Whether applying a quantifier to the printed form of `r` requires + /// wrapping it in a group. Singleton `Concat`/`Alternation` wrappers + /// print transparently, so the decision must look through them instead + /// of matching on the direct child's variant. + fn quantifier_needs_parens(r: &RegularExpression) -> bool { + match r { + // Prints as a single char or a [class]: one token. + RegularExpression::Character(..) => false, + RegularExpression::Repetition(..) => true, + RegularExpression::Concat(parts) => match parts.len() { + 1 => Self::quantifier_needs_parens(&parts[0]), + // Covers both the empty concatenation — which prints as "" + // and needs the explicit group, `()*` is valid but a bare + // `*` is not — and real multi-part concatenations. + _ => true, + }, + RegularExpression::Alternation(parts) => match parts.len() { + // The empty alternation prints as "[]": one token. + 0 => false, + 1 => Self::quantifier_needs_parens(&parts[0]), + // Multi-part alternations print self-parenthesized. + _ => false, + }, + } + } + /// Checks if the regular expression matches the empty language. 
pub fn is_empty(&self) -> bool { match self { @@ -129,6 +178,13 @@ impl RegularExpression { match self { RegularExpression::Character(range) => Ok(FastAutomaton::new_from_range(range)), RegularExpression::Repetition(regular_expression, min, max_opt) => { + // The variants are freely constructible; invalid bounds are + // rejected at this boundary instead. + if let Some(max) = max_opt + && max < min + { + return Err(EngineError::InvalidRepetitionBounds(*min, *max)); + } let mut automaton = regular_expression.to_automaton()?; automaton.repeat_mut(*min, *max_opt)?; Ok(automaton) diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 2063e49..30b4bee 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -31,11 +31,11 @@ impl RegularExpression { }; match (front, back) { - (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { + (RegularExpression::Concat(..), RegularExpression::Concat(..)) => { Self::opconcat_concat_and_concat(front, back) } - (RegularExpression::Concat(_), _) => Self::opconcat_concat_and_other(front, back), - (_, RegularExpression::Concat(_)) => Self::opconcat_other_and_concat(front, back), + (RegularExpression::Concat(..), _) => Self::opconcat_concat_and_other(front, back), + (_, RegularExpression::Concat(..)) => Self::opconcat_other_and_concat(front, back), (_, _) => Self::opconcat_other_and_other(front, back), } } @@ -67,12 +67,14 @@ impl RegularExpression { return merged; } - let mut vec = that_elements.clone(); - let that_index = 0; - - if let Some(merged) = Self::opconcat_can_be_merged(this, &that_elements[that_index]) { - vec[that_index] = merged; + // Clone the surviving elements only: the boundary element is + // either replaced by the merge or kept alongside `this`. 
+ let mut vec: VecDeque; + if let Some(merged) = Self::opconcat_can_be_merged(this, &that_elements[0]) { + vec = that_elements.iter().skip(1).cloned().collect(); + vec.push_front(merged); } else { + vec = that_elements.clone(); vec.push_front(this.clone()); } @@ -99,12 +101,14 @@ impl RegularExpression { return merged; } - let mut vec = this_elements.clone(); + // Clone the surviving elements only (see opconcat_other_and_concat). let this_index = this_elements.len() - 1; - + let mut vec: VecDeque; if let Some(merged) = Self::opconcat_can_be_merged(&this_elements[this_index], that) { - vec[this_index] = merged; + vec = this_elements.iter().take(this_index).cloned().collect(); + vec.push_back(merged); } else { + vec = this_elements.clone(); vec.push_back(that.clone()); } @@ -137,15 +141,17 @@ impl RegularExpression { return merged; } - let mut vec = this_elements.clone(); + // Clone the surviving elements only (see opconcat_other_and_concat). let (this_index, that_index) = (this_elements.len() - 1, 0); - + let mut vec: VecDeque; if let Some(merged) = Self::opconcat_can_be_merged(&this_elements[this_index], &that_elements[that_index]) { - vec[this_index] = merged; + vec = this_elements.iter().take(this_index).cloned().collect(); + vec.push_back(merged); vec.extend(that_elements.iter().skip(1).cloned()); } else { + vec = this_elements.clone(); vec.extend(that_elements.iter().cloned()); } @@ -159,6 +165,24 @@ impl RegularExpression { } } + /// Merges the bounds of two adjacent repetitions of the same expression, + /// `r{a,b}r{c,d}` → `r{a+c,b+d}`. Returns `None` — "cannot be merged", + /// falling back to plain concatenation — when an addition would overflow. 
+ fn merge_repetition_bounds( + this_min: u32, + this_max_opt: &Option, + that_min: u32, + that_max_opt: &Option, + ) -> Option<(u32, Option)> { + let new_min = this_min.checked_add(that_min)?; + let new_max_opt = if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { + Some(this_max.checked_add(*that_max)?) + } else { + None + }; + Some((new_min, new_max_opt)) + } + fn opconcat_can_be_merged( this: &RegularExpression, that: &RegularExpression, @@ -169,13 +193,12 @@ impl RegularExpression { RegularExpression::Repetition(_, that_min, that_max_opt), ) = (this, that) { - let new_min = this_min + that_min; - let new_max_opt = - if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { - Some(this_max + that_max) - } else { - None - }; + let (new_min, new_max_opt) = Self::merge_repetition_bounds( + *this_min, + this_max_opt, + *that_min, + that_max_opt, + )?; Some(this_regex.repeat(new_min, new_max_opt)) } else { Some(this.repeat(2, Some(2))) @@ -186,22 +209,19 @@ impl RegularExpression { ) = (this, that) { if this_regex == that_regex { - let new_min = this_min + that_min; - let new_max_opt = - if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { - Some(this_max + that_max) - } else { - None - }; - + let (new_min, new_max_opt) = Self::merge_repetition_bounds( + *this_min, + this_max_opt, + *that_min, + that_max_opt, + )?; Some(this_regex.repeat(new_min, new_max_opt)) } else if let ( RegularExpression::Character(this_range), RegularExpression::Character(that_range), - ) = (*this_regex.clone(), *that_regex.clone()) + ) = (&**this_regex, &**that_regex) { - if this_range.contains_all(&that_range) && that_min == &0 && this_max_opt.is_none() - { + if this_range.contains_all(that_range) && that_min == &0 && this_max_opt.is_none() { Some(this.clone()) } else { None @@ -211,16 +231,16 @@ impl RegularExpression { } } else if let RegularExpression::Repetition(this_regex, this_min, this_max_opt) = this { if **this_regex == *that 
{ - let new_min = this_min + 1; - let new_max_opt = this_max_opt.as_ref().map(|this_max| this_max + 1); + let (new_min, new_max_opt) = + Self::merge_repetition_bounds(*this_min, this_max_opt, 1, &Some(1))?; Some(this_regex.repeat(new_min, new_max_opt)) } else { None } } else if let RegularExpression::Repetition(that_regex, that_min, that_max_opt) = that { if **that_regex == *this { - let new_min = that_min + 1; - let new_max_opt = that_max_opt.as_ref().map(|this_max| this_max + 1); + let (new_min, new_max_opt) = + Self::merge_repetition_bounds(*that_min, that_max_opt, 1, &Some(1))?; Some(that_regex.repeat(new_min, new_max_opt)) } else { None @@ -235,6 +255,20 @@ impl RegularExpression { mod tests { use super::*; + // Regression: merging adjacent repetitions used to add bounds unchecked; + // huge (but valid) bounds must fall back to plain concatenation instead + // of overflowing. + #[test] + fn concat_merge_bound_overflow_falls_back_to_concat() { + let a = RegularExpression::new("a").unwrap(); + let big = RegularExpression::Repetition(Box::new(a), u32::MAX, None); + let result = big.concat(&big, true); + assert!(matches!( + &result, + RegularExpression::Concat(parts) if parts.len() == 2 + )); + } + #[test] fn test_concat() -> Result<(), String> { assert_concat("xxx", "x{3}"); diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 7364d65..0147fc9 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -3,4 +3,4 @@ use super::*; mod concat; mod repeat; mod simplify; -mod union; \ No newline at end of file +mod union; diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 3f42b31..363e00d 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -1,7 +1,11 @@ use super::*; impl RegularExpression { - /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
+ /// Computes the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + /// + /// When `max_opt` is below `min` there is no valid repetition count and + /// the result is the empty language, consistently with + /// [`FastAutomaton::repeat`](crate::fast_automaton::FastAutomaton::repeat). pub fn repeat(&self, min: u32, max_opt: Option<u32>) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); @@ -10,7 +14,12 @@ impl RegularExpression { } else if self.is_empty_string() { return Self::new_empty_string(); } else if let Some(max) = max_opt { - if max < min || max == 0 { + if max < min { + // No valid repetition count: the language is empty. This + // matches `FastAutomaton::repeat`, which disagreed with the + // {""} previously returned here. + return RegularExpression::new_empty(); + } else if max == 0 { return RegularExpression::new_empty_string(); } else if min == 1 && max == 1 { return self.clone(); @@ -19,17 +28,25 @@ impl RegularExpression { match self { RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { - let new_max = if let (Some(o_max), Some(i_max)) = (max_opt, i_max_opt) { - Some(o_max * i_max) - } else { - None - }; - + // Only collapse (r{i_min,i_max}){min,max} into + // r{min·i_min,max·i_max} when the bounds are gap-free AND the + // multiplications don't overflow; the nested form is always a + // correct fallback. 
if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { - RegularExpression::Repetition(regular_expression.clone(), min * i_min, new_max) - } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + let new_min = min.checked_mul(*i_min); + let new_max = match (max_opt, i_max_opt) { + (Some(o_max), Some(i_max)) => o_max.checked_mul(*i_max).map(Some), + _ => Some(None), + }; + if let (Some(new_min), Some(new_max)) = (new_min, new_max) { + return RegularExpression::Repetition( + regular_expression.clone(), + new_min, + new_max, + ); + } } + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) } _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), } @@ -68,6 +85,33 @@ mod tests { use crate::{CharRange, regex::RegularExpression}; + // Regression: the nested-repetition simplification used to multiply + // bounds unchecked; huge (but valid) bounds must fall back to the nested + // form instead of overflowing. + #[test] + fn repeat_bound_overflow_keeps_nested_form() { + let a = RegularExpression::new("a").unwrap(); + let inner = a.repeat(2, Some(2)); // a{2} + let outer = inner.repeat(u32::MAX, Some(u32::MAX)); // 2·u32::MAX overflows + assert!(matches!( + &outer, + RegularExpression::Repetition(r, u32::MAX, Some(u32::MAX)) + if matches!(&**r, RegularExpression::Repetition(..)) + )); + } + + // r{min,max} with max < min has no valid repetition count: the language + // is empty, consistently with `FastAutomaton::repeat` (the regex side + // used to return {""} instead). 
+ #[test] + fn repeat_with_max_below_min_is_empty() { + let a = RegularExpression::new("a").unwrap(); + assert!(a.repeat(5, Some(2)).is_empty()); + + let automaton = a.to_automaton().unwrap().repeat(5, Some(2)).unwrap(); + assert!(automaton.is_empty()); + } + #[test] fn test_parse_and_simplify() -> Result<(), String> { assert_parse_and_simplify("(xxx)*", "(x{3})*"); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index bb3f73c..6d9ed5f 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -4,7 +4,7 @@ impl RegularExpression { /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. pub fn simplify(&self) -> Self { match self { - RegularExpression::Character(_) => self.clone(), + RegularExpression::Character(..) => self.clone(), RegularExpression::Repetition(regex, min, max_opt) => { // Delegate to `repeat`, which guards the nested-repetition // collapse with `can_simplify_nested_repetition`. 
Collapsing diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index fe37ae6..fdc731f 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -42,49 +42,49 @@ impl RegularExpression { RegularExpression::Character(self_range), RegularExpression::Character(other_range), ) => RegularExpression::Character(self_range.union(other_range)), - (RegularExpression::Character(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Character(..), RegularExpression::Repetition(..)) => { Self::opunion_character_and_repetition(self, other) } - (RegularExpression::Character(_), RegularExpression::Concat(_)) => { + (RegularExpression::Character(..), RegularExpression::Concat(..)) => { Self::opunion_character_and_concat(self, other) } - (RegularExpression::Character(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Character(..), RegularExpression::Alternation(..)) => { Self::opunion_character_and_alternation(self, other) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Character(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Character(..)) => { Self::opunion_character_and_repetition(other, self) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Repetition(..), RegularExpression::Repetition(..)) => { Self::opunion_repetition_and_repetition(self, other) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Concat(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Concat(..)) => { Self::opunion_concat_and_repetition(other, self) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Alternation(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Alternation(..)) => { Self::opunion_repetition_and_alternation(self, other) } - (RegularExpression::Concat(_), RegularExpression::Character(_)) => { + (RegularExpression::Concat(..), RegularExpression::Character(..)) => { 
Self::opunion_character_and_concat(other, self) } - (RegularExpression::Concat(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Concat(..), RegularExpression::Repetition(..)) => { Self::opunion_concat_and_repetition(self, other) } - (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { + (RegularExpression::Concat(..), RegularExpression::Concat(..)) => { Self::opunion_common_affixes(self, other) } - (RegularExpression::Concat(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Concat(..), RegularExpression::Alternation(..)) => { Self::opunion_concat_and_alternation(self, other) } - (RegularExpression::Alternation(_), RegularExpression::Character(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Character(..)) => { Self::opunion_character_and_alternation(other, self) } - (RegularExpression::Alternation(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Alternation(..), RegularExpression::Repetition(..)) => { Self::opunion_repetition_and_alternation(other, self) } - (RegularExpression::Alternation(_), RegularExpression::Concat(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Concat(..)) => { Self::opunion_concat_and_alternation(other, self) } - (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(_)) => { + (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(..)) => { let mut new_alternation = Cow::Borrowed(other); for self_element in self_elements { new_alternation = new_alternation.union_(self_element); @@ -100,7 +100,7 @@ impl RegularExpression { that_repetition: &RegularExpression, ) -> RegularExpression { if let ( - RegularExpression::Character(_), + RegularExpression::Character(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_character, that_repetition) { @@ -168,10 +168,10 @@ impl RegularExpression { if let RegularExpression::Character(range) = element { 
set.insert(RegularExpression::Character(this_range.union(range))); had_character_union = true; - } else if matches!(element, RegularExpression::Repetition(_, _, _)) { + } else if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_character_and_repetition(this_character, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_character_union = true; } else { @@ -194,7 +194,7 @@ impl RegularExpression { this_character: &RegularExpression, that_concat: &RegularExpression, ) -> RegularExpression { - if let (RegularExpression::Character(_), RegularExpression::Concat(that_elements)) = + if let (RegularExpression::Character(..), RegularExpression::Concat(that_elements)) = (this_character, that_concat) { if that_elements.len() == 1 && that_elements[0] == *this_character { @@ -212,7 +212,7 @@ impl RegularExpression { that_repetition: &RegularExpression, ) -> RegularExpression { if let ( - RegularExpression::Concat(_), + RegularExpression::Concat(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_concat, that_repetition) { @@ -230,16 +230,16 @@ impl RegularExpression { this_concat: &RegularExpression, that_alternation: &RegularExpression, ) -> RegularExpression { - if let (RegularExpression::Concat(_), RegularExpression::Alternation(that_elements)) = + if let (RegularExpression::Concat(..), RegularExpression::Alternation(that_elements)) = (this_concat, that_alternation) { let mut set = BTreeSet::new(); let mut had_concat_union = false; for element in that_elements { - if matches!(element, RegularExpression::Repetition(_, _, _)) { + if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_concat_and_repetition(this_concat, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { 
set.insert(repetition); had_concat_union = true; } else { @@ -320,28 +320,28 @@ impl RegularExpression { let mut had_repetition_union = false; for element in that_elements { - if matches!(element, RegularExpression::Repetition(_, _, _)) { + if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_repetition_and_repetition(this_repetition, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { set.insert(element.clone()); } - } else if matches!(element, RegularExpression::Character(_)) { + } else if matches!(element, RegularExpression::Character(..)) { let repetition = Self::opunion_character_and_repetition(element, this_repetition); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { set.insert(element.clone()); } - } else if matches!(element, RegularExpression::Concat(_)) { + } else if matches!(element, RegularExpression::Concat(..)) { let repetition = Self::opunion_concat_and_repetition(element, this_repetition); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { diff --git a/src/regex/serializer.rs b/src/regex/serializer.rs deleted file mode 100644 index 0832756..0000000 --- a/src/regex/serializer.rs +++ /dev/null @@ -1,25 +0,0 @@ -use serde::{de, Deserializer, Serializer}; - -use super::*; - -impl serde::Serialize for RegularExpression { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - serializer.serialize_str(&self.to_string()) - } -} - -impl<'de> serde::Deserialize<'de> for RegularExpression { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - 
let regex_string = String::deserialize(deserializer)?; - match RegularExpression::new(&regex_string) { - Ok(regex) => Ok(regex), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } -} diff --git a/tests/proptest_strategies.rs b/tests/proptest_strategies.rs index dbf9bd7..5f638cf 100644 --- a/tests/proptest_strategies.rs +++ b/tests/proptest_strategies.rs @@ -24,52 +24,50 @@ //! determinization / set operations under test stay cheap. //! //! While *coverage* is uniform-in-support, the *distribution* is deliberately -//! shaped: per-automaton edge/epsilon/accept densities are sampled and the -//! character-class strategy favors single letters, so that degenerate -//! languages (∅, {""}) and near-complete blobs are occasional rather than -//! dominant. All weights stay strictly inside (0, 1), preserving the +//! shaped — the `inspect::stats` test measures the result and asserts floors: +//! +//! * Per-automaton edge/epsilon/accept densities are sampled, with the accept +//! density centered on ½ (where accepting/rejecting states are hardest to +//! merge, keeping minimal DFAs — and therefore the work done by minimize / +//! equivalence / state elimination — large). +//! * An optional "anchor" state is forced accepting so the empty language is +//! an occasional edge case instead of a fifth of the sample. +//! * A per-automaton acyclic mode (≈⅓ of cases) generates DAGs, whose finite +//! languages exercise the topological-sort paths of the cardinality and +//! max-length analyses that cyclic automata never reach. +//! * The character-class strategy favors single letters so that a `[]` +//! (empty-language) leaf does not collapse most expressions. +//! +//! All weights stay strictly inside (0, 1), preserving the //! non-null-probability guarantee. 
use proptest::prelude::*; use regex_charclass::char::Char; use regexsolver::CharRange; +use regexsolver::cardinality::Cardinality; use regexsolver::error::EngineError; use regexsolver::execution_profile::ExecutionProfileBuilder; use regexsolver::fast_automaton::FastAutomaton; -use regexsolver::fast_automaton::condition::Condition; -use regexsolver::fast_automaton::spanning_set::SpanningSet; use regexsolver::regex::RegularExpression; /// Fixed alphabet the strategies draw transition labels from. pub const ALPHABET: &[char] = &['a', 'b']; /// Maximum number of states a generated automaton can have. -pub const MAX_STATES: usize = 4; +pub const MAX_STATES: usize = 5; -/// The single-character range for the `i`-th alphabet letter. -fn letter(i: usize) -> CharRange { - let c = Char::new(ALPHABET[i]); +/// The single-character range for the `i`-th letter of `alphabet`. +fn letter_over(alphabet: &[char], i: usize) -> CharRange { + let c = Char::new(alphabet[i]); CharRange::new_from_range(c..=c) } -/// The spanning set induced by the alphabet (one base per letter + a "rest"). -fn spanning_set() -> SpanningSet { - let ranges: Vec = (0..ALPHABET.len()).map(letter).collect(); - SpanningSet::compute_spanning_set(&ranges) -} - -/// The transition-label bases: one [`CharRange`] per alphabet letter. -/// -/// We deliberately exclude the spanning set's "rest" range. The spanning set's -/// contract is that the rest holds exactly the characters that **no** transition -/// uses, so a label may only be a subset of the alphabet letters; that is also -/// precisely the standard "automaton over Σ" model. -fn bases(ss: &SpanningSet) -> Vec { - let _ = ss; - (0..ALPHABET.len()).map(letter).collect() +/// The single-character range for the `i`-th [`ALPHABET`] letter. +fn letter(i: usize) -> CharRange { + letter_over(ALPHABET, i) } -/// Number of transition-label bases (one per alphabet letter). +/// Number of transition-label letters (one per [`ALPHABET`] letter). 
fn num_bases() -> usize { ALPHABET.len() } @@ -82,25 +80,22 @@ fn arb_num_states() -> impl Strategy { (1usize..=MAX_STATES, 1usize..=MAX_STATES).prop_map(|(a, b)| a.max(b)) } -/// Builds an automaton from a structural description using only the public API. +/// Builds an automaton over `alphabet` from a structural description using +/// only the public API. /// /// `accepts[s]` marks state `s` accepting; each `(from, to, mask)` adds a -/// transition whose label is the union of the bases selected by `mask`; each -/// `(from, to)` in `eps` adds an epsilon transition. The start state is `0`. -fn build( +/// transition whose label is the union of the letters selected by `mask` +/// (`add_transition_from_range` grows the automaton's spanning set as +/// needed); each `(from, to)` in `eps` adds an epsilon transition. The start +/// state is `0`. +fn build_over( + alphabet: &[char], n: usize, accepts: &[bool], char_edges: &[(usize, usize, Vec)], eps: &[(usize, usize)], ) -> FastAutomaton { - let ss = spanning_set(); - let bs = bases(&ss); - let mut a = FastAutomaton::new_empty(); - // `new_empty` already owns state 0; the spanning set is applied before any - // transition so the conditions we add line up with it. 
- a.apply_new_spanning_set(&ss) - .expect("applying a spanning set to an empty automaton never fails"); for _ in 1..n { a.new_state(); } @@ -115,15 +110,11 @@ fn build( let mut range = CharRange::empty(); for (i, &on) in mask.iter().enumerate() { if on { - range = range.union(&bs[i]); + range = range.union(&letter_over(alphabet, i)); } } - if range.is_empty() { - continue; - } - let cond = Condition::from_range(&range, &ss) - .expect("a union of full spanning bases is always a valid condition"); - a.add_transition(*from, *to, &cond); + a.add_transition_from_range(*from, *to, &range) + .expect("adding a union of alphabet letters never fails"); } // Epsilon transitions are added last: `add_epsilon_transition` eagerly @@ -135,6 +126,16 @@ fn build( a } +/// [`build_over`] with the default [`ALPHABET`]. +fn build( + n: usize, + accepts: &[bool], + char_edges: &[(usize, usize, Vec<bool>)], + eps: &[(usize, usize)], +) -> FastAutomaton { + build_over(ALPHABET, n, accepts, char_edges, eps) +} + /// Strategy producing every DFA over the alphabet with `1..=MAX_STATES` states. /// /// Determinism is structural: for each state and each letter we choose at most @@ -143,22 +144,43 @@ fn build( /// An edge density and an accept density are sampled per automaton (both /// bounded away from 0 and 1, so every DFA keeps a positive probability). /// Mostly-total transition functions keep the states connected, which makes -/// degenerate (∅ / {""}) languages the exception rather than the rule. +/// degenerate (∅ / {""}) languages the exception rather than the rule. The +/// accept density is centered on ½ because that is where accepting/rejecting +/// states are hardest to merge, i.e. where minimal DFAs stay large. +/// +/// A per-automaton `acyclic` flag (≈⅓ of cases) remaps every chosen target +/// into the forward range `from+1..n`, producing a DAG and therefore a +/// **finite** language. 
Without it nearly every random automaton contains a +/// cycle, and the finite-language paths of the cardinality and max-length +/// analyses go untested. Cyclic mode still reaches every DFA, so coverage is +/// preserved. pub fn arb_dfa() -> impl Strategy { - (arb_num_states(), 0.6f64..0.97, 0.4f64..0.9) - .prop_flat_map(|(n, edge_density, accept_density)| { + ( + arb_num_states(), + 0.6f64..0.97, + 0.25f64..0.75, + prop::bool::weighted(0.35), + ) + .prop_flat_map(|(n, edge_density, accept_density, acyclic)| { let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // An "anchor" state forced accepting most of the time: without it + // the whole accept vector samples all-false often enough that the + // empty language eats a fifth of the sample. Non-start states are + // preferred — anchoring the start only inflates the {""} corner. + // The `None` branch keeps every accept subset (incl. all-false) + // reachable. + let anchor = prop::option::weighted(0.85, 1usize.min(n - 1)..n); // transition function: tf[state][base] = optional target state let tf = prop::collection::vec( - prop::collection::vec( - prop::option::weighted(edge_density, 0usize..n), - num_bases(), - ), + prop::collection::vec(prop::option::weighted(edge_density, 0usize..n), num_bases()), n, ); - (Just(n), accepts, tf) + (Just(n), accepts, anchor, tf, Just(acyclic)) }) - .prop_map(|(n, accepts, tf)| { + .prop_map(|(n, mut accepts, anchor, tf, acyclic)| { + if let Some(k) = anchor { + accepts[k] = true; + } let nb = num_bases(); let mut edges: Vec<(usize, usize, Vec)> = Vec::new(); for (from, row) in tf.iter().enumerate() { @@ -168,7 +190,18 @@ pub fn arb_dfa() -> impl Strategy { std::collections::BTreeMap::new(); for (base, target) in row.iter().enumerate() { if let Some(t) = target { - by_target.entry(*t).or_insert_with(|| vec![false; nb])[base] = true; + let t = if acyclic { + if from + 1 >= n { + // the last state of a DAG has no outgoing edge + continue; + } + // remap 
into the forward range; every forward + // target keeps a positive probability + from + 1 + (*t % (n - from - 1)) + } else { + *t + }; + by_target.entry(t).or_insert_with(|| vec![false; nb])[base] = true; } } for (to, mask) in by_target { @@ -188,36 +221,72 @@ pub fn arb_dfa() -> impl Strategy { /// comparable across sizes. Epsilon and accept densities are sampled too. All /// densities stay strictly inside (0, 1), so every NFA keeps a positive /// probability. +/// +/// As in [`arb_dfa`], the accept density is centered on ½ and a per-automaton +/// `acyclic` flag (≈⅓ of cases) keeps only forward (`from < to`) edges, +/// producing finite languages; the density is rescaled to the smaller target +/// pool so the out-degree stays comparable. Epsilon transitions are kept rare +/// because [`FastAutomaton::add_epsilon_transition`] eagerly folds the target +/// state into the source, which merges languages and shrinks minimal DFAs. pub fn arb_nfa() -> impl Strategy { + arb_nfa_over(ALPHABET) +} + +/// [`arb_nfa`] generalized to an arbitrary alphabet, so two operands of a +/// binary operation can be generated over *different* alphabets — the only +/// way to exercise `SpanningSet::merge` and the `ConditionConverter` +/// re-projection (same-alphabet operands share an identical spanning set and +/// the conversion is the identity). 
+pub fn arb_nfa_over(alphabet: &'static [char]) -> impl Strategy { ( arb_num_states(), 1.0f64..2.8, - 0.02f64..0.18, - 0.4f64..0.9, + 0.02f64..0.12, + 0.25f64..0.75, + prop::bool::weighted(0.35), ) - .prop_flat_map(|(n, target_out_degree, eps_density, accept_density)| { - let label_density = - (target_out_degree / (n as f64 * num_bases() as f64)).clamp(0.02, 0.95); - let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); - // labels[from][to] = mask over the alphabet letters - let labels = prop::collection::vec( - prop::collection::vec( - prop::collection::vec(prop::bool::weighted(label_density), num_bases()), + .prop_flat_map( + move |(n, target_out_degree, eps_density, accept_density, acyclic)| { + // In acyclic mode only the upper triangle of the matrix survives, + // so the average target pool is half as big. + let effective_targets = if acyclic { + (n as f64 / 2.0).max(1.0) + } else { + n as f64 + }; + let label_density = (target_out_degree + / (effective_targets * alphabet.len() as f64)) + .clamp(0.02, 0.95); + let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // see arb_dfa: keeps the empty language an edge case, not a fifth + // of the sample + let anchor = prop::option::weighted(0.85, 1usize.min(n - 1)..n); + // labels[from][to] = mask over the alphabet letters + let labels = prop::collection::vec( + prop::collection::vec( + prop::collection::vec(prop::bool::weighted(label_density), alphabet.len()), + n, + ), n, - ), - n, - ); - // eps[from][to] = whether an epsilon transition is present - let eps = prop::collection::vec( - prop::collection::vec(prop::bool::weighted(eps_density), n), - n, - ); - (Just(n), accepts, labels, eps) - }) - .prop_map(|(n, accepts, labels, eps)| { + ); + // eps[from][to] = whether an epsilon transition is present + let eps = prop::collection::vec( + prop::collection::vec(prop::bool::weighted(eps_density), n), + n, + ); + (Just(n), accepts, anchor, labels, eps, Just(acyclic)) + 
}, + ) + .prop_map(move |(n, mut accepts, anchor, labels, eps, acyclic)| { + if let Some(k) = anchor { + accepts[k] = true; + } let mut char_edges = Vec::new(); for (from, row) in labels.iter().enumerate() { for (to, mask) in row.iter().enumerate() { + if acyclic && to <= from { + continue; + } if mask.iter().any(|&b| b) { char_edges.push((from, to, mask.clone())); } @@ -226,12 +295,13 @@ pub fn arb_nfa() -> impl Strategy { let mut eps_edges = Vec::new(); for (from, row) in eps.iter().enumerate() { for (to, &on) in row.iter().enumerate() { - if on && from != to { + let backward = acyclic && to <= from; + if on && from != to && !backward { eps_edges.push((from, to)); } } } - build(n, &accepts, &char_edges, &eps_edges) + build_over(alphabet, n, &accepts, &char_edges, &eps_edges) }) } @@ -260,19 +330,26 @@ fn arb_charrange() -> impl Strategy { /// Strategy producing regular expressions over the four [`RegularExpression`] /// variants up to a bounded recursion depth. +/// +/// Repetition gets the heaviest weight: it is the variant that feeds the +/// `{n,m}` expansion, the simplifier and the loop handling of state +/// elimination, and stacking it (`(a*){2}`-style nesting) is where those +/// paths historically break. Concat and alternation still keep substantial +/// weight so all shapes appear. 
pub fn arb_regex() -> impl Strategy<Value = RegularExpression> { let leaf = arb_charrange().prop_map(RegularExpression::Character); - leaf.prop_recursive(3, 24, 3, |inner| { + leaf.prop_recursive(4, 48, 3, |inner| { prop_oneof![ - (inner.clone(), 0u32..=2, 0u32..=2, any::<bool>()).prop_map( + 3 => (inner.clone(), 0u32..=2, 0u32..=2, any::<bool>()).prop_map( |(r, min, extra, has_max)| { + // max is min + extra, so the bounds are always valid let max = if has_max { Some(min + extra) } else { None }; RegularExpression::Repetition(Box::new(r), min, max) } ), - prop::collection::vec(inner.clone(), 1..=3) - .prop_map(|v| RegularExpression::Concat(v.into_iter().collect())), - prop::collection::vec(inner, 1..=3).prop_map(RegularExpression::Alternation), + 2 => prop::collection::vec(inner.clone(), 1..=3) + .prop_map(|v| RegularExpression::Concat(v.into())), + 2 => prop::collection::vec(inner, 1..=3).prop_map(RegularExpression::Alternation), ] }) } @@ -304,14 +381,15 @@ fn complemented(a: &FastAutomaton) -> Option<FastAutomaton> { }) } -/// All strings up to length 4 over the alphabet (plus the empty string). -fn probes() -> Vec<String> { +/// All strings up to length `max_len` over `alphabet` (plus the empty +/// string). +fn probes_over(alphabet: &[char], max_len: usize) -> Vec<String> { let mut all = vec![String::new()]; let mut frontier = vec![String::new()]; - for _ in 0..4 { + for _ in 0..max_len { let mut next = Vec::new(); for w in &frontier { - for &c in ALPHABET { + for &c in alphabet { let mut s = w.clone(); s.push(c); next.push(s); @@ -323,6 +401,96 @@ fn probes() -> Vec<String> { all } +/// All strings up to length 4 over [`ALPHABET`] (plus the empty string). +fn probes() -> Vec<String> { + probes_over(ALPHABET, 4) +} + +/// Asserts that intersection / union / difference of `a` and `b` agree with +/// the boolean combination of the operands on every probe string. 
+fn assert_set_ops_membership( + a: &FastAutomaton, + b: &FastAutomaton, + probes: &[String], +) -> Result<(), TestCaseError> { + if let Some(inter) = bounded(|| a.intersection(b)) { + for s in probes { + prop_assert_eq!( + inter.is_match(s), + a.is_match(s) && b.is_match(s), + "intersection membership for {:?}", + s + ); + } + } + if let Some(union) = bounded(|| a.union(b)) { + for s in probes { + prop_assert_eq!( + union.is_match(s), + a.is_match(s) || b.is_match(s), + "union membership for {:?}", + s + ); + } + } + // `difference` determinizes the subtrahend itself. + if let Some(diff) = bounded(|| a.difference(b)) { + for s in probes { + prop_assert_eq!( + diff.is_match(s), + a.is_match(s) && !b.is_match(s), + "difference membership for {:?}", + s + ); + } + } + Ok(()) +} + +/// Decomposition oracle for repetition: `s` is in L(a){min,max} iff `s` +/// splits into k pieces, each in L(a), for some valid k. Piece counts +/// saturate at `min` once they can only grow (relevant for unbounded +/// maxima and for "" ∈ L(a), which allows padding with empty pieces). +fn repeat_decomposition_oracle(a: &FastAutomaton, s: &str, min: u32, max: Option) -> bool { + let min = min as usize; + let cap = max.map(|m| m as usize).unwrap_or(min).max(min); + let accepts_empty = a.is_match(""); + let len = s.len(); + + // reach[i][k]: the prefix of length i splits into exactly k pieces + // (k saturated at cap + 1 to keep the table finite). + let k_slots = cap + 2; + let mut reach = vec![vec![false; k_slots]; len + 1]; + reach[0][0] = true; + for i in 0..=len { + for k in 0..k_slots { + if !reach[i][k] { + continue; + } + let next_k = (k + 1).min(cap + 1); + // Pad with an empty piece. + if accepts_empty { + reach[i][next_k] = true; + } + // Consume a non-empty piece. 
+ for j in i + 1..=len { + if a.is_match(&s[i..j]) { + reach[j][next_k] = true; + } + } + } + } + + let k_ok = |k: usize| { + k >= min + && match max { + Some(m) => k <= m as usize, + None => true, + } + }; + (0..k_slots).any(|k| reach[len][k] && k_ok(k)) +} + proptest! { #![proptest_config(ProptestConfig::with_cases(192))] @@ -382,35 +550,167 @@ proptest! { /// combination of the operands on every probe string. #[test] fn set_ops_membership(a in arb_nfa(), b in arb_nfa()) { - let ps = probes(); + assert_set_ops_membership(&a, &b, &probes())?; + } + + /// Set operations across operands built over *overlapping but different* + /// alphabets ({a,b} vs {b,c}): the operands carry different spanning + /// sets, so `SpanningSet::merge` and the `ConditionConverter` + /// re-projection do real work (same-alphabet pairs convert via the + /// identity). The shared letter `b` keeps the intersections non-trivial. + #[test] + fn set_ops_membership_overlapping_alphabets( + a in arb_nfa_over(&['a', 'b']), + b in arb_nfa_over(&['b', 'c']), + ) { + assert_set_ops_membership(&a, &b, &probes_over(&['a', 'b', 'c'], 4))?; + } + + /// Set operations across operands built over *disjoint* alphabets + /// ({a,b} vs {c,d}): the merged spanning set shares no base with either + /// source, the most extreme re-projection. The intersection collapses to + /// at most {""} — itself a worthwhile edge case. + #[test] + fn set_ops_membership_disjoint_alphabets( + a in arb_nfa_over(&['a', 'b']), + b in arb_nfa_over(&['c', 'd']), + ) { + assert_set_ops_membership(&a, &b, &probes_over(&['a', 'b', 'c', 'd'], 3))?; + } + + /// `get_length` and `get_cardinality` agree with brute-force enumeration. + /// The probes cover *every* string up to length 4, so they are exactly + /// the language whenever the maximum length is ≤ 4, and a complete + /// census of its short strings otherwise. 
+ #[test] + fn length_cardinality_match_brute_force(a in arb_nfa()) { + let (min, max) = a.get_length(); + let matched_lengths: Vec = probes() + .iter() + .filter(|s| a.is_match(s)) + .map(|s| s.chars().count() as u32) + .collect(); + + // Minimum: any string of length ≤ 4 is a probe, so a language with + // min ≤ 4 has a matched probe of exactly that length. + match (min, matched_lengths.iter().min()) { + (Some(min_len), Some(&shortest)) => { + prop_assert_eq!(min_len, shortest, "min length disagrees with enumeration"); + } + (Some(min_len), None) => { + prop_assert!(min_len > 4, "min ≤ 4 but no probe matched"); + } + (None, Some(_)) => prop_assert!(false, "empty language matched a probe"), + (None, None) => {} + } - if let Some(inter) = bounded(|| a.intersection(&b)) { - for s in &ps { + if let Some(max_len) = max + && max_len <= 4 + { + // The probes enumerate the whole language. + prop_assert_eq!( + Some(max_len), + matched_lengths.iter().max().copied(), + "max length disagrees with enumeration" + ); + if let Some(cardinality) = bounded(|| a.get_cardinality()) { prop_assert_eq!( - inter.is_match(s), a.is_match(s) && b.is_match(s), - "intersection membership for {:?}", s + cardinality, + Cardinality::Integer(matched_lengths.len() as u32), + "cardinality disagrees with enumeration" ); } + } else if max.is_none() + && min.is_some() + && let Some(cardinality) = bounded(|| a.get_cardinality()) + { + // A cycle on an accepting path means infinitely many strings. + prop_assert_eq!( + cardinality, + Cardinality::Infinite, + "infinite language with non-infinite cardinality" + ); } - if let Some(union) = bounded(|| a.union(&b)) { - for s in &ps { + } + + /// `FastAutomaton::concat` agrees with the split-membership oracle: + /// s ∈ L(a)·L(b) iff some split s = u·v has u ∈ L(a) and v ∈ L(b). 
+ #[test] + fn automaton_concat_matches_split_oracle(a in arb_nfa(), b in arb_nfa()) { + if let Some(concat) = bounded(|| a.concat(&b)) { + for s in probes() { + let expected = + (0..=s.len()).any(|i| a.is_match(&s[..i]) && b.is_match(&s[i..])); prop_assert_eq!( - union.is_match(s), a.is_match(s) || b.is_match(s), - "union membership for {:?}", s + concat.is_match(&s), expected, + "concat membership for {:?}", s ); } } - // `difference` determinizes the subtrahend itself. - if let Some(diff) = bounded(|| a.difference(&b)) { - for s in &ps { + } + + /// `FastAutomaton::repeat` agrees with a decomposition oracle computed by + /// dynamic programming over (position, piece-count) — independent of the + /// engine's own repeat construction (which the regex route would reuse). + #[test] + fn automaton_repeat_matches_decomposition_oracle( + a in arb_nfa(), + min in 0u32..3, + extra in 0u32..2, + unbounded in any::(), + ) { + let max = if unbounded { None } else { Some(min + extra) }; + if let Some(repeated) = bounded(|| a.repeat(min, max)) { + for s in probes() { + let expected = repeat_decomposition_oracle(&a, &s, min, max); prop_assert_eq!( - diff.is_match(s), a.is_match(s) && !b.is_match(s), - "difference membership for {:?}", s + repeated.is_match(&s), expected, + "repeat({}, {:?}) membership for {:?}", min, max, s ); } } } + /// `Term::union` / `Term::intersection` over more than 3 operands (the + /// parallel dispatch path when the `parallel` feature is on) agree with + /// sequential pairwise folds. 
+ #[test] + fn many_operand_term_ops_match_pairwise_folds( + a in arb_nfa(), b in arb_nfa(), c in arb_nfa(), d in arb_nfa(), e in arb_nfa(), + ) { + use regexsolver::Term; + + let operands: Vec = [&b, &c, &d, &e] + .into_iter() + .map(|x| Term::from_automaton(x.clone())) + .collect(); + let first = Term::from_automaton(a.clone()); + + if let Some(many) = bounded(|| { + Ok(first.union(&operands)?.to_automaton()?.into_owned()) + }) && let Some(pairwise) = bounded(|| { + let mut acc = a.clone(); + for x in [&b, &c, &d, &e] { + acc = acc.union(x)?; + } + Ok(acc) + }) && let Some(eq) = bounded(|| many.equivalent(&pairwise)) { + prop_assert!(eq, "5-operand union disagrees with pairwise folds"); + } + + if let Some(many) = bounded(|| { + Ok(first.intersection(&operands)?.to_automaton()?.into_owned()) + }) && let Some(pairwise) = bounded(|| { + let mut acc = a.clone(); + for x in [&b, &c, &d, &e] { + acc = acc.intersection(x)?; + } + Ok(acc) + }) && let Some(eq) = bounded(|| many.equivalent(&pairwise)) { + prop_assert!(eq, "5-operand intersection disagrees with pairwise folds"); + } + } + /// `subset` and `equivalent` agree: mutual subset iff equivalent; both are /// reflexive. #[test] @@ -477,6 +777,7 @@ mod inspect { use super::*; use proptest::strategy::{Strategy, ValueTree}; use proptest::test_runner::TestRunner; + use regex_charclass::CharacterClass; fn samples(strat: S, n: usize) -> Vec { let mut runner = TestRunner::deterministic(); @@ -506,61 +807,369 @@ mod inspect { } } - /// Classifies the language of an automaton for the quality summary. - fn classify(a: &FastAutomaton) -> &'static str { - if a.is_empty() { - "empty" - } else if a.is_empty_string() { - "{\"\"}" - } else if a - .determinize() - .map(|d| d.is_total()) - .unwrap_or(false) - { - "total" + /// The language class of a generated entity, ordered from degenerate to + /// rich. 
+ /// + /// `Empty`, `EmptyString` and `Total` are the corners of the language + /// lattice: useful as occasional edge cases (they hit the `is_empty` / + /// complement / difference fast paths) but they exercise nothing else. + /// `Finite` languages take the topological-sort path of the cardinality + /// and max-length analyses; `Infinite` ones take the cycle paths of state + /// elimination and repeat synthesis. A quality sample needs both in bulk. + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] + enum LangClass { + Empty, + EmptyString, + Total, + Finite, + Infinite, + } + + /// Reduces an automaton to its canonical minimal DFA. + fn minimal_dfa(a: &FastAutomaton) -> FastAutomaton { + let mut m = determinized(a).expect("generated automata always determinize in budget"); + bounded(|| m.minimize()).expect("generated automata always minimize in budget"); + m + } + + /// Classifies the language of a **minimal DFA**. + /// + /// Unlike the trivial-vs-"interesting" split this used to be, the + /// non-trivial bulk is split into finite and infinite languages, which + /// exercise disjoint code paths (see [`LangClass`]). 
+ fn classify(m: &FastAutomaton) -> LangClass { + if m.is_empty() { + LangClass::Empty + } else if m.is_empty_string() { + LangClass::EmptyString + } else if m.is_total() { + // exact on a DFA + LangClass::Total + } else if m.get_length().1.is_some() { + LangClass::Finite } else { - "interesting" + LangClass::Infinite } } - fn automaton_stats(name: &str, autos: &[FastAutomaton]) { - let n = autos.len() as f64; - let mut counts = std::collections::BTreeMap::new(); - let mut states = 0usize; - let mut edges = 0usize; - for a in autos { - *counts.entry(classify(a)).or_insert(0usize) += 1; - states += a.get_number_of_states(); - edges += a - .states_vec() - .iter() - .map(|&s| a.transitions_from_vec(s).len()) - .sum::(); + /// Canonical fingerprint of a language: the minimal DFA, renumbered in + /// BFS order with the outgoing transitions of each state sorted by label. + /// Minimal DFAs are unique up to isomorphism, so two automata share a key + /// iff they accept the same language — this is what lets the stats count + /// *distinct* languages instead of distinct syntax trees. + fn language_key(m: &FastAutomaton) -> String { + use std::fmt::Write; + let ss = m.get_spanning_set(); + let mut order = vec![m.get_start_state()]; + let mut ids = std::collections::HashMap::new(); + ids.insert(m.get_start_state(), 0usize); + let mut key = String::new(); + let mut i = 0; + while i < order.len() { + let s = order[i]; + i += 1; + // In a DFA the labels leaving a state are disjoint, hence unique, + // so sorting by label gives a deterministic traversal order. 
+ let mut out: Vec<(String, usize)> = m + .transitions_from_vec(s) + .into_iter() + .map(|(c, t)| { + ( + c.to_range(ss) + .expect("condition always converts to a range") + .to_regex(), + t, + ) + }) + .collect(); + out.sort(); + write!(key, "{}", if m.is_accepted(s) { 'A' } else { 'r' }).unwrap(); + for (label, t) in out { + let id = match ids.get(&t) { + Some(&id) => id, + None => { + let id = order.len(); + ids.insert(t, id); + order.push(t); + id + } + }; + write!(key, " {label}>{id}").unwrap(); + } + key.push(';'); + } + key + } + + /// Everything we measure about one generated automaton. + struct Measure { + // Structure of the generated entity itself. + edges: usize, + multi_base_edges: usize, + deterministic: bool, + // Properties of its *language*, computed on the minimal DFA. + class: LangClass, + minimal_states: usize, + accepts_empty_string: bool, + key: String, + } + + fn measure(a: &FastAutomaton) -> Measure { + let mut edges = 0; + let mut multi_base_edges = 0; + for s in a.states_vec() { + for (cond, _) in a.transitions_from_vec(s) { + edges += 1; + if cond + .get_binary_representation() + .iter() + .filter(|&&b| b) + .count() + > 1 + { + multi_base_edges += 1; + } + } + } + let m = minimal_dfa(a); + Measure { + edges, + multi_base_edges, + deterministic: a.is_deterministic(), + class: classify(&m), + minimal_states: m.get_number_of_states(), + accepts_empty_string: m.is_match(""), + key: language_key(&m), + } + } + + /// Aggregated quality metrics over a sample; what the strategies are + /// evaluated (and asserted) on. + struct Quality { + /// Share of `Empty` + `EmptyString` + `Total` languages. Wanted as a + /// small minority: present (they are real edge cases) but not eating + /// the sample. + degenerate_pct: f64, + finite_pct: f64, + infinite_pct: f64, + /// Distinct languages (by canonical minimal DFA) over sample size. + /// Duplicates re-test the same language and are wasted cases. 
+ distinct_pct: f64, + /// Average minimal-DFA size: the number of Myhill-Nerode classes is + /// what minimize / equivalence / state elimination actually scale + /// with, so this — not the raw state count — is language complexity. + avg_minimal_states: f64, + /// Share of languages needing a minimal DFA of ≥ 3 states. + rich_pct: f64, + accepts_empty_string_pct: f64, + /// Share of transitions whose condition spans more than one base of + /// the spanning set (exercises the bitvector paths beyond single + /// bits). + multi_base_edge_pct: f64, + /// Share of genuinely nondeterministic automata (only meaningful for + /// the NFA strategy: a "NFA" that is already deterministic never + /// exercises subset construction). + nondeterministic_pct: f64, + } + + fn quality(name: &str, measures: &[Measure]) -> Quality { + let n = measures.len() as f64; + let count = |f: &dyn Fn(&Measure) -> bool| { + 100.0 * measures.iter().filter(|m| f(m)).count() as f64 / n + }; + + let distinct: std::collections::HashSet<&str> = + measures.iter().map(|m| m.key.as_str()).collect(); + let edges: usize = measures.iter().map(|m| m.edges).sum(); + let multi: usize = measures.iter().map(|m| m.multi_base_edges).sum(); + + let mut histogram = std::collections::BTreeMap::new(); + for m in measures { + *histogram.entry(m.minimal_states).or_insert(0usize) += 1; } - let pct = |k: &str| 100.0 * *counts.get(k).unwrap_or(&0) as f64 / n; + + let q = Quality { + degenerate_pct: count(&|m| { + matches!( + m.class, + LangClass::Empty | LangClass::EmptyString | LangClass::Total + ) + }), + finite_pct: count(&|m| m.class == LangClass::Finite), + infinite_pct: count(&|m| m.class == LangClass::Infinite), + distinct_pct: 100.0 * distinct.len() as f64 / n, + avg_minimal_states: measures.iter().map(|m| m.minimal_states).sum::() as f64 / n, + rich_pct: count(&|m| m.minimal_states >= 3), + accepts_empty_string_pct: count(&|m| m.accepts_empty_string), + multi_base_edge_pct: 100.0 * multi as f64 / edges.max(1) as 
f64, + nondeterministic_pct: count(&|m| !m.deterministic), + }; + + println!( + "{name}: degenerate {:>4.1}% (∅ {:.1}% | {{\"\"}} {:.1}% | Σ* {:.1}%) | finite {:>4.1}% | infinite {:>4.1}%", + q.degenerate_pct, + count(&|m| m.class == LangClass::Empty), + count(&|m| m.class == LangClass::EmptyString), + count(&|m| m.class == LangClass::Total), + q.finite_pct, + q.infinite_pct, + ); println!( - "{name}: empty {:>5.1}% | {{\"\"}} {:>5.1}% | total {:>5.1}% | interesting {:>5.1}% | avg states {:.2} | avg edges {:.2}", - pct("empty"), pct("{\"\"}"), pct("total"), pct("interesting"), - states as f64 / n, edges as f64 / n, + "{name}: distinct languages {:>4.1}% | accepts \"\" {:>4.1}% | nondet {:>4.1}% | multi-base edges {:>4.1}%", + q.distinct_pct, + q.accepts_empty_string_pct, + q.nondeterministic_pct, + q.multi_base_edge_pct, ); + println!( + "{name}: minimal-DFA states avg {:.2}, ≥3 {:>4.1}%, histogram {:?}", + q.avg_minimal_states, q.rich_pct, histogram, + ); + q } - /// Quantitative quality summary over a larger sample. + /// Operator coverage of a generated regular expression; the round-trip + /// (state elimination) and simplification code paths are keyed on these + /// shapes. 
+ #[derive(Default)] + struct RegexFacets { + unbounded_repetition: bool, + bounded_repetition: bool, + nested_repetition: bool, + alternation: bool, + multi_char_class: bool, + } + + fn regex_facets(r: &RegularExpression, inside_repetition: bool, f: &mut RegexFacets) { + match r { + RegularExpression::Character(range) => { + let letters = (0..ALPHABET.len()) + .filter(|&i| !range.intersection(&letter(i)).is_empty()) + .count(); + if letters > 1 || range.is_total() { + f.multi_char_class = true; + } + } + RegularExpression::Repetition(inner, _, max) => { + if max.is_some() { + f.bounded_repetition = true; + } else { + f.unbounded_repetition = true; + } + if inside_repetition { + f.nested_repetition = true; + } + regex_facets(inner, true, f); + } + RegularExpression::Concat(parts) => { + for p in parts { + regex_facets(p, inside_repetition, f); + } + } + RegularExpression::Alternation(parts) => { + f.alternation = true; + for p in parts { + regex_facets(p, inside_repetition, f); + } + } + } + } + + /// Quantitative quality summary over a larger sample, with floors the + /// strategies must keep. The sample runner is deterministic, so the + /// numbers — and therefore the assertions — are reproducible. 
#[test] fn stats() { const N: usize = 300; let regexes = samples(arb_regex(), N); - let regex_autos: Vec = regexes + let regex_measures: Vec = regexes .iter() - .map(|r| r.to_automaton().expect("small regexes always convert")) + .map(|r| measure(&r.to_automaton().expect("small regexes always convert"))) .collect(); - let avg_len = - regexes.iter().map(|r| r.to_string().len()).sum::() as f64 / N as f64; - automaton_stats("regex", ®ex_autos); - println!("regex: avg pattern length {avg_len:.1}"); + let regex_q = quality("regex", ®ex_measures); + + let avg_len = regexes.iter().map(|r| r.to_string().len()).sum::() as f64 / N as f64; + let mut facet_counts = [0usize; 5]; + for r in ®exes { + let mut f = RegexFacets::default(); + regex_facets(r, false, &mut f); + for (i, hit) in [ + f.unbounded_repetition, + f.bounded_repetition, + f.nested_repetition, + f.alternation, + f.multi_char_class, + ] + .into_iter() + .enumerate() + { + facet_counts[i] += hit as usize; + } + } + let fpct = |i: usize| 100.0 * facet_counts[i] as f64 / N as f64; + println!( + "regex: avg pattern length {avg_len:.1} | unbounded-rep {:.1}% | bounded-rep {:.1}% | nested-rep {:.1}% | alternation {:.1}% | multi-char class {:.1}%", + fpct(0), + fpct(1), + fpct(2), + fpct(3), + fpct(4), + ); + + let dfa_q = quality( + "dfa ", + &samples(arb_dfa(), N) + .iter() + .map(measure) + .collect::>(), + ); + let nfa_q = quality( + "nfa ", + &samples(arb_nfa(), N) + .iter() + .map(measure) + .collect::>(), + ); - automaton_stats("dfa ", &samples(arb_dfa(), N)); - automaton_stats("nfa ", &samples(arb_nfa(), N)); + for (name, q) in [("regex", ®ex_q), ("dfa", &dfa_q), ("nfa", &nfa_q)] { + // The trivial corner languages should be present but a minority. + assert!( + q.degenerate_pct < 25.0, + "{name}: too many degenerate languages" + ); + // Both bulk classes must be well represented. 
+ assert!( + q.finite_pct >= 15.0, + "{name}: finite languages under-represented" + ); + assert!( + q.infinite_pct >= 15.0, + "{name}: infinite languages under-represented" + ); + // The sample must not keep re-testing the same languages. + assert!( + q.distinct_pct >= 45.0, + "{name}: not enough distinct languages" + ); + // Language complexity: minimal DFAs must not collapse to 1-2 states. + assert!(q.rich_pct >= 35.0, "{name}: minimal DFAs too small"); + // Both "" ∈ L and "" ∉ L need bulk representation. + assert!( + (20.0..=80.0).contains(&q.accepts_empty_string_pct), + "{name}: empty-string acceptance unbalanced" + ); + // Conditions spanning several bases must show up regularly. + assert!( + q.multi_base_edge_pct >= 10.0, + "{name}: multi-base conditions too rare" + ); + } + // An NFA strategy that mostly produces DFAs never exercises subset + // construction. + assert!( + nfa_q.nondeterministic_pct >= 50.0, + "nfa: mostly deterministic" + ); } } diff --git a/tests/readme_examples.rs b/tests/readme_examples.rs new file mode 100644 index 0000000..c4e1f99 --- /dev/null +++ b/tests/readme_examples.rs @@ -0,0 +1,77 @@ +//! Keeps the README's examples honest: these tests are the README snippets, +//! verbatim. If one fails, update the README. 
+ +use regexsolver::Term; +use regexsolver::error::EngineError; + +#[test] +fn readme_automaton_building_example() -> Result<(), EngineError> { + use regex_charclass::char::Char; + use regexsolver::CharRange; + use regexsolver::fast_automaton::FastAutomaton; + + // Build an automaton matching "[a-c][0-9]*" by hand: + let mut automaton = FastAutomaton::new_empty(); + let s1 = automaton.new_state(); + automaton.accept(s1); + + let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c')); + let digits = CharRange::new_from_range(Char::new('0')..=Char::new('9')); + automaton.add_transition_from_range(0, s1, &a_to_c)?; + automaton.add_transition_from_range(s1, s1, &digits)?; + + assert!(automaton.is_match("b42")); + assert!(!automaton.is_match("4b")); + assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*"); + + Ok(()) +} + +#[test] +fn readme_hero_example() -> Result<(), EngineError> { + let a = Term::from_pattern("(ab|xy){2}")?; + let b = Term::from_pattern(".*xy")?; + + // Which strings match BOTH patterns? Get the answer as a regex: + let both = a.intersection(&[b])?; + assert_eq!(both.to_pattern(), "(ab|xy)xy"); + + // ...and sample them: + assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); + + Ok(()) +} + +#[test] +fn readme_regular_expression_example() -> Result<(), EngineError> { + use regexsolver::cardinality::Cardinality; + use regexsolver::regex::RegularExpression; + + // A validation pattern for an order id, e.g. "ORD-2024-12345". + let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?; + + // How long can matching ids get? Size your database column accordingly. + assert_eq!(pattern.get_length(), (Some(13), Some(15))); + + // How many distinct ids does the pattern allow? + assert_eq!(pattern.get_cardinality(), Cardinality::Integer(111_000_000)); + + // The AST is a plain enum: walk it to lint patterns, e.g. reject + // validation rules that accept unboundedly long input. 
+ fn has_unbounded_repetition(regex: &RegularExpression) -> bool { + match regex { + RegularExpression::Character(_) => false, + RegularExpression::Repetition(inner, _, max) => { + max.is_none() || has_unbounded_repetition(inner) + } + RegularExpression::Concat(parts) => parts.iter().any(has_unbounded_repetition), + RegularExpression::Alternation(parts) => parts.iter().any(has_unbounded_repetition), + } + } + assert!(!has_unbounded_repetition(&pattern)); + assert!(has_unbounded_repetition(&RegularExpression::new( + ".*@example\\.com" + )?)); + + Ok(()) +} From 4c2f06397db9aa11e53b9b2056a4af5d1a0151fb Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:03:32 +0200 Subject: [PATCH 59/62] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0cfb51a..e985ce0 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); Under the hood, every pattern compiles to a finite automaton: -

the minimal automaton of (ab|cd)*

+

the minimal automaton of (ab|cd)*

(ab|cd)* compiled to its minimal automaton — generated with this library's as_dot()

## Try it From eacd8a9a9d97796e68f8ae16f05d458a22fa8baf Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:42:58 +0200 Subject: [PATCH 60/62] Remove minimize execution profile --- README.md | 60 +++------- src/execution_profile.rs | 117 -------------------- src/fast_automaton/operation/determinize.rs | 5 - 3 files changed, 14 insertions(+), 168 deletions(-) diff --git a/README.md b/README.md index e985ce0..b2c4ae3 100644 --- a/README.md +++ b/README.md @@ -23,16 +23,16 @@ assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); ## What would you use this for? -- **Safe migrations** — `old_rule.subset(&new_rule)?`: does the new validation pattern accept *everything* the old one did? -- **Test-data generation** — `term.generate_strings(100, 0)?`: produce strings matching any pattern, with pagination. -- **Rule analysis** — find shadowed or overlapping routes, firewall rules, and validators with `intersection` / `difference`. -- **Equivalence proofs** — `a.equivalent(&b)?`: show that two differently-written patterns match exactly the same strings. -- **Pattern simplification** — every operation returns a `Term` you can turn back into a clean pattern with `to_pattern()`. +- **Safe migrations** - `old_rule.subset(&new_rule)?`: does the new validation pattern accept *everything* the old one did? +- **Test-data generation** - `term.generate_strings(100, 0)?`: produce strings matching any pattern, with pagination. +- **Rule analysis**: find shadowed or overlapping routes, firewall rules, and validators with `intersection` / `difference`. +- **Equivalence proofs** - `a.equivalent(&b)?`: show that two differently-written patterns match exactly the same strings. +- **Pattern simplification**: every operation returns a `Term` you can turn back into a clean pattern with `to_pattern()`. Under the hood, every pattern compiles to a finite automaton:

the minimal automaton of (ab|cd)*

-

(ab|cd)* compiled to its minimal automaton — generated with this library's as_dot()

+

(ab|cd)* compiled to its minimal automaton, generated with this library's as_dot()

## Try it @@ -42,24 +42,10 @@ git clone https://github.com/RegexSolver/regexsolver && cd regexsolver # How do two patterns relate? (equivalence, subsets, intersection, differences) cargo run --example relate -- "(ab|xy){2}" ".*xy" -# Sample strings matching a pattern +# Generate n sample strings matching a pattern cargo run --example generate -- "[a-z]{2}[0-9]" 20 ``` -```text -a = (ab|xy){2} -b = .*xy - -equivalent: no -a subset of b: false -b subset of a: false - -a ∩ b = (ab|xy)xy - e.g. ["xyxy", "abxy"] -a - b = (ab|xy)ab -b - a = (x{1,2}|ax|([^ax]|a[^b]|x[^y]).*x|(ab|xy)(x{2}|ax|([^ax]|a[^b]|x[^y]|(ab|xy).).*x|(ab|xy)x))y -``` - Or in your own project: ```bash @@ -74,15 +60,15 @@ regexsolver = { version = "1", default-features = false } ## Semantics in 30 seconds -RegexSolver implements **pure regular languages**, which differs from typical regex engines in two ways that surprise people: +RegexSolver implements **pure regular languages**, which differs from typical regex engines in two ways: -- **Everything is anchored**: `abc` matches the string "abc" — not "xabc" or "abcx". Patterns describe *whole strings*. +- **Everything is anchored**: `abc` matches the string "abc", not "xabc" or "abcx". Patterns describe *whole strings*. - **`.` matches any character**, including line feed (`\n`). The rest follows from regular-language theory: - **Backreferences** (`\1`, `\2`, ...) go beyond regular languages and return an error, as do **lookahead/lookbehind** assertions (`(?=...)`, `(?<=...)`). -- **All quantifiers are greedy**: ungreedy markers (`*?`, `+?`, `??`) are ignored — as *sets of strings*, `a*` and `a*?` are the same language. +- **All quantifiers are greedy**: ungreedy markers (`*?`, `+?`, `??`) are ignored: as *sets of strings*, `a*` and `a*?` are the same language. - **The empty language** (matches no string at all) is written `[]` (empty character class). This is distinct from the empty string `""`.
RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. @@ -102,7 +88,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `generate_strings(limit, offset)` | Enumerate matching strings (call `minimize()` once first when paginating). | | `to_pattern()` / `to_automaton()` / `to_regex()` | Convert back out. | -All fallible operations return `Result<_, EngineError>` — nothing panics on adversarial input. +All fallible operations return `Result<_, EngineError>`. ### Building automata by hand @@ -129,9 +115,9 @@ assert!(automaton.is_match("b42")); assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*"); ``` -Internally, transition labels are bitvector `Condition`s over the automaton's `SpanningSet` of disjoint character ranges — that is what makes label union/intersection/complement O(1) ([article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation)). `add_transition_from_range` maintains that representation for you; for full manual control over conditions and spanning sets, see the [`add_transition` documentation](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html#method.add_transition). +Internally, transition labels are bitvector `Condition`s over the automaton's `SpanningSet` of disjoint character ranges; that is what makes label union/intersection/complement O(1) ([article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation)).
`add_transition_from_range` maintains that representation for you; for full manual control over conditions and spanning sets, see the [`add_transition` documentation](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html#method.add_transition). -Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html) — `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex` — plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `as_dot`, ...). +Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html), including `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex`, plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `as_dot`, ...). ### Working with patterns as ASTs @@ -211,7 +197,7 @@ execution_profile.run(|| { ### Disabling Implicit Determinization -`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `get_cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working — that layer manages the underlying representation itself, so its determinizations count as explicit. 
+`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `get_cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working since that layer manages the underlying representation itself, so its determinizations count as explicit. ```rust use regexsolver::execution_profile::ExecutionProfileBuilder; @@ -231,24 +217,6 @@ execution_profile.run(|| { }); ``` -### Minimizing After Determinization - -Every determinization can be followed automatically by a minimization of the result (off by default: it costs an extra Hopcroft pass, but keeps downstream operations working on the smallest possible automata). Inputs that are already deterministic are returned untouched. - -```rust -use regexsolver::execution_profile::ExecutionProfileBuilder; - -let execution_profile = ExecutionProfileBuilder::new() - .minimize_after_determinization(true) // default is false - .build(); - -// `nfa` is any non-deterministic FastAutomaton -execution_profile.run(|| { - let dfa = nfa.determinize().unwrap(); - assert!(dfa.is_minimal()); -}); -``` - ## How it works - Patterns are parsed with [regex-syntax](https://docs.rs/regex-syntax/latest/regex_syntax/) and simplified into a small regular-expression AST; set operations run on finite automata; results convert back to patterns via state elimination. diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 2850341..df2bd01 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -100,11 +100,6 @@ pub struct ExecutionProfile { /// an explicit `determinize()` call. 
[`Term`](crate::Term) methods /// always work: that layer manages the representation itself. implicit_determinization: bool, - /// Whether every determinization is followed by a minimization of the - /// resulting automaton. Off by default: minimization costs an extra - /// Hopcroft pass, but keeps downstream operations working on the - /// smallest possible automata. - minimize_after_determinization: bool, } impl PartialEq for ExecutionProfile { @@ -112,7 +107,6 @@ impl PartialEq for ExecutionProfile { self.max_number_of_states == other.max_number_of_states && self.execution_timeout == other.execution_timeout && self.implicit_determinization == other.implicit_determinization - && self.minimize_after_determinization == other.minimize_after_determinization } } @@ -184,17 +178,6 @@ impl ExecutionProfile { self } - pub fn with_minimize_after_determinization(mut self, enabled: bool) -> Self { - self.minimize_after_determinization = enabled; - self - } - - /// Whether every determinization should be followed by a minimization of - /// the result. - pub(crate) fn should_minimize_after_determinization(&self) -> bool { - self.minimize_after_determinization - } - pub fn set(&self) -> &Self { self } @@ -240,9 +223,6 @@ pub struct ExecutionProfileBuilder { /// Whether operations requiring a deterministic automaton may determinize /// a non-deterministic input on their own. Defaults to `true`. implicit_determinization: bool, - /// Whether every determinization is followed by a minimization of the - /// result. Defaults to `false`. - minimize_after_determinization: bool, } impl Default for ExecutionProfileBuilder { fn default() -> Self { @@ -256,7 +236,6 @@ impl ExecutionProfileBuilder { max_number_of_states: None, execution_timeout: None, implicit_determinization: true, - minimize_after_determinization: false, } } @@ -282,23 +261,12 @@ impl ExecutionProfileBuilder { self } - /// Whether every determinization is followed by a minimization of the - /// resulting automaton. 
Off by default: minimization costs an extra - /// Hopcroft pass, but keeps downstream operations working on the - /// smallest possible automata. Inputs that are already deterministic are - /// not touched. - pub fn minimize_after_determinization(mut self, enabled: bool) -> Self { - self.minimize_after_determinization = enabled; - self - } - pub fn build(self) -> ExecutionProfile { ExecutionProfile { max_number_of_states: self.max_number_of_states, execution_timeout: self.execution_timeout, execution_deadline: None, implicit_determinization: self.implicit_determinization, - minimize_after_determinization: self.minimize_after_determinization, } } } @@ -310,7 +278,6 @@ impl ThreadLocalParams { static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; static IMPLICIT_DETERMINIZATION: RefCell = const { RefCell::new(true) }; - static MINIMIZE_AFTER_DETERMINIZATION: RefCell = const { RefCell::new(false) }; } /// Store on the current thread [`ExecutionProfile`]. @@ -330,10 +297,6 @@ impl ThreadLocalParams { ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| { *cell.borrow_mut() = profile.implicit_determinization; }); - - ThreadLocalParams::MINIMIZE_AFTER_DETERMINIZATION.with(|cell| { - *cell.borrow_mut() = profile.minimize_after_determinization; - }); } fn get_max_number_of_states() -> Option { @@ -352,10 +315,6 @@ impl ThreadLocalParams { ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| *cell.borrow()) } - fn get_minimize_after_determinization() -> bool { - ThreadLocalParams::MINIMIZE_AFTER_DETERMINIZATION.with(|cell| *cell.borrow()) - } - /// Return the [`ExecutionProfile`] stored on the current thread. 
fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { @@ -363,7 +322,6 @@ impl ThreadLocalParams { execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), implicit_determinization: Self::get_implicit_determinization(), - minimize_after_determinization: Self::get_minimize_after_determinization(), } } } @@ -471,57 +429,6 @@ mod tests { }); } - #[test] - fn test_minimize_after_determinization() { - use crate::CharRange; - use crate::fast_automaton::FastAutomaton; - use crate::fast_automaton::condition::Condition; - use crate::fast_automaton::spanning_set::SpanningSet; - use regex_charclass::char::Char; - - // NFA over base 'a' + rest whose subset construction yields two - // distinct but language-equivalent accept states ({f1, f3} on 'a', - // {f2} on [^a]) — 3 determinized states, 2 after minimization. - let range_a = CharRange::new_from_range(Char::new('a')..=Char::new('a')); - let ss = SpanningSet::compute_spanning_set(std::slice::from_ref(&range_a)); - let mut nfa = FastAutomaton::new_empty(); - nfa.apply_new_spanning_set(&ss).unwrap(); - let f1 = nfa.new_state(); - let f2 = nfa.new_state(); - let f3 = nfa.new_state(); - let cond_a = Condition::from_range(&range_a, &ss).unwrap(); - let cond_rest = cond_a.complement(); - nfa.add_transition(0, f1, &cond_a); - nfa.add_transition(0, f3, &cond_a); // overlaps with f1: non-deterministic - nfa.add_transition(0, f2, &cond_rest); - nfa.accept(f1); - nfa.accept(f2); - nfa.accept(f3); - assert!(!nfa.is_deterministic()); - - // Default: determinize alone does not minimize. 
- let plain = nfa.determinize().unwrap().into_owned(); - assert!(plain.is_deterministic()); - assert!(!plain.is_minimal()); - assert_eq!(plain.get_number_of_states(), 3); - - ExecutionProfileBuilder::new() - .minimize_after_determinization(true) - .build() - .run(|| { - let minimized = nfa.determinize().unwrap().into_owned(); - assert!(minimized.is_deterministic()); - assert!(minimized.is_minimal()); - assert_eq!(minimized.get_number_of_states(), 2); - assert!(minimized.equivalent(&plain).unwrap()); - - // Already-deterministic inputs are returned untouched: the - // flag only applies when a determinization actually happens. - let same = plain.determinize().unwrap(); - assert!(!same.is_minimal()); - }); - } - /// The `implicit_determinization` knob targets direct `FastAutomaton` /// usage; `Term` manages the underlying representation itself, so its /// whole public API must keep working when the knob is off. @@ -568,30 +475,6 @@ mod tests { }); } - /// The two determinization knobs compose: implicit determinization - /// stays gated, while an explicit `determinize()` both works and - /// minimizes its result. 
- #[test] - fn test_minimize_after_determinization_with_implicit_disabled() { - let nfa = nondeterministic_automaton(); - - ExecutionProfileBuilder::new() - .implicit_determinization(false) - .minimize_after_determinization(true) - .build() - .run(|| { - assert_eq!( - nfa.clone().minimize().unwrap_err(), - EngineError::DeterministicAutomatonRequired - ); - - let dfa = nfa.determinize().unwrap(); - assert!(dfa.is_deterministic()); - assert!(dfa.is_minimal()); - assert!(dfa.equivalent(&nfa.determinize().unwrap()).unwrap()); - }); - } - #[test] fn test_implicit_determinization_default() { let nfa = nondeterministic_automaton(); diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index a241ed4..1099d45 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -85,11 +85,6 @@ impl FastAutomaton { } } - // Optionally fold the freshly built DFA down to its minimal form - if execution_profile.should_minimize_after_determinization() { - new_automaton.minimize()?; - } - Ok(Cow::Owned(new_automaton)) } } From 518ec3b42807160b61ca43bef1aea2b3949ea245 Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Tue, 9 Jun 2026 22:30:08 +0200 Subject: [PATCH 61/62] Some lib updates --- README.md | 16 +- examples/relate.rs | 2 +- src/cardinality/mod.rs | 2 +- src/error/mod.rs | 6 +- src/execution_profile.rs | 44 +- src/fast_automaton/analyze/cardinality.rs | 2 +- src/fast_automaton/analyze/equivalence.rs | 2 +- src/fast_automaton/analyze/length.rs | 4 +- src/fast_automaton/analyze/mod.rs | 6 +- src/fast_automaton/analyze/subset.rs | 2 +- src/fast_automaton/builder.rs | 7 +- src/fast_automaton/condition/converter.rs | 2 +- .../condition/fast_bit_vec/mod.rs | 2 +- src/fast_automaton/condition/mod.rs | 38 +- src/fast_automaton/mod.rs | 4 +- src/fast_automaton/operation/difference.rs | 4 +- src/fast_automaton/operation/intersection.rs | 2 
+- src/fast_automaton/operation/minimize.rs | 4 +- src/fast_automaton/operation/mod.rs | 2 +- src/fast_automaton/operation/repeat.rs | 10 +- src/fast_automaton/operation/union.rs | 2 +- src/fast_automaton/spanning_set/mod.rs | 18 +- src/lib.rs | 633 +++++++++++++++--- src/regex/analyze/number_of_states.rs | 6 +- src/regex/builder.rs | 6 +- src/regex/mod.rs | 14 +- src/regex/operation/concat.rs | 4 +- src/regex/operation/union.rs | 4 +- tests/readme_examples.rs | 5 +- tests/state_elimination_quality.rs | 73 ++ 30 files changed, 778 insertions(+), 148 deletions(-) create mode 100644 tests/state_elimination_quality.rs diff --git a/README.md b/README.md index b2c4ae3..3a831db 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,16 @@ The `regex` crate tells you whether a *string* matches a pattern. **RegexSolver ```rust use regexsolver::Term; -let a = Term::from_pattern("(ab|xy){2}")?; -let b = Term::from_pattern(".*xy")?; +let a: Term = "(ab|xy){2}".parse()?; +let b: Term = ".*xy".parse()?; // Which strings match BOTH patterns? Get the answer as a regex: -let both = a.intersection(&[b])?; +let both = a.intersection([&b])?; assert_eq!(both.to_pattern(), "(ab|xy)xy"); +// Test a concrete string against the result (matching is anchored): +assert!(both.matches("abxy")?); + // ...and sample them: assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); ``` @@ -27,7 +30,7 @@ assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); - **Test-data generation** - `term.generate_strings(100, 0)?`: produce strings matching any pattern, with pagination. - **Rule analysis**: find shadowed or overlapping routes, firewall rules, and validators with `intersection` / `difference`. - **Equivalence proofs** - `a.equivalent(&b)?`: show that two differently-written patterns match exactly the same strings. -- **Pattern simplification**: every operation returns a `Term` you can turn back into a clean pattern with `to_pattern()`. 
+- **Pattern simplification**: every operation returns a `Term` you can turn back into a regex pattern with `to_pattern()`. Under the hood, every pattern compiles to a finite automaton: @@ -82,10 +85,11 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `Term::from_pattern(pattern)` | Parses a pattern into a term. | | `intersection(&self, terms)` / `union(&self, terms)` | Set operations over any number of terms. | | `difference(&self, other)` / `complement(&self)` | What `self` matches and `other` doesn't / everything `self` doesn't match. | -| `concat(&self, terms)` / `repeat(&self, min, max)` | Sequence and repeat languages. | +| `concat(&self, terms)` / `repeat(&self, range)` | Sequence and repeat languages; `range` is any Rust range expression (`2..=5`, `1..`, `..3`, ...). | | `equivalent(&self, other)` / `subset(&self, other)` | Compare languages. | | `is_empty()` / `is_total()` / `get_length()` / `get_cardinality()` | Analyze a language: matches nothing? everything? string lengths? how many strings? | -| `generate_strings(limit, offset)` | Enumerate matching strings (call `minimize()` once first when paginating). | +| `generate_strings(limit, offset)` | Enumerate matching strings eagerly (call `minimize()` once first when paginating). | +| `iter_strings()` | Lazy iterator equivalent; computes the automaton once and yields strings in batches. | | `to_pattern()` / `to_automaton()` / `to_regex()` | Convert back out. | All fallible operations return `Result<_, EngineError>`. diff --git a/examples/relate.rs b/examples/relate.rs index 25096de..ff056aa 100644 --- a/examples/relate.rs +++ b/examples/relate.rs @@ -29,7 +29,7 @@ fn main() -> Result<(), Box> { println!("b subset of a: {}", b_term.subset(&a_term)?); println!(); - let intersection = a_term.intersection(std::slice::from_ref(&b_term))?; + let intersection = a_term.intersection([&b_term])?; if intersection.is_empty()? 
{ println!("a ∩ b = [] (no string matches both)"); } else { diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 4456820..d2e054c 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,4 +1,4 @@ -/// Represent a number. +/// Represents a cardinality: either a specific integer, a number too large to represent exactly, or infinite. #[derive(PartialEq, Eq, Debug, Clone)] pub enum Cardinality { /// An infinite number. diff --git a/src/error/mod.rs b/src/error/mod.rs index d543728..df44b9d 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -10,9 +10,9 @@ pub enum EngineError { OperationTimeOutError, /// The automaton has too many states. AutomatonHasTooManyStates, - /// The regular expression can not be parsed. + /// The regular expression cannot be parsed. RegexSyntaxError(String), - /// The provided range can not be built from the spanning set. + /// The provided range cannot be built from the spanning set. ConditionInvalidRange, /// The repetition bounds are invalid: the maximum is below the minimum. InvalidRepetitionBounds(u32, u32), @@ -34,7 +34,7 @@ impl fmt::Display for EngineError { EngineError::RegexSyntaxError(err) => write!(f, "{err}."), EngineError::ConditionInvalidRange => write!( f, - "The provided range can not be built from the spanning set." + "The provided range cannot be built from the spanning set." ), EngineError::InvalidRepetitionBounds(min, max) => write!( f, diff --git a/src/execution_profile.rs b/src/execution_profile.rs index df2bd01..5b41c63 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -5,9 +5,9 @@ use std::{ use crate::error::EngineError; -/// Hold settings about limitations and constraints of operations execution within the engine. +/// Holds settings that constrain how operations execute within the engine. 
/// -/// # Examples: +/// # Examples /// /// ## Limiting the number of states /// ``` @@ -111,7 +111,7 @@ impl PartialEq for ExecutionProfile { } impl ExecutionProfile { - /// Retrieve the current thread-local execution profile. + /// Retrieves the current thread-local execution profile. pub fn get() -> ExecutionProfile { ThreadLocalParams::get_execution_profile() } @@ -163,26 +163,33 @@ impl ExecutionProfile { } } + /// Returns a copy of this profile with the execution timeout set to + /// `execution_timeout_in_ms` milliseconds. Use these `with_*` methods to + /// derive a variant of an existing profile (e.g. one from + /// [`get`](Self::get)); to build one from scratch, prefer + /// [`ExecutionProfileBuilder`]. See + /// [`ExecutionProfileBuilder::execution_timeout`]. pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } + /// Returns a copy of this profile with the maximum number of states set to + /// `max_number_of_states`. See + /// [`ExecutionProfileBuilder::max_number_of_states`]. pub fn with_max_number_of_states(mut self, max_number_of_states: usize) -> Self { self.max_number_of_states = Some(max_number_of_states); self } + /// Returns a copy of this profile with implicit determinization enabled or + /// disabled. See [`ExecutionProfileBuilder::implicit_determinization`]. pub fn with_implicit_determinization(mut self, allowed: bool) -> Self { self.implicit_determinization = allowed; self } - pub fn set(&self) -> &Self { - self - } - - /// Run the given closure with this profile at thread level, setting its start time to now. + /// Runs the given closure with this profile installed for the current thread, setting its start time to now. pub fn run(&self, f: F) -> R where F: FnOnce() -> R, @@ -201,7 +208,7 @@ impl ExecutionProfile { result } - /// Like [`ExecutionProfile::run`], but does *not* reset its start time. 
Useful if you want to pass a profile state to a new thread. + /// Runs the closure like [`run`](Self::run), but does not reset the start time. Use this to propagate an already-started profile to worker threads without restarting the clock. pub fn apply(&self, f: F) -> R where F: FnOnce() -> R, @@ -231,6 +238,9 @@ impl Default for ExecutionProfileBuilder { } impl ExecutionProfileBuilder { + /// Creates a builder with no limits set and implicit determinization + /// enabled (i.e. the defaults, equivalent to the ambient profile when none + /// has been installed). pub fn new() -> Self { Self { max_number_of_states: None, @@ -239,11 +249,19 @@ impl ExecutionProfileBuilder { } } + /// Sets the longest time, in milliseconds, that an operation may run before + /// it aborts with [`EngineError::OperationTimeOutError`]. Enforcement is + /// best-effort (checked between internal steps), so the exact deadline is + /// not guaranteed. Unset by default (no timeout). pub fn execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { self.execution_timeout = Some(execution_timeout_in_ms); self } + /// Caps the number of states an automaton may reach; operations that would + /// exceed it abort with [`EngineError::AutomatonHasTooManyStates`]. This + /// bounds the exponential blow-up of conversions such as determinization. + /// Unset by default (no cap). pub fn max_number_of_states(mut self, max_number_of_states: usize) -> Self { self.max_number_of_states = Some(max_number_of_states); self @@ -254,13 +272,15 @@ impl ExecutionProfileBuilder { /// non-deterministic input on their own (the default). When set to /// `false`, those operations return /// [`EngineError::DeterministicAutomatonRequired`] instead; explicit - /// `determinize()` calls — and [`Term`](crate::Term) methods, which - /// manage the representation themselves — are always allowed. 
+ /// `determinize()` calls and [`Term`](crate::Term) methods (which + /// manage the representation themselves) are always allowed. pub fn implicit_determinization(mut self, allowed: bool) -> Self { self.implicit_determinization = allowed; self } + /// Builds the [`ExecutionProfile`]. Install it around a unit of work with + /// [`ExecutionProfile::run`]. pub fn build(self) -> ExecutionProfile { ExecutionProfile { max_number_of_states: self.max_number_of_states, @@ -458,7 +478,7 @@ mod tests { assert!(term.concat(std::slice::from_ref(&other)).is_ok()); assert!(term.union(std::slice::from_ref(&other)).is_ok()); assert!(term.intersection(std::slice::from_ref(&other)).is_ok()); - assert!(term.repeat(0, Some(2)).is_ok()); + assert!(term.repeat(0..=2).is_ok()); assert!(term.is_empty().is_ok()); assert!(term.is_empty_string().is_ok()); let _ = term.get_length(); diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 20dd6c4..5ccf1c5 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -7,7 +7,7 @@ impl FastAutomaton { /// /// Works on non-deterministic automata too: acyclic NFAs are determinized /// internally (the only fallible step, subject to the - /// [`crate::execution_profile::ExecutionProfile`] budget — and rejected + /// [`crate::execution_profile::ExecutionProfile`] budget, and rejected /// with [`EngineError::DeterministicAutomatonRequired`] when the profile /// disables implicit determinization). /// diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 857131b..3d432ca 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -5,7 +5,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if both automata accept the same language. 
/// - /// Non-deterministic operands are determinized internally — unless the + /// Non-deterministic operands are determinized internally, unless the /// execution profile disables implicit determinization, in which case /// [`EngineError::DeterministicAutomatonRequired`] is returned. pub fn equivalent(&self, other: &FastAutomaton) -> Result { diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 00997ac..6023de6 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -21,7 +21,7 @@ impl FastAutomaton { return (None, None); } - // BFS from the start over live states only — every state on an + // BFS from the start over live states only; every state on an // accepting path is live, so this loses no accepting path. BFS visits // in non-decreasing depth, hence the first accept hit is the minimum. // The visited set (reachable ∩ live) is exactly the subgraph relevant @@ -158,7 +158,7 @@ mod tests { } // Regression: `get_length` used to enumerate paths with a cloned `seen` - // set per branch — exponential time and memory on branching DAGs. A chain + // set per branch (exponential time and memory on branching DAGs). A chain // of diamonds has 2^k paths; the linear algorithm must handle it // instantly. #[test] diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 57b8783..d70c724 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -54,7 +54,7 @@ impl FastAutomaton { /// /// Sound and complete for **deterministic** automata: a DFA's language /// equals Σ\* iff every reachable state is accepting AND its outgoing - /// conditions union to Σ. For NFAs this is sound but conservative — + /// conditions union to Σ. For NFAs this is sound but conservative: /// alternative paths may cover a character that no single reachable /// state covers, so callers that need an exact answer on an NFA should /// determinize first. 
@@ -155,7 +155,7 @@ impl FastAutomaton { /// accept state** by following non-empty transitions. Computed by a reverse /// traversal from the accept states. /// - /// This is co-reachability — note it is *not* the set of states reachable + /// This is co-reachability; note it is *not* the set of states reachable /// from the start state. pub fn get_live_states(&self) -> IntSet { let mut states_map: IntMap> = @@ -193,7 +193,7 @@ impl FastAutomaton { live } - /// Returns one [`Condition`] per base of the spanning set — including the + /// Returns one [`Condition`] per base of the spanning set, including the /// "rest" range when it is non-empty. /// /// The bases must partition the whole alphabet Σ: subset construction diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index e3eef3a..e64baa4 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -5,7 +5,7 @@ use super::*; impl FastAutomaton { /// Returns `true` if all strings accepted by `self` are also accepted by `other`. /// - /// A non-deterministic `other` is determinized internally — unless the + /// A non-deterministic `other` is determinized internally, unless the /// execution profile disables implicit determinization, in which case /// [`EngineError::DeterministicAutomatonRequired`] is returned. pub fn subset(&self, other: &FastAutomaton) -> Result { diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index 488bef1..d603e98 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -178,7 +178,7 @@ impl FastAutomaton { /// /// An empty range matches no character, so no transition is added. /// - /// # Example + /// # Examples /// /// ``` /// use regexsolver::CharRange; @@ -206,8 +206,8 @@ impl FastAutomaton { } // Fast path: the range is exactly expressible in the current - // spanning set. 
`Condition::from_range` alone cannot tell us that — - // it silently drops partially-covered bases — so round-trip the + // spanning set. `Condition::from_range` alone cannot tell us that + // (it silently drops partially-covered bases), so round-trip the // condition to check exactness. if let Ok(condition) = Condition::from_range(range, &self.spanning_set) && condition.to_range(&self.spanning_set)? == *range @@ -263,7 +263,6 @@ impl FastAutomaton { Ok(()) } - /// Creates a new epsilon transition between the two states. /// Adds an epsilon transition by eagerly folding `to_state`'s **current** /// transitions (and acceptance) into `from_state`. /// diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 43beb02..dd99b7c 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -77,7 +77,7 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { // The one invariant every legitimate use (refining and coarsening // alike) must uphold: the projection denotes the same character set. // A violation means a condition referenced a base the target spanning - // set cannot express — a silent language corruption in release. + // set cannot express, causing silent language corruption in release. debug_assert_eq!( condition .to_range(self.from_spanning_set) diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index 8caf9b6..78116d4 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -82,7 +82,7 @@ impl FastBitVec { /// The binary operations combine blocks pairwise with `zip`, which would /// silently truncate to the shorter operand if two bitvectors built over - /// different spanning sets were ever combined — producing a wrong + /// different spanning sets were ever combined, producing a wrong /// language instead of a loud failure. 
Catch that in debug builds (and /// therefore in every test run). #[inline] diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 644a05d..788ae88 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -9,7 +9,7 @@ use super::spanning_set::SpanningSet; pub mod converter; mod fast_bit_vec; -/// Contains the condition of a transition in a [`crate::FastAutomaton`] +/// Represents the condition of a transition in a [`crate::FastAutomaton`]. #[derive(Clone, PartialEq, Eq, Debug)] pub struct Condition(FastBitVec); @@ -26,6 +26,8 @@ impl Hash for Condition { } impl Condition { + /// Returns the condition that matches no character, sized for + /// `spanning_set` (every bit cleared). #[inline] pub fn empty(spanning_set: &SpanningSet) -> Self { Self(FastBitVec::from_elem( @@ -34,6 +36,8 @@ impl Condition { )) } + /// Returns the condition that matches every character, sized for + /// `spanning_set` (every bit set). #[inline] pub fn total(spanning_set: &SpanningSet) -> Self { Self(FastBitVec::from_elem( @@ -42,6 +46,13 @@ impl Condition { )) } + /// Converts a [`CharRange`] to a `Condition` sized for `spanning_set`. + /// + /// Returns [`EngineError::ConditionInvalidRange`] if the range is not + /// expressible in the current spanning set (no base is fully contained in + /// `range`). In that case, extend the spanning set first with + /// [`SpanningSet::merge`] or [`SpanningSet::compute_spanning_set`], apply + /// it with [`FastAutomaton::apply_new_spanning_set`], then retry. pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); @@ -68,6 +79,12 @@ impl Condition { Ok(cond) } + /// Converts this `Condition` back to the [`CharRange`] it represents, + /// evaluated against `spanning_set`. 
+ /// + /// Returns [`EngineError::IncompatibleSpanningSet`] if this condition's + /// bit width does not match `spanning_set` (they were built from different + /// spanning sets). pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { // A condition only carries meaning relative to the spanning set it // was built from. Evaluating it against a differently-sized one used @@ -91,6 +108,8 @@ impl Condition { Ok(range) } + /// Returns the condition matching characters in `self` or `cond` (bitwise + /// OR). Both must share the same spanning set. #[inline] pub fn union(&self, cond: &Condition) -> Self { let mut new_cond = self.clone(); @@ -98,6 +117,8 @@ impl Condition { new_cond } + /// Returns the condition matching characters in both `self` and `cond` + /// (bitwise AND). Both must share the same spanning set. #[inline] pub fn intersection(&self, cond: &Condition) -> Self { let mut new_cond = self.clone(); @@ -105,6 +126,8 @@ impl Condition { new_cond } + /// Returns the condition matching exactly the characters `self` does not, + /// relative to its spanning set. #[inline] pub fn complement(&self) -> Self { let mut new_cond = self.clone(); @@ -112,6 +135,8 @@ impl Condition { new_cond } + /// Returns the condition matching characters in `self` but not in `cond` + /// (bitwise AND-NOT). Both must share the same spanning set. #[inline] pub fn difference(&self, cond: &Condition) -> Self { let mut new_cond = self.clone(); @@ -120,11 +145,16 @@ impl Condition { new_cond } + /// Returns `true` if `self` and `cond` share at least one character (their + /// intersection is non-empty). Both must share the same spanning set. #[inline] pub fn has_intersection(&self, cond: &Condition) -> bool { self.0.has_intersection(&cond.0) } + /// Returns `true` if the condition matches `character` (a Unicode scalar + /// value), evaluated against `spanning_set`. Values that are not valid + /// scalar values never match. 
#[inline] pub fn has_character( &self, @@ -138,21 +168,27 @@ impl Condition { } } + /// Returns `true` if the condition matches no character. #[inline] pub fn is_empty(&self) -> bool { self.0.empty() } + /// Returns `true` if the condition matches every character. #[inline] pub fn is_total(&self) -> bool { self.0.total() } + /// Returns the number of characters the condition matches, evaluated + /// against `spanning_set`. #[inline] pub fn get_cardinality(&self, spanning_set: &SpanningSet) -> Result { Ok(self.to_range(spanning_set)?.get_cardinality()) } + /// Returns the condition as a vector of bits, one per range of the spanning + /// set it was built against (the rest range first, when present). #[inline] pub fn get_binary_representation(&self) -> Vec { self.0.get_bits() diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 79a1e36..8fa7c9f 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -11,7 +11,7 @@ use super::*; pub(crate) type Transitions = IntMap; -/// The identifier of state in an [`FastAutomaton`] +/// The identifier of a state in a [`FastAutomaton`]. pub type State = usize; mod analyze; @@ -22,7 +22,7 @@ mod generate; mod operation; pub mod spanning_set; -/// Represent a finite state automaton. +/// Represents a finite-state automaton. #[derive(Clone, Debug, PartialEq, Eq)] #[must_use = "non-`_mut` operations return a new automaton"] pub struct FastAutomaton { diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index 740a68e..ee865f2 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -47,7 +47,7 @@ impl FastAutomaton { /// Complements the automaton. 
/// - /// If `self` is non-deterministic, it is determinized in place first — + /// If `self` is non-deterministic, it is determinized in place first, /// unless the execution profile disables implicit determinization, in /// which case [`EngineError::DeterministicAutomatonRequired`] is /// returned. @@ -71,7 +71,7 @@ impl FastAutomaton { /// Computes the difference between `self` and `other`. /// - /// If `other` is non-deterministic, it is determinized first — unless + /// If `other` is non-deterministic, it is determinized first, unless /// the execution profile disables implicit determinization, in which /// case [`EngineError::DeterministicAutomatonRequired`] is returned. pub fn difference(&self, other: &FastAutomaton) -> Result { diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 8bf8bb3..39ec050 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -219,7 +219,7 @@ mod tests { use crate::regex::RegularExpression; // Regression: `has_intersection` enforced the timeout but not the state - // budget, unlike `intersection` — the product pair map could grow + // budget, unlike `intersection`: the product pair map could grow // unchecked. #[test] fn has_intersection_respects_state_budget() { diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs index 9385930..75f54bf 100644 --- a/src/fast_automaton/operation/minimize.rs +++ b/src/fast_automaton/operation/minimize.rs @@ -5,7 +5,7 @@ use super::*; impl FastAutomaton { /// Minimizes the automaton using Hopcroft's Algorithm. /// - /// If `self` is non-deterministic, it is determinized in place first — + /// If `self` is non-deterministic, it is determinized in place first, /// unless the [`ExecutionProfile`] disables implicit determinization, in /// which case [`EngineError::DeterministicAutomatonRequired`] is /// returned. 
@@ -23,7 +23,7 @@ impl FastAutomaton { let execution_profile = ExecutionProfile::get(); // Drop states unreachable from the start. A minimal automaton has none, - // and downstream invariants rely on it — in particular `is_empty`'s + // and downstream invariants rely on it; in particular `is_empty`'s // fast path treats any minimal automaton with an accept state as // non-empty, which only holds if every accept state is reachable. let reachable = self.forward_reachable_states(); diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index cf3274d..1b59ce0 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -11,7 +11,7 @@ mod repeat; mod union; impl FastAutomaton { - /// Removes "dead" states — those that cannot reach any accept state — since + /// Removes "dead" states (those that cannot reach any accept state), since /// they never contribute to the language. If the language is empty the whole /// automaton collapses to the canonical empty automaton. pub fn remove_dead_states(&mut self) { diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index fdf8afb..9f65f37 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -48,7 +48,7 @@ impl FastAutomaton { // stale state ids in the accept frontier). if self.is_empty() { if min == 0 { - // ∅⁰ is exactly {""} — replace the whole automaton instead + // ∅⁰ is exactly {""}: replace the whole automaton instead // of marking the start accepting: a dead automaton can still // have reachable transitions (e.g. 
a self-loop on a // non-accepting start), and an accepting start would wrongly @@ -279,7 +279,7 @@ impl FastAutomaton { #[cfg(test)] mod tests { // Regression: the r{0,1} fast path used to insert into `accept_states` - // directly, leaving a stale `minimal = true` on a mutated automaton — + // directly, leaving a stale `minimal = true` on a mutated automaton; // `minimize()` (which trusts the flag) then silently refused to // minimize it. // Regression (found by the repeat decomposition-oracle proptest): the @@ -302,7 +302,7 @@ mod tests { assert!(a.repeat(2, None).unwrap().is_empty()); // ∅{2,} = ∅ // A dead automaton with REACHABLE transitions: ∅* must still be - // exactly {""} — marking the start accepting used to revive the + // exactly {""}; marking the start accepting used to revive the // dead self-loop into b*. let range_b = crate::CharRange::new_from_range( regex_charclass::char::Char::new('b')..=regex_charclass::char::Char::new('b'), @@ -319,7 +319,7 @@ mod tests { // state-count heuristic underflowed on empty-language automata with // more than one state, because the concat heuristic short-circuits ∅ - // to 1 — panicking in the public `repeat` before the empty-language + // to 1, panicking in the public `repeat` before the empty-language // early-return could run. #[test] fn repeat_of_multi_state_empty_language_does_not_underflow() { @@ -357,7 +357,7 @@ mod tests { use crate::regex::RegularExpression; // Regression: `repeat(0, Some(0))` on a non-empty language used to return - // L ∪ {""} instead of just {""} — the general path left the original + // L ∪ {""} instead of just {""}; the general path left the original // language reachable and only made the start accepting. r⁰ must be {""}. 
#[test] fn bug_repeat_zero_zero_on_non_empty() { diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 8792c59..1050aa8 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -66,7 +66,7 @@ impl FastAutomaton { ) -> Result, EngineError> { let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); // If `other` accepts the empty string we must make the union's *entry* - // state accepting — but only after the start state is finalized below. + // state accepting, but only after the start state is finalized below. // Marking the current start eagerly is wrong when it has incoming edges // (e.g. a self-loop) and is about to be demoted behind a fresh start: // the demoted state would then wrongly accept the strings on its loop. diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index 54c3e5c..dc4ed61 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -4,27 +4,39 @@ use ahash::AHashSet; use crate::CharRange; -/// Contains a set of [`CharRange`] that span all the transition of a [`crate::FastAutomaton`]. +/// A set of [`CharRange`] that spans all transitions of a [`crate::FastAutomaton`]. #[derive(Clone, Debug, PartialEq, Eq)] pub struct SpanningSet(Vec, CharRange); impl SpanningSet { + /// Creates a spanning set from explicit disjoint `ranges` plus the `rest` + /// range covering every character they don't. The caller is responsible for + /// these invariants; prefer [`compute_spanning_set`](Self::compute_spanning_set), + /// which derives a minimal, well-formed set from arbitrary ranges. pub fn new(ranges: Vec, rest: CharRange) -> Self { SpanningSet(ranges, rest) } + /// Creates the spanning set of an automaton with no transitions: no + /// explicit ranges, with the rest covering all characters. 
pub fn new_empty() -> Self { SpanningSet(vec![], CharRange::total()) } + /// Creates the spanning set with a single range covering all characters and + /// an empty rest. pub fn new_total() -> Self { SpanningSet(vec![CharRange::total()], CharRange::empty()) } + /// Returns `true` if this is the empty spanning set (no explicit ranges; + /// see [`new_empty`](Self::new_empty)). pub fn is_empty(&self) -> bool { self.0.is_empty() && self.1.is_total() } + /// Returns `true` if this is the total spanning set (one all-covering + /// range; see [`new_total`](Self::new_total)). pub fn is_total(&self) -> bool { self.0.len() == 1 && self.0[0].is_total() && self.1.is_empty() } @@ -47,18 +59,22 @@ impl SpanningSet { } } + /// Returns an iterator over the explicit (non-rest) ranges in the spanning set. pub fn get_spanning_ranges(&self) -> Iter<'_, CharRange> { self.0.iter() } + /// Returns the number of explicit (non-rest) ranges in the spanning set. pub fn get_number_of_spanning_ranges(&self) -> usize { self.0.len() } + /// Returns the explicit range at index `i`, or `None` if out of bounds. pub fn get_spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } + /// Returns the "rest" range covering all characters not in any explicit range. pub fn get_rest(&self) -> &CharRange { &self.1 } diff --git a/src/lib.rs b/src/lib.rs index 3813b12..a406dae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,10 @@ use std::{ - borrow::Cow, - collections::{HashMap, HashSet}, + borrow::{Borrow, Cow}, + collections::{HashMap, HashSet, VecDeque}, fmt::Display, hash::BuildHasherDefault, + ops::{Bound, RangeBounds}, + str::FromStr, }; use cardinality::Cardinality; @@ -28,7 +30,7 @@ pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. 
/// -/// # Example +/// # Examples /// ```rust /// use regexsolver::Term; /// use regexsolver::error::EngineError; @@ -58,7 +60,7 @@ pub type CharRange = RangeSet; /// /// // Repetition /// let rep = Term::from_pattern("abc")? -/// .repeat(2, Some(4))?; +/// .repeat(2..=4)?; /// assert_eq!(rep.to_pattern(), "(abc){2,4}"); /// /// // Analyze @@ -82,6 +84,15 @@ pub type CharRange = RangeSet; /// ``` /// /// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. +/// +/// # Equality +/// +/// `PartialEq`/`Eq` (`==`) compare the **underlying representation**, not the +/// language. Two terms that match exactly the same strings can compare +/// unequal (for example, an automaton and an equivalent regular expression, or +/// two differently-written regexes for the same language). To compare +/// *languages*, use [`equivalent`](Self::equivalent); for `self ⊆ other`, use +/// [`subset`](Self::subset). #[derive(Clone, PartialEq, Eq, Debug)] #[must_use = "terms are immutable; operations return a new term"] pub enum Term { @@ -89,6 +100,14 @@ pub enum Term { Automaton(FastAutomaton), } +/// The default term is the empty language (matches nothing), the identity for +/// [`union`](Term::union). See [`new_empty`](Term::new_empty). +impl Default for Term { + fn default() -> Self { + Term::new_empty() + } +} + impl Display for Term { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -98,12 +117,42 @@ impl Display for Term { } } +/// Parses a pattern into a [`Term`], so patterns can be built with +/// [`str::parse`]. 
+/// +/// # Examples +/// +/// ``` +/// use regexsolver::Term; +/// +/// let term: Term = ".*abc.*".parse().unwrap(); +/// ``` +impl FromStr for Term { + type Err = EngineError; + + fn from_str(pattern: &str) -> Result { + Term::from_pattern(pattern) + } +} + +impl From for Term { + fn from(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } +} + +impl From for Term { + fn from(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) + } +} + impl Term { /// `Term` operations manage the underlying representation themselves, so /// the determinizations they perform are by definition explicit: /// they run with the profile's `implicit_determinization` setting /// re-enabled (that knob targets direct [`FastAutomaton`] usage). The - /// rest of the profile — deadline, state budget — is preserved. + /// rest of the profile is preserved. fn run_with_implicit_determinization(f: impl FnOnce() -> R) -> R { ExecutionProfile::get() .with_implicit_determinization(true) @@ -127,7 +176,7 @@ impl Term { /// Parses and simplifies the provided pattern and returns a new [`Term`] holding the resulting [`RegularExpression`]. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -150,7 +199,7 @@ impl Term { /// Computes the concatenation of the given terms. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -159,13 +208,14 @@ impl Term { /// let term2 = Term::from_pattern("d.").unwrap(); /// let term3 = Term::from_pattern(".*").unwrap(); /// - /// let concat = term1.concat(&[term2, term3]).unwrap(); + /// let concat = term1.concat([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = concat { - /// assert_eq!("abcd.+", regex.to_string()); - /// } + /// assert_eq!("abcd.+", concat.to_pattern()); /// ``` - pub fn concat(&self, terms: &[Term]) -> Result { + pub fn concat( + &self, + terms: impl IntoIterator>, + ) -> Result { let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); let mut has_automaton = false; @@ -179,6 +229,7 @@ impl Term { } } for term in terms { + let term = term.borrow(); if has_automaton { return_automaton = return_automaton.concat(term.to_automaton()?.as_ref())?; } else { @@ -203,7 +254,7 @@ impl Term { /// Computes the union of the given terms. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -212,16 +263,20 @@ impl Term { /// let term2 = Term::from_pattern("de").unwrap(); /// let term3 = Term::from_pattern("fghi").unwrap(); /// - /// let union = term1.union(&[term2, term3]).unwrap(); + /// let union = term1.union([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = union { - /// assert_eq!("(abc|de|fghi)", regex.to_string()); - /// } + /// assert_eq!("(abc|de|fghi)", union.to_pattern()); /// ``` - pub fn union(&self, terms: &[Term]) -> Result { + pub fn union( + &self, + terms: impl IntoIterator>, + ) -> Result { + let terms: Vec<_> = terms.into_iter().collect(); + let terms: Vec<&Term> = terms.iter().map(Borrow::borrow).collect(); + let mut has_automaton = matches!(self, Term::Automaton(_)); if !has_automaton { - for term in terms { + for term in &terms { if matches!(term, Term::Automaton(_)) { has_automaton = true; break; @@ -232,7 +287,7 @@ impl Term { if has_automaton { let parallel = cfg!(feature = "parallel") && terms.len() > 3; - let automaton_list = self.get_automata(terms, parallel)?; + let automaton_list = self.get_automata(&terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); @@ -248,7 +303,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } else { let regexes_list = self - .get_regexes(terms) + .get_regexes(&terms) .expect("No automaton should be here so this operation is not supposed to fail."); let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); @@ -261,7 +316,7 @@ impl Term { /// Computes the intersection of the given terms. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -270,21 +325,25 @@ impl Term { /// let term2 = Term::from_pattern("de.*").unwrap(); /// let term3 = Term::from_pattern(".*abc").unwrap(); /// - /// let intersection = term1.intersection(&[term2, term3]).unwrap(); + /// let intersection = term1.intersection([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = intersection { - /// assert_eq!("deabc", regex.to_string()); - /// } + /// assert_eq!("deabc", intersection.to_pattern()); /// ``` - pub fn intersection(&self, terms: &[Term]) -> Result { + pub fn intersection( + &self, + terms: impl IntoIterator>, + ) -> Result { + let terms: Vec<_> = terms.into_iter().collect(); + let terms: Vec<&Term> = terms.iter().map(Borrow::borrow).collect(); + let parallel = cfg!(feature = "parallel") && terms.len() > 3; - let automaton_list = self.get_automata(terms, parallel)?; + let automaton_list = self.get_automata(&terms, parallel)?; let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); #[cfg(feature = "parallel")] - let return_automaton = if parallel { + let return_automaton = if terms.len() > 3 { FastAutomaton::intersection_all_par(automaton_list) } else { FastAutomaton::intersection_all(automaton_list) @@ -297,13 +356,7 @@ impl Term { /// Computes the difference between `self` and `other`. /// - /// Unlike [`union`](Self::union) and [`intersection`](Self::intersection) - /// this deliberately takes a single operand: difference is neither - /// associative nor commutative, so a variadic form would be ambiguous - /// (`a - b - c` could mean `(a - b) - c` or `a - (b - c)`). Chain calls — - /// or subtract a union — to remove several languages. 
- /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -313,9 +366,7 @@ impl Term { /// /// let difference = term1.difference(&term2).unwrap(); /// - /// if let Term::RegularExpression(regex) = difference { - /// assert_eq!("abc", regex.to_string()); - /// } + /// assert_eq!("abc", difference.to_pattern()); /// ``` pub fn difference(&self, other: &Term) -> Result { Self::run_with_implicit_determinization(|| { @@ -330,7 +381,7 @@ impl Term { /// Computes the complement of `self`. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -352,28 +403,35 @@ impl Term { }) } - /// Computes the repetition of the current term between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + /// Computes the repetition of the current term over the given range of + /// counts. /// - /// # Example: + /// An unbounded end (`n..`) means unlimited repetition; an unset start + /// (`..n` or `..=n`) means zero. Exclusive bounds are normalized to inclusive. 
+ /// + /// # Examples /// /// ``` /// use regexsolver::Term; /// /// let term = Term::from_pattern("abc").unwrap(); /// - /// let repeat = term.repeat(1, None).unwrap(); - /// - /// if let Term::RegularExpression(regex) = repeat { - /// assert_eq!("(abc)+", regex.to_string()); - /// } - /// - /// let repeat = term.repeat(3, Some(5)).unwrap(); - /// - /// if let Term::RegularExpression(regex) = repeat { - /// assert_eq!("(abc){3,5}", regex.to_string()); - /// } + /// assert_eq!("(abc)+", term.repeat(1..).unwrap().to_pattern()); + /// assert_eq!("(abc){3,5}", term.repeat(3..=5).unwrap().to_pattern()); + /// assert_eq!("(abc){3,5}", term.repeat(3..6).unwrap().to_pattern()); + /// assert_eq!("(abc){0,2}", term.repeat(..=2).unwrap().to_pattern()); /// ``` - pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + pub fn repeat(&self, range: impl RangeBounds) -> Result { + let min = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.saturating_add(1), + Bound::Unbounded => 0, + }; + let max_opt = match range.end_bound() { + Bound::Included(&n) => Some(n), + Bound::Excluded(&n) => Some(n.saturating_sub(1)), + Bound::Unbounded => None, + }; match self { Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( regular_expression.repeat(min, max_opt), @@ -394,13 +452,11 @@ impl Term { /// enumeration order also depends on the automaton's structure, so /// offsets are only consistent across calls made on the same term. /// - /// For reliable pagination, call [`minimize`](Self::minimize) once and - /// generate from the minimized term: it is deterministic — paths and - /// strings are then one-to-one, making pages disjoint — and its fixed - /// structure keeps offsets consistent, without re-converting the term on - /// every page. + /// For pagination without repetition or skipped strings, make the term deterministic once and generate + /// from it. 
To check if a term is deterministic use [`is_deterministic`](Self::is_deterministic). + /// To determinize run [`determinize`](Self::determinize). /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -422,12 +478,89 @@ impl Term { self.to_automaton()?.generate_strings(limit, offset) } + /// Returns a lazy iterator over the strings matched by the term, fetched in + /// batches behind the scenes so you can stop early without choosing a limit + /// up front. + /// + /// The underlying automaton is computed once at construction time, not on + /// every batch. Each item is a `Result`: a construction or generation error + /// (e.g. a timeout from the active [`ExecutionProfile`]) surfaces as an + /// `Err`, after which the iterator ends. The same determinism caveat as + /// [`generate_strings`](Self::generate_strings) applies: call + /// [`determinize`](Self::determinize) (or [`minimize`](Self::minimize)) + /// first for distinct, stable enumeration. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("(abc|de){2}").unwrap().minimize().unwrap(); + /// + /// // Take the first three matches lazily. + /// let first_three = term + /// .iter_strings() + /// .take(3) + /// .collect::, _>>() + /// .unwrap(); + /// assert_eq!(3, first_three.len()); + /// ``` + pub fn iter_strings(&self) -> StringGenerator<'_> { + match self.to_automaton() { + Ok(automaton) => StringGenerator { + automaton: Some(automaton), + pending_error: None, + offset: 0, + buffer: VecDeque::new(), + }, + Err(e) => StringGenerator { + automaton: None, + pending_error: Some(e), + offset: 0, + buffer: VecDeque::new(), + }, + } + } + + /// Returns an equivalent term backed by a deterministic automaton. + /// + /// Already-deterministic terms are returned as-is. 
+ /// + /// Determinization is always explicit, so it runs regardless of the + /// profile's [`implicit_determinization`](crate::execution_profile::ExecutionProfileBuilder::implicit_determinization) + /// setting. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern(".*abc").unwrap(); + /// assert!(!term.is_deterministic()); + /// + /// let dfa = term.determinize().unwrap(); + /// assert!(dfa.is_deterministic()); + /// assert!(term.equivalent(&dfa).unwrap()); + /// ``` + pub fn determinize(&self) -> Result { + let automaton = self.to_automaton()?; + let determinized = automaton.determinize()?.into_owned(); + Ok(Term::Automaton(determinized)) + } + /// Returns an equivalent term backed by the minimal deterministic /// automaton. /// - /// Useful before paginating with - /// [`generate_strings`](Self::generate_strings) (see there), or to - /// compact a term after a chain of operations. + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern(".*abc").unwrap(); + /// let minimal = term.minimize().unwrap(); + /// assert!(minimal.is_minimal()); + /// assert!(term.equivalent(&minimal).unwrap()); + /// ``` pub fn minimize(&self) -> Result { Self::run_with_implicit_determinization(|| { let mut automaton = self.to_automaton()?.into_owned(); @@ -438,7 +571,7 @@ impl Term { /// Returns `true` if both terms accept the same language. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -462,7 +595,7 @@ impl Term { /// Returns `true` if all strings matched by the current term are also matched by the given term. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; @@ -484,7 +617,40 @@ impl Term { }) } - /// Checks if the term matches the empty language. + /// Returns `true` if the term matches the given string. 
+ /// + /// Matching is **anchored** (full-string), consistent with the rest of the + /// crate: the whole input must be accepted, not just a substring. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("abc.*").unwrap(); + /// + /// assert!(term.matches("abcdef").unwrap()); + /// assert!(!term.matches("xyzabc").unwrap()); + /// ``` + pub fn matches(&self, input: &str) -> Result { + Ok(self.to_automaton()?.is_match(input)) + } + + /// Returns `true` if the term matches the empty language (no strings at all). + /// + /// Note: the empty language is distinct from the language containing only + /// the empty string `""`. Use [`is_empty_string`](Self::is_empty_string) to + /// test for the latter. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::new_empty().is_empty().unwrap()); + /// assert!(!Term::new_empty_string().is_empty().unwrap()); // matches "" + /// assert!(!Term::from_pattern("abc").unwrap().is_empty().unwrap()); + /// ``` pub fn is_empty(&self) -> Result { Ok(match self { Term::RegularExpression(regex) => regex.is_empty(), @@ -492,7 +658,7 @@ impl Term { }) } - /// Checks if the term matches all possible strings. + /// Returns `true` if the term matches all possible strings. pub fn is_total(&self) -> Result { match self { Term::RegularExpression(regex) => Ok(regex.is_total()), @@ -502,15 +668,23 @@ impl Term { } else if automaton.is_deterministic() { Ok(false) } else { - // `Term` manages the representation itself: this is an - // explicit determinization, never gated by the profile. Ok(automaton.determinize()?.is_total()) } } } } - /// Checks if the term matches only the empty string `""`. + /// Returns `true` if the term matches only the empty string `""`. 
+ /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::new_empty_string().is_empty_string().unwrap()); + /// assert!(!Term::new_empty().is_empty_string().unwrap()); + /// assert!(!Term::from_pattern("a*").unwrap().is_empty_string().unwrap()); + /// ``` pub fn is_empty_string(&self) -> Result { Ok(match self { Term::RegularExpression(regex) => regex.is_empty_string(), @@ -518,7 +692,39 @@ impl Term { }) } + /// Returns `true` if the term is *already backed by* a deterministic + /// automaton. + /// + /// A deterministic automaton has one path per accepted string. + /// + /// To determinize a term call [`determinize`](Self::determinize). + #[must_use] + pub fn is_deterministic(&self) -> bool { + match self { + Term::RegularExpression(_) => false, + Term::Automaton(automaton) => automaton.is_deterministic(), + } + } + + /// Returns `true` if the term is *already backed by* the minimal + /// deterministic automaton. + /// + /// The minimal deterministic automaton of a given language is unique. + /// + /// To minimize a term call [`minimize`](Self::minimize). + #[must_use] + pub fn is_minimal(&self) -> bool { + match self { + Term::RegularExpression(_) => false, + Term::Automaton(automaton) => automaton.is_minimal(), + } + } + /// Returns the minimum and maximum length of matched strings. + /// + /// `None` for the minimum means the language is empty (no strings are + /// matched). `None` for the maximum means the language is infinite + /// (unbounded match length). #[must_use] pub fn get_length(&self) -> (Option, Option) { match self { @@ -527,7 +733,11 @@ impl Term { } } - /// Returns the cardinality of the term (i.e., the number of possible matched strings). + /// Returns the cardinality of the term (the number of possible matched strings). + /// + /// The exact count is represented as `u32`. If the exact count exceeds + /// `u32::MAX`, the result is `Cardinality::BigInteger` rather than a + /// truncated value. 
Infinite languages return `Cardinality::Infinite`. pub fn get_cardinality(&self) -> Result, EngineError> { match self { Term::RegularExpression(regex) => Ok(regex.get_cardinality()), @@ -537,7 +747,29 @@ impl Term { } } + /// Returns `true` if the term matches a finite number of strings. + /// + /// A finite language is one with no unbounded repetition (`*`, `+`, ...). + /// Convenience over [`get_cardinality`](Self::get_cardinality) when only the + /// finite/infinite distinction matters. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::from_pattern("(ab|c){2}").unwrap().is_finite().unwrap()); + /// assert!(!Term::from_pattern("a+").unwrap().is_finite().unwrap()); + /// ``` + pub fn is_finite(&self) -> Result { + Ok(!matches!(self.get_cardinality()?, Cardinality::Infinite)) + } + /// Converts the term to a [`FastAutomaton`]. + /// + /// Returns a [`Cow`]: borrows the automaton when the term is already + /// automaton-backed, and allocates a new one when converting from a + /// [`RegularExpression`]. pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -546,6 +778,10 @@ impl Term { } /// Converts the term to a [`RegularExpression`]. + /// + /// Returns a [`Cow`]: borrows the expression when the term is already + /// regex-backed, and allocates a new one when converting from a + /// [`FastAutomaton`] via state elimination. 
#[must_use] pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { @@ -562,7 +798,7 @@ impl Term { fn get_automata<'a>( &'a self, - terms: &'a [Term], + terms: &[&'a Term], parallel: bool, ) -> Result>, EngineError> { let mut automaton_list = Vec::with_capacity(terms.len() + 1); @@ -578,7 +814,7 @@ impl Term { } else { terms .iter() - .map(Term::to_automaton) + .map(|a| a.to_automaton()) .collect::, _>>() }?; #[cfg(not(feature = "parallel"))] @@ -586,7 +822,7 @@ impl Term { let _ = parallel; terms .iter() - .map(Term::to_automaton) + .map(|a| a.to_automaton()) .collect::, EngineError>>()? }; automaton_list.append(&mut terms_automata); @@ -594,17 +830,60 @@ impl Term { Ok(automaton_list) } - fn get_regexes<'a>(&'a self, terms: &'a [Term]) -> Option>> { + fn get_regexes<'a>(&'a self, terms: &[&'a Term]) -> Option>> { let mut regex_list = Vec::with_capacity(terms.len() + 1); regex_list.push(self.to_regex()); - let mut terms_regexes = terms.iter().map(Term::to_regex).collect::>(); + let mut terms_regexes = terms.iter().map(|a| a.to_regex()).collect::>(); regex_list.append(&mut terms_regexes); Some(regex_list) } } +/// Lazy iterator over the strings matched by a [`Term`], created by +/// [`Term::iter_strings`]. +/// +/// The underlying automaton is computed once at construction. Yields +/// `Result`: errors (from construction or generation) +/// are surfaced as `Err` items, after which the iterator ends. 
+pub struct StringGenerator<'a> { + automaton: Option>, + pending_error: Option, + offset: usize, + buffer: VecDeque, +} + +impl Iterator for StringGenerator<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + const BATCH: usize = 32; + + if let Some(s) = self.buffer.pop_front() { + return Some(Ok(s)); + } + if let Some(e) = self.pending_error.take() { + return Some(Err(e)); + } + let automaton = self.automaton.as_ref()?; + match automaton.generate_strings(BATCH, self.offset) { + Ok(batch) => { + if batch.len() < BATCH { + self.automaton = None; + } + self.offset += batch.len(); + self.buffer.extend(batch); + self.buffer.pop_front().map(Ok) + } + Err(e) => { + self.automaton = None; + Some(Err(e)) + } + } + } +} + #[cfg(test)] mod tests { use crate::regex::RegularExpression; @@ -618,7 +897,7 @@ mod tests { let complement = term.complement().unwrap(); assert!( - term.intersection(std::slice::from_ref(&complement)) + term.intersection([&complement]) .unwrap() .is_empty() .unwrap() @@ -708,4 +987,204 @@ mod tests { Ok(()) } + + #[test] + fn test_default_is_empty_language() { + assert!(Term::default().is_empty().unwrap()); + assert_eq!(Term::default(), Term::new_empty()); + } + + #[test] + fn test_iter_strings_exhaustive_matches_generate_strings() { + // A finite, deterministic term: lazy iteration must yield exactly the + // same multiset as a single large `generate_strings` call, with no + // duplicates or omissions across batch boundaries. 
+ let term = Term::from_pattern("[A-Za-z0-9]") + .unwrap() + .minimize() + .unwrap(); + + let eager = term.generate_strings(1000, 0).unwrap(); + let lazy = term.iter_strings().collect::, _>>().unwrap(); + + assert_eq!(eager.len(), lazy.len()); + assert_eq!(eager, lazy); + assert_eq!(62, lazy.len()); + } + + #[test] + fn test_is_finite() { + assert!( + Term::from_pattern("(ab|c){2}") + .unwrap() + .is_finite() + .unwrap() + ); + assert!(!Term::from_pattern("a+").unwrap().is_finite().unwrap()); + } + + #[test] + fn test_matches_is_anchored() { + let term = Term::from_pattern("abc.*").unwrap(); + assert!(term.matches("abc").unwrap()); + assert!(term.matches("abcdef").unwrap()); + // Anchored: a prefix/suffix match is not enough. + assert!(!term.matches("xyzabc").unwrap()); + + let exact = Term::from_pattern("abc").unwrap(); + assert!(exact.matches("abc").unwrap()); + assert!(!exact.matches("abcd").unwrap()); + + // Works on an automaton-backed term too. + let automaton_backed = exact.intersection([&term]).unwrap(); + assert!(matches!(automaton_backed, Term::Automaton(_))); + assert!(automaton_backed.matches("abc").unwrap()); + assert!(!automaton_backed.matches("abcd").unwrap()); + + // The empty language matches nothing; the empty string matches only "". + assert!(!Term::new_empty().matches("").unwrap()); + assert!(Term::new_empty_string().matches("").unwrap()); + assert!(!Term::new_empty_string().matches("a").unwrap()); + } + + #[test] + fn test_from_str_and_from_conversions() { + // `FromStr` agrees with `from_pattern`. + let parsed: Term = "abc".parse().unwrap(); + assert_eq!(parsed, Term::from_pattern("abc").unwrap()); + + // Invalid patterns surface as parse errors (backreferences are not regular). + assert!(r"(a)\1".parse::().is_err()); + + // `From` / `From` match the explicit constructors. 
+ let regex = RegularExpression::new("abc").unwrap(); + let from_into: Term = regex.clone().into(); + assert_eq!(from_into, Term::from_regex(regex)); + + let automaton = Term::from_pattern("abc") + .unwrap() + .to_automaton() + .unwrap() + .into_owned(); + let from_into: Term = automaton.clone().into(); + assert_eq!(from_into, Term::from_automaton(automaton)); + } + + #[test] + fn test_is_deterministic_and_determinize() { + // A pattern-backed term is never reported deterministic (NFA form). + let regex_term = Term::from_pattern("(abc|de){2}").unwrap(); + assert!(!regex_term.is_deterministic()); + + // `determinize` produces a deterministic, language-equivalent term. + let dfa = regex_term.determinize().unwrap(); + assert!(dfa.is_deterministic()); + assert!(regex_term.equivalent(&dfa).unwrap()); + + // Determinizing an already-deterministic term keeps it deterministic + // and equivalent. + let dfa2 = dfa.determinize().unwrap(); + assert!(dfa2.is_deterministic()); + assert!(dfa.equivalent(&dfa2).unwrap()); + } + + #[test] + fn test_is_minimal_and_minimize() { + // A pattern-backed term is never reported minimal. + let regex_term = Term::from_pattern("(abc|de){2}").unwrap(); + assert!(!regex_term.is_minimal()); + + // `minimize` produces a minimal, language-equivalent term. + let minimal = regex_term.minimize().unwrap(); + assert!(minimal.is_minimal()); + assert!(minimal.is_deterministic()); // minimal implies deterministic + assert!(regex_term.equivalent(&minimal).unwrap()); + } + + #[test] + fn test_eq_is_structural_not_language() { + // Same language, different representation: structurally unequal, but + // language-equivalent. `==` must not be mistaken for `equivalent`. 
+ let regex_term = Term::from_pattern("(a|b)*").unwrap(); + let automaton_term = Term::from_automaton(regex_term.to_automaton().unwrap().into_owned()); + + assert_ne!(regex_term, automaton_term); + assert!(regex_term.equivalent(&automaton_term).unwrap()); + } + + #[test] + fn test_repeat_range_edges() { + let term = Term::from_pattern("abc").unwrap(); + + // Unbounded / unset bounds. + assert_eq!("(abc)*", term.repeat(..).unwrap().to_pattern()); + assert_eq!("(abc){2,}", term.repeat(2..).unwrap().to_pattern()); + assert_eq!("(abc){0,2}", term.repeat(..3).unwrap().to_pattern()); + + // Zero repetitions is the empty string. + assert!(term.repeat(0..=0).unwrap().is_empty_string().unwrap()); + + // A range whose normalized max < min denotes no valid repetition count, + // so the simplifier reduces it to the empty language (matches nothing). + // (Bounds from variables: a literal reversed range trips a lint.) + let (min, max) = (5u32, 3u32); + assert!(term.repeat(min..max).unwrap().is_empty().unwrap()); + } + + #[test] + fn test_iter_strings_is_lazy_on_infinite_language() { + // Must not hang on an infinite language: take a finite prefix. + let term = Term::from_pattern("a+").unwrap(); + let first = term + .iter_strings() + .take(5) + .collect::, _>>() + .unwrap(); + assert_eq!(5, first.len()); + } + + #[test] + fn test_iter_strings_propagates_error_then_ends() { + use crate::execution_profile::ExecutionProfileBuilder; + + // A tight state budget makes the underlying `to_automaton` fail; the + // iterator must surface that error once and then terminate. 
+ let term = Term::from_pattern("abcdef").unwrap(); + let profile = ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build(); + + profile.run(|| { + let mut it = term.iter_strings(); + assert!(matches!( + it.next(), + Some(Err(EngineError::AutomatonHasTooManyStates)) + )); + assert!(it.next().is_none()); + }); + } + + #[test] + fn test_variadic_ops_with_no_operands_equal_self() { + let term = Term::from_pattern("abc").unwrap(); + + assert!( + term.concat(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); + assert!( + term.union(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); + assert!( + term.intersection(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); + } } diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index e9c62fe..d91a115 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -108,7 +108,7 @@ impl AbstractNFAMetadata { // An automaton always has at least one state. Degenerate // sub-expressions denoting {""} (e.g. an unsimplified // `(a{0,0})*`) reach this point with a single state, and - // the merge discount must not drive the count to zero — + // the merge discount must not drive the count to zero; // every later `- 1` in this module relies on counts // staying >= 1. (self.number_of_states - 1).max(1) @@ -318,8 +318,8 @@ mod tests { } // Regression: directly-constructed (unsimplified) repetitions over {""} - // sub-expressions — shapes the string parser simplifies away but any user - // of the public enum can build — used to drive the abstract state count + // sub-expressions (shapes the string parser simplifies away but any user + // of the public enum can build) used to drive the abstract state count // to zero, after which the merge discounts underflowed and panicked. 
#[test] fn degenerate_repetitions_do_not_underflow() { diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 6edf4af..8a20183 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -30,8 +30,8 @@ impl RegularExpression { /// Strips inline flag groups like `(?i)`, `(?m-s)` or `(?-s)` from the /// pattern: the engine treats all characters uniformly, so the flags are /// meaningless here. Equivalent to deleting every match of - /// `\(\?[imsx]*-?[imsx]*\)`; anything else — including non-capturing - /// groups `(?:...)` — is left untouched. + /// `\(\?[imsx]*-?[imsx]*\)`; anything else (including non-capturing + /// groups `(?:...)`) is left untouched. fn remove_flags(regex: &str) -> String { let bytes = regex.as_bytes(); let mut result = String::with_capacity(regex.len()); @@ -226,7 +226,7 @@ mod tests { // Regression (found by the proptest generators): singleton // Alternation/Concat wrappers print transparently, so quantified - // expressions must be parenthesized by looking through them — + // expressions must be parenthesized by looking through them: // `((.a))*` used to print as `.a*` instead of `(.a)*`, changing the // language. #[test] diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 5e1c8b1..bcd4575 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -12,7 +12,7 @@ mod analyze; mod builder; mod operation; -/// Represent a regular expression. +/// Represents a regular expression. /// /// The variants are public and freely constructible and matchable. Values /// can also be built with the parser ([`new`](Self::new) / @@ -122,9 +122,9 @@ impl RegularExpression { RegularExpression::Repetition(..) => true, RegularExpression::Concat(parts) => match parts.len() { 1 => Self::quantifier_needs_parens(&parts[0]), - // Covers both the empty concatenation — which prints as "" - // and needs the explicit group, `()*` is valid but a bare - // `*` is not — and real multi-part concatenations. 
+ // Covers both the empty concatenation (which prints as "" + // and needs the explicit group; `()*` is valid but a bare + // `*` is not) and real multi-part concatenations. _ => true, }, RegularExpression::Alternation(parts) => match parts.len() { @@ -137,7 +137,7 @@ impl RegularExpression { } } - /// Checks if the regular expression matches the empty language. + /// Returns `true` if the regular expression matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -146,7 +146,7 @@ impl RegularExpression { } } - /// Checks if the regular expression only matches the empty string `""`. + /// Returns `true` if the regular expression matches only the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -154,7 +154,7 @@ impl RegularExpression { } } - /// Checks if the regular expression matches all possible strings. + /// Returns `true` if the regular expression matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 30b4bee..4861aa5 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -166,8 +166,8 @@ impl RegularExpression { } /// Merges the bounds of two adjacent repetitions of the same expression, - /// `r{a,b}r{c,d}` → `r{a+c,b+d}`. Returns `None` — "cannot be merged", - /// falling back to plain concatenation — when an addition would overflow. + /// `r{a,b}r{c,d}` → `r{a+c,b+d}`. Returns `None` ("cannot be merged", + /// falling back to plain concatenation) when an addition would overflow. 
fn merge_repetition_bounds( this_min: u32, this_max_opt: &Option, diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index fdc731f..a8096f5 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -281,8 +281,8 @@ impl RegularExpression { } else { // At least one side is unbounded. The union collapses to // r{min(m1,m2),} only when the ranges overlap or are - // adjacent — i.e. the unbounded side starts no later than - // one past the bounded side's end. Otherwise there is a + // adjacent (i.e. the unbounded side starts no later than + // one past the bounded side's end). Otherwise there is a // gap (e.g. a? ∪ a{3,} must NOT become a*). let mergeable = match (this_max_opt, that_max_opt) { (None, None) => true, diff --git a/tests/readme_examples.rs b/tests/readme_examples.rs index c4e1f99..331c071 100644 --- a/tests/readme_examples.rs +++ b/tests/readme_examples.rs @@ -33,9 +33,12 @@ fn readme_hero_example() -> Result<(), EngineError> { let b = Term::from_pattern(".*xy")?; // Which strings match BOTH patterns? Get the answer as a regex: - let both = a.intersection(&[b])?; + let both = a.intersection([&b])?; assert_eq!(both.to_pattern(), "(ab|xy)xy"); + // Test a concrete string against the result (matching is anchored): + assert!(both.matches("abxy")?); + // ...and sample them: assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); diff --git a/tests/state_elimination_quality.rs b/tests/state_elimination_quality.rs new file mode 100644 index 0000000..5a99b56 --- /dev/null +++ b/tests/state_elimination_quality.rs @@ -0,0 +1,73 @@ +//! Measures the quality of automaton→regex conversion (state elimination) +//! over the shared corpus. Not a pass/fail test of absolute numbers — it +//! prints aggregate metrics so a heuristic change can be compared before/after +//! (`cargo test --test state_elimination_quality -- --ignored --nocapture`), +//! while still asserting that every conversion round-trips (correctness). 
+ +use std::{ + fs::File, + io::{BufRead, BufReader}, +}; + +use regexsolver::regex::RegularExpression; + +#[test] +#[ignore = "measurement harness; run explicitly with --ignored --nocapture"] +fn measure_state_elimination_quality() { + let file = File::open("tests/data/regex.txt").unwrap(); + let reader = BufReader::new(file); + + let mut count = 0usize; + let mut total_complexity_nfa = 0.0f64; + let mut total_len_nfa = 0usize; + let mut total_complexity_dfa = 0.0f64; + let mut total_len_dfa = 0usize; + + for line in reader.lines() { + let line = line.unwrap(); + if line.trim().is_empty() { + continue; + } + let input = match RegularExpression::parse(&line, true) { + Ok(r) => r, + Err(_) => continue, + }; + let automaton = input.to_automaton().unwrap(); + + // NFA-derived conversion. + let out_nfa = automaton.to_regex(); + assert!( + automaton + .equivalent(&out_nfa.to_automaton().unwrap()) + .unwrap(), + "NFA round-trip mismatch for {line:?} -> {out_nfa}" + ); + total_complexity_nfa += out_nfa.evaluate_complexity(); + total_len_nfa += out_nfa.to_string().chars().count(); + + // DFA-derived conversion. 
+ let dfa = automaton.determinize().unwrap(); + let out_dfa = dfa.to_regex(); + assert!( + dfa.equivalent(&out_dfa.to_automaton().unwrap()).unwrap(), + "DFA round-trip mismatch for {line:?} -> {out_dfa}" + ); + total_complexity_dfa += out_dfa.evaluate_complexity(); + total_len_dfa += out_dfa.to_string().chars().count(); + + count += 1; + } + + println!("=== state elimination quality over {count} patterns ==="); + println!( + "NFA: total_complexity = {total_complexity_nfa:.3}, total_len = {total_len_nfa}" + ); + println!( + "DFA: total_complexity = {total_complexity_dfa:.3}, total_len = {total_len_dfa}" + ); + println!( + "SUM: total_complexity = {:.3}, total_len = {}", + total_complexity_nfa + total_complexity_dfa, + total_len_nfa + total_len_dfa + ); +} From 7fd08dc550ef3ce5936a1d78e4fe8c14e01e9cad Mon Sep 17 00:00:00 2001 From: Alexandre van Beurden <1949482+alexvbrdn@users.noreply.github.com> Date: Sun, 14 Jun 2026 11:36:50 +0200 Subject: [PATCH 62/62] Big refactoring --- .github/workflows/rust.yml | 18 ++- CHANGELOG.md | 105 ++++++++++++++++++ Cargo.toml | 1 + README.md | 14 +-- benches/operations.rs | 12 +- src/execution_profile.rs | 18 +-- src/fast_automaton/analyze/cardinality.rs | 23 ++-- src/fast_automaton/analyze/equivalence.rs | 1 + src/fast_automaton/analyze/length.rs | 32 +++--- src/fast_automaton/analyze/mod.rs | 8 +- src/fast_automaton/analyze/subset.rs | 1 + src/fast_automaton/builder.rs | 14 +-- src/fast_automaton/condition/converter.rs | 18 +-- .../condition/fast_bit_vec/mod.rs | 2 +- src/fast_automaton/condition/mod.rs | 69 +++++------- src/fast_automaton/convert/to_regex/mod.rs | 1 + .../to_regex/state_elimination/builder.rs | 24 ++-- src/fast_automaton/generate.rs | 7 +- src/fast_automaton/mod.rs | 21 ++-- src/fast_automaton/operation/concat.rs | 17 +-- src/fast_automaton/operation/determinize.rs | 15 +-- src/fast_automaton/operation/difference.rs | 10 +- src/fast_automaton/operation/intersection.rs | 11 +- 
src/fast_automaton/operation/minimize.rs | 11 +- src/fast_automaton/operation/mod.rs | 6 +- src/fast_automaton/operation/repeat.rs | 9 +- src/fast_automaton/operation/union.rs | 14 ++- src/fast_automaton/spanning_set/mod.rs | 10 +- src/lib.rs | 73 ++++++++---- src/regex/analyze/mod.rs | 34 +++--- src/regex/analyze/number_of_states.rs | 4 +- src/regex/builder.rs | 1 + src/regex/mod.rs | 1 + src/regex/operation/concat.rs | 8 +- src/regex/operation/repeat.rs | 1 + src/regex/operation/simplify.rs | 1 + src/regex/operation/union.rs | 8 +- tests/proptest_strategies.rs | 26 ++--- tests/readme_examples.rs | 4 +- tests/state_elimination_quality.rs | 8 +- 40 files changed, 403 insertions(+), 258 deletions(-) create mode 100644 CHANGELOG.md diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8373213..929d3d3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -8,6 +8,7 @@ on: env: CARGO_TERM_COLOR: always + RUSTDOCFLAGS: -D warnings jobs: build: @@ -15,13 +16,26 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Format + run: cargo fmt --all -- --check - name: Build run: cargo build --verbose - name: Test & Lint run: | cargo test - cargo clippy + cargo clippy -- -D warnings - name: Test & Lint (no default features) run: | cargo test --no-default-features - cargo clippy --no-default-features + cargo clippy --no-default-features -- -D warnings + - name: Docs + run: cargo doc --no-deps + + audit: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - uses: rustsec/audit-check@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..859a95c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,105 @@ +# Changelog + +All notable changes to this crate are documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ + ## [Unreleased] - upcoming 1.0.0 + + This is a major redesign of the public API around the `Term` enum (wrapping + either a `RegularExpression` or a `FastAutomaton`), which dispatches each + operation to the cheaper representation when possible. See the crate-level + docs for the architecture. Almost the entire public surface changed; highlights + below. + + ### Added + - New `Term` constructors and conversions: `new_empty`, `new_total`, + `new_empty_string`, `from_pattern`, `from_regex(RegularExpression)`, + `from_automaton(FastAutomaton)`, plus `From<RegularExpression>`, + `From<FastAutomaton>`, `FromStr`, `Display`, and `Default` (= `new_empty`). + - New `Term` operations: `concat`, `complement`, `determinize`, `minimize`, + `matches`, `is_deterministic`, `is_minimal`, `is_finite`, `to_pattern`, and + `iter_strings` (lazy `StringGenerator` iterator). + - `to_regex`/`to_automaton` now return `Cow` to avoid unnecessary cloning. + - `FastAutomaton` gained corresponding low-level constructors/operations + (`new_empty`, `new_total`, `new_empty_string`, `determinize`, `minimize` + using Hopcroft's algorithm, `is_minimal`, `unaccept`, `print_dot`, + `try_add_transition`) and inspection helpers (`states`, `direct_states`, + `transitions_from`, `transitions_to_vec`, `has_transition`, ...). + - New `EngineError` variants: `InvalidRepetitionBounds`, + `IncompatibleSpanningSet`, `DeterministicAutomatonRequired`; the enum is + now `#[non_exhaustive]`. + - `tracing` instrumentation on the core `Term`, `FastAutomaton`, and + `RegularExpression` operations (concat, union, intersection, difference, + complement, repeat, determinize, minimize, equivalence/subset checks, + string generation, conversions). No-op unless a `tracing` subscriber is + installed. + - Parallel (Rayon-backed) variants of union/intersection for >3 operands, + gated behind the default-on `parallel` feature, with sequential fallbacks + for `--no-default-features`.
+ - `cargo fmt --check`, `cargo clippy -- -D warnings`, `cargo doc --no-deps` + (with `RUSTDOCFLAGS=-D warnings`), and a dependency vulnerability audit + (`rustsec/audit-check`) to CI. + + ### Changed + - `ExecutionProfile` redesigned as an immutable, thread-local-aware config + built via the new `ExecutionProfileBuilder`, governing execution timeouts, + state-count limits, and an `implicit_determinization` toggle. + - `union`/`intersection`/`concat` now take + `impl IntoIterator<Item = impl Borrow<Term>>` instead of `&[Term]`, so + `&[a, b]`, `[&a, &b]`, and `Vec<Term>` all work without cloning. + - `repeat` now takes `impl RangeBounds<u32>` (e.g. `3..6`, `..=2`) instead of + explicit min/max parameters. + - `generate_strings` now takes `(limit, offset)` for pagination instead of a + single `count`. + - `is_empty`, `is_total`, and `is_empty_string` now return + `Result<bool, EngineError>` instead of `bool`. + - `are_equivalent`/`is_subset_of` renamed to `equivalent`/`subset`. + - `subtraction` renamed to `difference`, kept single-operand by design. + - Renamed `FastAutomaton::as_dot` to `to_dot` (old printing `to_dot` is now + `print_dot`), matching the crate's `to_*` convention for allocating + conversions (`to_pattern`, `to_regex`, `to_automaton`, `to_range`).
+- Renamed `get_*` accessors to drop the `get_` prefix, per the Rust API + Guidelines' C-GETTER convention: `Term::get_length` to `length`, + `Term::get_cardinality` to `cardinality`, `FastAutomaton::get_length` to + `length`, `FastAutomaton::get_cardinality` to `cardinality`, + `FastAutomaton::get_number_of_states` to `number_of_states`, + `FastAutomaton::get_condition` to `condition`, + `FastAutomaton::get_start_state` to `start_state`, + `FastAutomaton::get_accept_states` to `accept_states`, + `FastAutomaton::get_spanning_set` to `spanning_set`, + `FastAutomaton::get_live_states` to `live_states`, + `FastAutomaton::get_spanning_bases` to `spanning_bases`, + `RegularExpression::get_length` to `length`, + `RegularExpression::get_cardinality` to `cardinality`, + `SpanningSet::get_spanning_ranges` to `spanning_ranges`, + `SpanningSet::get_number_of_spanning_ranges` to + `number_of_spanning_ranges`, `SpanningSet::get_spanning_range` to + `spanning_range`, `SpanningSet::get_rest` to `rest`, + `Condition::get_cardinality` to `cardinality`, + `Condition::get_binary_representation` to `binary_representation`, + `ConditionConverter::get_from_spanning_set`/`get_to_spanning_set` to + `from_spanning_set`/`to_spanning_set`. +- Edition bumped to 2024 and `Cargo.toml` metadata (`description`, + `categories`) updated. + +### Removed +- The `serde` feature and all serialization, FAIR (base85) encoding, + encryption, and compression support (`serde`, `ciborium`, `z85`, + `aes-gcm-siv`, `sha2`, `flate2` dependencies). +- `Term::get_details` and the `Details` type. +- The `tokenizer` module. +- Unused `log`, `rand`, and `lazy_static` dependencies, and the `regex` + crate dependency (now dev-only, used by integration tests). +- `EngineError` variants `AutomatonShouldBeDeterministic`, `TooMuchTerms`, + `ConditionIndexOutOfBound`, `TokenError`, and the `is_server_error` method. +- The `max_number_of_terms` execution-profile limit (no longer enforced). 
+ +## Earlier releases + +Releases prior to 1.0.0 (`v0.1.0` through `v0.3.1`) predate this changelog; +see the [GitHub tags](https://github.com/RegexSolver/regexsolver/tags) and +commit history for details. + +[Unreleased]: https://github.com/RegexSolver/regexsolver/compare/v0.3.1...HEAD diff --git a/Cargo.toml b/Cargo.toml index 6bc045a..31f95f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ description = "High-performance Rust library for building, combining, and analyz readme = "README.md" [dependencies] +tracing = "0.1" nohash-hasher = "0.2" ahash = "0.8.11" regex-syntax = "0.8.5" diff --git a/README.md b/README.md index 3a831db..6a7e523 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); Under the hood, every pattern compiles to a finite automaton:

the minimal automaton of (ab|cd)*

-

(ab|cd)* compiled to its minimal automaton, generated with this library's as_dot()

+

(ab|cd)* compiled to its minimal automaton, generated with this library's to_dot()

## Try it @@ -87,7 +87,7 @@ RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/re | `difference(&self, other)` / `complement(&self)` | What `self` matches and `other` doesn't / everything `self` doesn't match. | | `concat(&self, terms)` / `repeat(&self, range)` | Sequence and repeat languages; `range` is any Rust range expression (`2..=5`, `1..`, `..3`, ...). | | `equivalent(&self, other)` / `subset(&self, other)` | Compare languages. | -| `is_empty()` / `is_total()` / `get_length()` / `get_cardinality()` | Analyze a language: matches nothing? everything? string lengths? how many strings? | +| `is_empty()` / `is_total()` / `length()` / `cardinality()` | Analyze a language: matches nothing? everything? string lengths? how many strings? | | `generate_strings(limit, offset)` | Enumerate matching strings eagerly (call `minimize()` once first when paginating). | | `iter_strings()` | Lazy iterator equivalent; computes the automaton once and yields strings in batches. | | `to_pattern()` / `to_automaton()` / `to_regex()` | Convert back out. | @@ -121,7 +121,7 @@ assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*"); Internally, transition labels are bitvector `Condition`s over the automaton's `SpanningSet` of disjoint character ranges, that is what makes label union/intersection/complement O(1) ([article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation)). `add_transition_from_range` maintains that representation for you; for full manual control over conditions and spanning sets, see the [`add_transition` documentation](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html#method.add_transition). 
-Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html), including `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex`, plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `as_dot`, ...). +Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html), including `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex`, plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `to_dot`, ...). ### Working with patterns as ASTs @@ -135,10 +135,10 @@ use regexsolver::regex::RegularExpression; let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?; // How long can matching ids get? Size your database column accordingly. -assert_eq!(pattern.get_length(), (Some(13), Some(15))); +assert_eq!(pattern.length(), (Some(13), Some(15))); // How many distinct ids does the pattern allow? -assert_eq!(pattern.get_cardinality(), Cardinality::Integer(111_000_000)); +assert_eq!(pattern.cardinality(), Cardinality::Integer(111_000_000)); // The AST is a plain enum: walk it to lint patterns, e.g. reject // validation rules that accept unboundedly long input. @@ -158,7 +158,7 @@ assert!(has_unbounded_repetition(&RegularExpression::new(".*@example\\.com")?)); The variants are freely constructible too; a hand-built repetition whose maximum is below its minimum denotes no valid language and is rejected with `EngineError::InvalidRepetitionBounds` when converted by `to_automaton()`. 
-Parsing (`new`, `parse`), the simplifying combinators (`concat`, `union`, `repeat`, `simplify`) and the analyses (`get_length`, `get_cardinality`, `evaluate_complexity`) are documented on [`RegularExpression`](https://docs.rs/regexsolver/latest/regexsolver/regex/enum.RegularExpression.html). +Parsing (`new`, `parse`), the simplifying combinators (`concat`, `union`, `repeat`, `simplify`) and the analyses (`length`, `cardinality`, `evaluate_complexity`) are documented on [`RegularExpression`](https://docs.rs/regexsolver/latest/regexsolver/regex/enum.RegularExpression.html). ## Bound Execution @@ -201,7 +201,7 @@ execution_profile.run(|| { ### Disabling Implicit Determinization -`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `get_cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working since that layer manages the underlying representation itself, so its determinizations count as explicit. +`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working since that layer manages the underlying representation itself, so its determinizations count as explicit. 
```rust use regexsolver::execution_profile::ExecutionProfileBuilder; diff --git a/benches/operations.rs b/benches/operations.rs index 77ee226..46ebc36 100644 --- a/benches/operations.rs +++ b/benches/operations.rs @@ -187,17 +187,13 @@ fn bench_analyze(c: &mut Criterion) { let mut group = c.benchmark_group("analyze"); let finite = dfa("[a-z]{1,6}"); - group.bench_function("get_length/finite", |b| { - b.iter(|| black_box(&finite).get_length()) - }); - group.bench_function("get_cardinality/finite", |b| { - b.iter(|| black_box(&finite).get_cardinality().unwrap()) + group.bench_function("length/finite", |b| b.iter(|| black_box(&finite).length())); + group.bench_function("cardinality/finite", |b| { + b.iter(|| black_box(&finite).cardinality().unwrap()) }); let infinite = automaton(LARGE.1); - group.bench_function("get_length/large", |b| { - b.iter(|| black_box(&infinite).get_length()) - }); + group.bench_function("length/large", |b| b.iter(|| black_box(&infinite).length())); group.finish(); } diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 5b41c63..8eeb334 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -45,7 +45,7 @@ use crate::error::EngineError; /// /// [`FastAutomaton`](crate::fast_automaton::FastAutomaton) operations that /// require a deterministic automaton (`minimize`, `complement`, -/// `difference`, `equivalent`, `subset`, `get_cardinality`, ...) +/// `difference`, `equivalent`, `subset`, `cardinality`, ...) /// determinize a non-deterministic input on their own by default. 
Since /// subset construction can blow up exponentially, this can be disabled; /// those operations then fail fast and determinization only happens through @@ -405,7 +405,7 @@ mod tests { let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.add_transition(0, s1, &cond); a.add_transition(0, s2, &cond); a.accept(s1); @@ -432,7 +432,7 @@ mod tests { assert_eq!(dfa.difference(&nfa).unwrap_err(), err); assert_eq!(nfa.equivalent(&dfa).unwrap_err(), err); assert_eq!(dfa.subset(&nfa).unwrap_err(), err); - assert_eq!(nfa.get_cardinality().unwrap_err(), err); + assert_eq!(nfa.cardinality().unwrap_err(), err); // ...but operations that work on NFAs directly are unaffected // (difference only determinizes the subtrahend)... @@ -441,7 +441,7 @@ mod tests { // ...deterministic inputs keep working... assert!(dfa.clone().minimize().is_ok()); assert!(dfa.clone().complement().is_ok()); - assert!(dfa.get_cardinality().is_ok()); + assert!(dfa.cardinality().is_ok()); assert!(dfa.equivalent(&dfa).is_ok()); // ...and explicit determinization is always allowed. @@ -470,7 +470,7 @@ mod tests { assert!(term.subset(&other).is_ok()); assert!(other.subset(&term).is_ok()); assert!(term.is_total().is_ok()); - assert!(term.get_cardinality().is_ok()); + assert!(term.cardinality().is_ok()); assert!(term.minimize().is_ok()); assert!(term.generate_strings(5, 0).is_ok()); @@ -481,7 +481,7 @@ mod tests { assert!(term.repeat(0..=2).is_ok()); assert!(term.is_empty().is_ok()); assert!(term.is_empty_string().is_ok()); - let _ = term.get_length(); + let _ = term.length(); let _ = term.to_regex(); let _ = term.to_pattern(); assert!(term.to_automaton().is_ok()); @@ -502,7 +502,7 @@ mod tests { // Without the profile knob the historical behavior is unchanged. 
assert!(nfa.clone().minimize().is_ok()); assert!(nfa.clone().complement().is_ok()); - assert!(nfa.get_cardinality().is_ok()); + assert!(nfa.cardinality().is_ok()); assert!(nfa.equivalent(&nfa.clone()).is_ok()); } @@ -535,7 +535,7 @@ mod tests { let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); - let execution_timeout_in_ms = 10; + let execution_timeout_in_ms = 0; let start_time = Instant::now(); ExecutionProfileBuilder::new() .execution_timeout(execution_timeout_in_ms) @@ -549,7 +549,7 @@ mod tests { let run_duration = Instant::now().duration_since(start_time).as_millis(); println!("{run_duration}"); - assert!(run_duration <= (execution_timeout_in_ms + 25) as u128); + assert!(run_duration <= (execution_timeout_in_ms + 1000) as u128); }); Ok(()) diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 5ccf1c5..ebc1b9d 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -11,10 +11,11 @@ impl FastAutomaton { /// with [`EngineError::DeterministicAutomatonRequired`] when the profile /// disables implicit determinization). /// - /// As in [`get_length`](Self::get_length), only cycles **on accepting + /// As in [`length`](Self::length), only cycles **on accepting /// paths** make the count infinite: cycles among dead or unreachable /// states don't add a single matched string. 
- pub fn get_cardinality(&self) -> Result, EngineError> { + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] + pub fn cardinality(&self) -> Result, EngineError> { if self.is_empty() { return Ok(Cardinality::Integer(0)); } else if self.is_total() { @@ -24,7 +25,7 @@ impl FastAutomaton { // Only states on an accepting path (reachable from the start AND // able to reach an accept) contribute strings; everything else is // excluded from both the cycle check and the count. - let live = self.get_live_states(); + let live = self.live_states(); let relevant: IntSet = self .forward_reachable_states() .intersection(&live) @@ -45,7 +46,7 @@ impl FastAutomaton { // yields one whose relevant subgraph is acyclic too, so the // recursion takes the deterministic path on the second call. if !self.is_deterministic() { - return self.determinize_implicit()?.get_cardinality(); + return self.determinize_implicit()?.cardinality(); } let len = self.transitions.len(); @@ -62,7 +63,7 @@ impl FastAutomaton { } if let Some(distance) = current_distance.checked_mul( condition - .get_cardinality(&self.spanning_set) + .cardinality(&self.spanning_set) .expect("It should be possible to get the cardinality of a condition."), ) && let Some(new_distance) = distances.get(to_state).unwrap_or(&0).checked_add(distance) @@ -154,17 +155,17 @@ mod tests { let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.accept(0); a.add_transition(0, s1, &cond); a.add_transition(s1, s2, &cond); a.add_transition(s2, s1, &cond); // s1, s2 can't reach an accept → language is {""} only. 
- assert_eq!(a.get_cardinality().unwrap(), Cardinality::Integer(1)); + assert_eq!(a.cardinality().unwrap(), Cardinality::Integer(1)); } - // Regression: `get_cardinality` used to `assert!` determinism and panic + // Regression: `cardinality` used to `assert!` determinism and panic // on acyclic NFAs (the only nondeterministic inputs that reach the finite // count; cyclic ones return Infinite earlier). It now determinizes // internally. @@ -173,7 +174,7 @@ mod tests { let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); // Two overlapping transitions from the start: nondeterministic, but // both lead to accepting states after exactly one character. a.add_transition(0, s1, &cond); @@ -182,8 +183,8 @@ mod tests { a.accept(s2); assert!(!a.is_deterministic()); - let cardinality = a.get_cardinality().unwrap(); - let expected = a.determinize().unwrap().get_cardinality().unwrap(); + let cardinality = a.cardinality().unwrap(); + let expected = a.determinize().unwrap().cardinality().unwrap(); assert_eq!(cardinality, expected); assert!(matches!(cardinality, Cardinality::Integer(n) if n > 0)); } diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index 3d432ca..e005d7e 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -8,6 +8,7 @@ impl FastAutomaton { /// Non-deterministic operands are determinized internally, unless the /// execution profile disables implicit determinization, in which case /// [`EngineError::DeterministicAutomatonRequired`] is returned. 
+ #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 6023de6..87b471d 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -13,10 +13,10 @@ impl FastAutomaton { /// which is unbounded exactly when that subgraph has a cycle (any such /// cycle can be pumped). #[must_use] - pub fn get_length(&self) -> (Option, Option) { + pub fn length(&self) -> (Option, Option) { // States that can reach an accept state. If the start state can't, // the language is empty. - let live = self.get_live_states(); + let live = self.live_states(); if !live.contains(&self.start_state) { return (None, None); } @@ -28,7 +28,7 @@ impl FastAutomaton { // for the maximum. let mut min = None; let mut visited = IntSet::default(); - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); + let mut worklist = VecDeque::with_capacity(self.number_of_states()); visited.insert(self.start_state); worklist.push_back((self.start_state, 0u32)); while let Some((state, length)) = worklist.pop_front() { @@ -108,24 +108,24 @@ mod tests { use crate::fast_automaton::FastAutomaton; use crate::fast_automaton::condition::Condition; - // Regression: `get_length` used to set `max = None` on any cycle + // Regression: `length` used to set `max = None` on any cycle // reachable from start, even dead cycles among non-accepting states that // cannot reach an accept. Such cycles don't extend the language; the // max must remain finite. Now fixed by filtering branches to the live // (co-reachable-from-accept) subgraph. 
#[test] - fn get_length_handles_dead_cycle() { + fn length_handles_dead_cycle() { let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.accept(0); a.add_transition(0, s1, &cond); a.add_transition(s1, s2, &cond); a.add_transition(s2, s1, &cond); // s1, s2 not accepting → language is {""} only. - let (min, max) = a.get_length(); + let (min, max) = a.length(); assert_eq!(min, Some(0), "min length of {{\"\"}} is 0"); assert_eq!( max, @@ -135,38 +135,38 @@ mod tests { } #[test] - fn get_length_finite_and_infinite() { + fn length_finite_and_infinite() { // Chain 0 -> 1 -> 2, accepts {0, 2}: min 0, max 2. let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.add_transition(0, s1, &cond); a.add_transition(s1, s2, &cond); a.accept(0); a.accept(s2); - assert_eq!(a.get_length(), (Some(0), Some(2))); + assert_eq!(a.length(), (Some(0), Some(2))); // Live cycle 0 <-> 1, accept {1}: min 1, max unbounded. let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.add_transition(0, s1, &cond); a.add_transition(s1, 0, &cond); a.accept(s1); - assert_eq!(a.get_length(), (Some(1), None)); + assert_eq!(a.length(), (Some(1), None)); } - // Regression: `get_length` used to enumerate paths with a cloned `seen` + // Regression: `length` used to enumerate paths with a cloned `seen` // set per branch (exponential time and memory on branching DAGs). A chain // of diamonds has 2^k paths; the linear algorithm must handle it // instantly. 
#[test] - fn get_length_linear_on_branching_dag() { + fn length_linear_on_branching_dag() { const DIAMONDS: usize = 24; let mut a = FastAutomaton::new_empty(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); let mut current = 0; for _ in 0..DIAMONDS { let upper = a.new_state(); @@ -181,6 +181,6 @@ mod tests { a.accept(current); let expected = 2 * DIAMONDS as u32; - assert_eq!(a.get_length(), (Some(expected), Some(expected))); + assert_eq!(a.length(), (Some(expected), Some(expected))); } } diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index d70c724..feeb9aa 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -130,7 +130,7 @@ impl FastAutomaton { /// Returns the states reachable **from the start state** by following /// non-empty transitions (the start state is always included). /// - /// This is forward reachability. Contrast with [`Self::get_live_states`], + /// This is forward reachability. Contrast with [`Self::live_states`], /// which returns the states that can **reach an accept state** /// (co-reachability). pub(crate) fn forward_reachable_states(&self) -> IntSet { @@ -157,7 +157,7 @@ impl FastAutomaton { /// /// This is co-reachability; note it is *not* the set of states reachable /// from the start state. - pub fn get_live_states(&self) -> IntSet { + pub fn live_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); for from_state in self.states() { @@ -202,9 +202,9 @@ impl FastAutomaton { /// drop transitions whose condition lies in the "rest" range. (For a /// spanning set with an empty rest this is exactly the spanning ranges, so /// well-formed automata are unaffected.) 
- pub fn get_spanning_bases(&self) -> Result, EngineError> { + pub fn spanning_bases(&self) -> Result, EngineError> { self.spanning_set - .get_spanning_ranges_with_rest() + .spanning_ranges_with_rest() .iter() .map(|range| Condition::from_range(range, &self.spanning_set)) .collect() diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index e64baa4..da9e4d0 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -8,6 +8,7 @@ impl FastAutomaton { /// A non-deterministic `other` is determinized internally, unless the /// execution profile disables implicit determinization, in which case /// [`EngineError::DeterministicAutomatonRequired`] is returned. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index d603e98..840e41e 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -415,7 +415,7 @@ impl FastAutomaton { /// Recompute a minimal spanning set for the automaton and apply it. 
pub fn recompute_minimal_spanning_set(&mut self) -> Result<(), EngineError> { - let mut ranges = Vec::with_capacity(self.get_number_of_states()); + let mut ranges = Vec::with_capacity(self.number_of_states()); for state in self.states() { for (condition, _) in self.transitions_from(state) { @@ -501,14 +501,14 @@ mod tests { assert!(a.is_accepted(s1)); assert!(a.has_transition(0, s1)); - assert!(a.get_condition(0, s1).is_some()); + assert!(a.condition(0, s1).is_some()); assert_eq!(a.in_degree(s1), 1); assert_eq!(a.out_degree(0), 1); assert!(a.is_match("a")); // try_add_transition: refuses determinism-breaking additions and // leaves the automaton untouched on Err. - let condition_a = Condition::from_range(&rng('a', 'a'), a.get_spanning_set()).unwrap(); + let condition_a = Condition::from_range(&rng('a', 'a'), a.spanning_set()).unwrap(); assert!(a.is_deterministic()); assert!(a.try_add_transition(0, s2, &condition_a).is_err()); assert!(a.is_deterministic()); @@ -529,7 +529,7 @@ mod tests { // remove_transition removes the edge and updates queries. a.remove_transition(0, s1); assert!(!a.has_transition(0, s1)); - assert!(a.get_condition(0, s1).is_none()); + assert!(a.condition(0, s1).is_none()); assert_eq!(a.in_degree(s1), 0); assert!(!a.is_match("a")); } @@ -555,11 +555,11 @@ mod tests { assert!(!automaton.is_match("x")); // An exactly-covered range takes the fast path: same spanning set. - let before = automaton.get_spanning_set().clone(); + let before = automaton.spanning_set().clone(); automaton .add_transition_from_range(0, s1, &rng('x', 'z')) .unwrap(); - assert_eq!(&before, automaton.get_spanning_set()); + assert_eq!(&before, automaton.spanning_set()); assert!(automaton.is_match("zx")); // An empty range adds nothing. 
@@ -604,7 +604,7 @@ mod tests { let mut a = FastAutomaton::new_empty(); let s1 = a.new_state(); let s2 = a.new_state(); - let cond = Condition::total(a.get_spanning_set()); + let cond = Condition::total(a.spanning_set()); a.add_transition(0, s1, &cond); a.add_transition(0, s2, &cond); a.accept(s1); diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index dd99b7c..f371a7b 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -28,7 +28,7 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { let mut to_base_map = IntMap::with_capacity(to_spanning_set.spanning_ranges_with_rest_len()); for (i, base) in to_spanning_set - .get_spanning_ranges_with_rest() + .spanning_ranges_with_rest() .into_iter() .enumerate() { @@ -36,8 +36,8 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { } let mut equivalence_map: Vec> = - Vec::with_capacity(from_spanning_set.get_number_of_spanning_ranges() + 1); - for from_base in from_spanning_set.get_spanning_ranges_with_rest().iter() { + Vec::with_capacity(from_spanning_set.number_of_spanning_ranges() + 1); + for from_base in from_spanning_set.spanning_ranges_with_rest().iter() { let mut index = Vec::with_capacity(1); for (i, to_base) in &to_base_map { if from_base == to_base || from_base.has_intersection(to_base) { @@ -92,12 +92,12 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { } /// Returns `from_spanning_set`. - pub fn get_from_spanning_set(&self) -> &'a SpanningSet { + pub fn from_spanning_set(&self) -> &'a SpanningSet { self.from_spanning_set } /// Returns `to_spanning_set`. 
- pub fn get_to_spanning_set(&self) -> &'b SpanningSet { + pub fn to_spanning_set(&self) -> &'b SpanningSet { self.to_spanning_set } } @@ -109,7 +109,7 @@ mod tests { use super::*; - fn get_from_spanning_set() -> SpanningSet { + fn from_spanning_set() -> SpanningSet { let ranges = vec![ CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')), CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), @@ -119,7 +119,7 @@ mod tests { SpanningSet::compute_spanning_set(&ranges) } - fn get_to_spanning_set() -> SpanningSet { + fn to_spanning_set() -> SpanningSet { let ranges = vec![ CharRange::new_from_range(Char::new('\0')..=Char::new('\u{1}')), CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), @@ -133,8 +133,8 @@ mod tests { #[test] fn test_convert() -> Result<(), String> { - let from_spanning_set = get_from_spanning_set(); - let to_spanning_set = get_to_spanning_set(); + let from_spanning_set = from_spanning_set(); + let to_spanning_set = to_spanning_set(); let converter = ConditionConverter::new(&from_spanning_set, &to_spanning_set).unwrap(); diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index 78116d4..631c7d8 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -141,7 +141,7 @@ impl FastBitVec { (!0) >> ((64 - bits % 64) % 64) } - pub fn get_bits(&self) -> Vec { + pub fn bits(&self) -> Vec { let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { bits.push(self.get(i)); diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index 788ae88..4ac466b 100644 --- a/src/fast_automaton/condition/mod.rs +++ b/src/fast_automaton/condition/mod.rs @@ -52,7 +52,7 @@ impl Condition { /// expressible in the current spanning set (no base is fully contained in /// `range`). 
In that case, extend the spanning set first with /// [`SpanningSet::merge`] or [`SpanningSet::compute_spanning_set`], apply - /// it with [`FastAutomaton::apply_new_spanning_set`], then retry. + /// it with [`crate::fast_automaton::FastAutomaton::apply_new_spanning_set`], then retry. pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); @@ -62,11 +62,7 @@ impl Condition { let mut cond = Self::empty(spanning_set); - for (i, base) in spanning_set - .get_spanning_ranges_with_rest() - .iter() - .enumerate() - { + for (i, base) in spanning_set.spanning_ranges_with_rest().iter().enumerate() { if range.contains_all(base) { cond.0.set(i, true); } @@ -95,11 +91,7 @@ impl Condition { let mut range = CharRange::empty(); - for (i, base) in spanning_set - .get_spanning_ranges_with_rest() - .iter() - .enumerate() - { + for (i, base) in spanning_set.spanning_ranges_with_rest().iter().enumerate() { if self.0.get(i) { range = range.union(base); } @@ -108,21 +100,21 @@ impl Condition { Ok(range) } - /// Returns the condition matching characters in `self` or `cond` (bitwise + /// Returns the condition matching characters in `self` or `other` (bitwise /// OR). Both must share the same spanning set. #[inline] - pub fn union(&self, cond: &Condition) -> Self { + pub fn union(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - new_cond.0.union(&cond.0); + new_cond.0.union(&other.0); new_cond } - /// Returns the condition matching characters in both `self` and `cond` + /// Returns the condition matching characters in both `self` and `other` /// (bitwise AND). Both must share the same spanning set. 
#[inline] - pub fn intersection(&self, cond: &Condition) -> Self { + pub fn intersection(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - new_cond.0.intersection(&cond.0); + new_cond.0.intersection(&other.0); new_cond } @@ -135,21 +127,21 @@ impl Condition { new_cond } - /// Returns the condition matching characters in `self` but not in `cond` + /// Returns the condition matching characters in `self` but not in `other` /// (bitwise AND-NOT). Both must share the same spanning set. #[inline] - pub fn difference(&self, cond: &Condition) -> Self { + pub fn difference(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - let subtrahend = cond.complement(); + let subtrahend = other.complement(); new_cond.0.intersection(&subtrahend.0); new_cond } - /// Returns `true` if `self` and `cond` share at least one character (their + /// Returns `true` if `self` and `other` share at least one character (their /// intersection is non-empty). Both must share the same spanning set. #[inline] - pub fn has_intersection(&self, cond: &Condition) -> bool { - self.0.has_intersection(&cond.0) + pub fn has_intersection(&self, other: &Condition) -> bool { + self.0.has_intersection(&other.0) } /// Returns `true` if the condition matches `character` (a Unicode scalar @@ -183,15 +175,15 @@ impl Condition { /// Returns the number of characters the condition matches, evaluated /// against `spanning_set`. #[inline] - pub fn get_cardinality(&self, spanning_set: &SpanningSet) -> Result { + pub fn cardinality(&self, spanning_set: &SpanningSet) -> Result { Ok(self.to_range(spanning_set)?.get_cardinality()) } /// Returns the condition as a vector of bits, one per range of the spanning /// set it was built against (the rest range first, when present). 
#[inline] - pub fn get_binary_representation(&self) -> Vec { - self.0.get_bits() + pub fn binary_representation(&self) -> Vec { + self.0.bits() } } @@ -202,7 +194,7 @@ mod tests { use super::*; - fn get_spanning_set() -> SpanningSet { + fn spanning_set() -> SpanningSet { let ranges = vec![ CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), @@ -234,7 +226,7 @@ mod tests { let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( Char::new('a')..=Char::new('a'), )]); - let large = get_spanning_set(); + let large = spanning_set(); let condition = Condition::total(&small); assert_eq!( @@ -256,7 +248,7 @@ mod tests { let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( Char::new('a')..=Char::new('a'), )]); - let merged = small.merge(&get_spanning_set()); + let merged = small.merge(&spanning_set()); let converter = ConditionConverter::new(&small, &merged).unwrap(); let foreign = Condition::total(&merged); @@ -268,21 +260,18 @@ mod tests { #[test] fn test_empty_total() -> Result<(), String> { - let spanning_set = get_spanning_set(); + let spanning_set = spanning_set(); let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); assert_eq!( vec![false, false, false, false], - empty.get_binary_representation() + empty.binary_representation() ); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!( - vec![true, true, true, true], - total.get_binary_representation() - ); + assert_eq!(vec![true, true, true, true], total.binary_representation()); assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); @@ -310,13 +299,13 @@ mod tests { empty, Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); - assert_eq!(vec![false], empty.get_binary_representation()); + 
assert_eq!(vec![false], empty.binary_representation()); assert_eq!( total, Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); - assert_eq!(vec![true], total.get_binary_representation()); + assert_eq!(vec![true], total.binary_representation()); assert_eq!(empty, total.complement()); assert_eq!(total, empty.complement()); @@ -326,7 +315,7 @@ mod tests { #[test] fn test_from_to_range() -> Result<(), String> { - let spanning_set = get_spanning_set(); + let spanning_set = spanning_set(); for range in get_test_cases_range() { assert_range_convertion_to_range(&range, &spanning_set); @@ -348,7 +337,7 @@ mod tests { #[test] fn test_project_to() -> Result<(), String> { - let current_spanning_set = get_spanning_set(); + let current_spanning_set = spanning_set(); let ranges = vec![ CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), @@ -405,7 +394,7 @@ mod tests { #[test] fn test_union_intersection_complement() -> Result<(), String> { - let used_characters = get_spanning_set(); + let used_characters = spanning_set(); for range_1 in get_test_cases_range() { for range_2 in get_test_cases_range() { diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index acbeab0..27723a9 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -4,6 +4,7 @@ mod state_elimination; impl FastAutomaton { /// Converts the automaton to a [`RegularExpression`]. 
+ #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states()))] pub fn to_regex(&self) -> RegularExpression { state_elimination::convert_to_regex(self) } diff --git a/src/fast_automaton/convert/to_regex/state_elimination/builder.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs index a8f4054..241faa3 100644 --- a/src/fast_automaton/convert/to_regex/state_elimination/builder.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -7,9 +7,9 @@ impl Gnfa { let mut state_elimination_automaton = Gnfa { start_state: 0, // start_state is not set yet accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(automaton.get_number_of_states()), - transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), - removed_states: IntSet::with_capacity(automaton.get_number_of_states()), + transitions: Vec::with_capacity(automaton.number_of_states()), + transitions_in: IntMap::with_capacity(automaton.number_of_states()), + removed_states: IntSet::with_capacity(automaton.number_of_states()), empty: false, }; @@ -18,7 +18,7 @@ impl Gnfa { return state_elimination_automaton; } - let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); + let mut states_map = IntMap::with_capacity(automaton.number_of_states()); for from_state in automaton.states() { let new_from_state = *states_map @@ -33,21 +33,21 @@ impl Gnfa { new_from_state, new_to_state, RegularExpression::Character( - condition.to_range(automaton.get_spanning_set()).unwrap(), + condition.to_range(automaton.spanning_set()).unwrap(), ), ); } } - if automaton.in_degree(automaton.get_start_state()) == 0 { + if automaton.in_degree(automaton.start_state()) == 0 { // If the start state does not have any incoming state we just set it state_elimination_automaton.start_state = - *states_map.get(&automaton.get_start_state()).unwrap(); + *states_map.get(&automaton.start_state()).unwrap(); } else { // If not we create a 
new state that will be the new start state state_elimination_automaton.start_state = state_elimination_automaton.new_state(); - let previous_start_state = *states_map.get(&automaton.get_start_state()).unwrap(); + let previous_start_state = *states_map.get(&automaton.start_state()).unwrap(); // We add an empty string transition to the new start state state_elimination_automaton.add_transition( state_elimination_automaton.start_state, @@ -56,16 +56,16 @@ impl Gnfa { ); } - let accept_state = *automaton.get_accept_states().iter().next().unwrap(); - if automaton.get_accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { + let accept_state = *automaton.accept_states().iter().next().unwrap(); + if automaton.accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { // If there is only one accept state we just set it state_elimination_automaton.accept_state = *states_map - .get(automaton.get_accept_states().iter().next().unwrap()) + .get(automaton.accept_states().iter().next().unwrap()) .unwrap(); } else { // If not we create a new state that will be the new accept state state_elimination_automaton.accept_state = state_elimination_automaton.new_state(); - for accept_state in automaton.get_accept_states() { + for accept_state in automaton.accept_states() { let accept_state = *states_map.get(accept_state).unwrap(); // We add an empty string transition to the new accept state state_elimination_automaton.add_transition( diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index a7e3413..581cffc 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -41,6 +41,7 @@ impl FastAutomaton { /// calls with different offsets may repeat strings (or skip some). /// [`determinize`](Self::determinize) (and ideally /// [`minimize`](Self::minimize)) first to make pages disjoint. 
+ #[tracing::instrument(level = "debug", skip(self), fields(states = self.number_of_states(), deterministic=self.is_deterministic(), limit=limit, offset=offset))] pub fn generate_strings( &self, limit: usize, @@ -50,7 +51,7 @@ impl FastAutomaton { return Ok(vec![]); } - let (_, max) = self.get_length(); + let (_, max) = self.length(); let max_len = max.unwrap_or(u32::MAX) as usize; let execution_profile = ExecutionProfile::get(); @@ -91,7 +92,7 @@ impl FastAutomaton { let mut visited = AHashSet::with_capacity(num_states); let mut q = BinaryHeap::new(); - let start_state = self.get_start_state(); + let start_state = self.start_state(); // If the start state can't reach an accept state, exit immediately if dist[start_state] != usize::MAX { @@ -439,7 +440,7 @@ mod tests { "Chunked generation did not match bulk generation" ); - let cardinality = automaton.get_cardinality().unwrap(); + let cardinality = automaton.cardinality().unwrap(); if let Cardinality::Integer(count) = cardinality { let empty_chunk = automaton.generate_strings(10, count as usize).unwrap(); diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 8fa7c9f..5599976 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -151,7 +151,7 @@ impl FastAutomaton { if !self.has_state(*from_state) { continue; } - if let Some(condition) = self.get_condition(*from_state, state) { + if let Some(condition) = self.condition(*from_state, state) { in_transitions.push((*from_state, condition.clone())); } } @@ -209,14 +209,14 @@ impl FastAutomaton { /// Returns the number of states in the automaton. #[inline] - pub fn get_number_of_states(&self) -> usize { + pub fn number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } /// Returns a reference to the condition of the directed transition between the two states, if any. /// Returns `None` if either state does not exist. 
#[inline] - pub fn get_condition(&self, from_state: State, to_state: State) -> Option<&Condition> { + pub fn condition(&self, from_state: State, to_state: State) -> Option<&Condition> { self.transitions .get(from_state) .and_then(|t| t.get(&to_state)) @@ -224,19 +224,19 @@ impl FastAutomaton { /// Returns the start state. #[inline] - pub fn get_start_state(&self) -> State { + pub fn start_state(&self) -> State { self.start_state } /// Returns a reference to the set of accept (final) states. #[inline] - pub fn get_accept_states(&self) -> &IntSet { + pub fn accept_states(&self) -> &IntSet { &self.accept_states } /// Returns a reference to the automaton's spanning set. #[inline] - pub fn get_spanning_set(&self) -> &SpanningSet { + pub fn spanning_set(&self) -> &SpanningSet { &self.spanning_set } @@ -271,6 +271,7 @@ impl FastAutomaton { } /// Returns `true` if the automaton matches the given string. + #[tracing::instrument(level = "debug", skip(self, string), fields(states = self.number_of_states(), string_len=string.len()))] pub fn is_match(&self, string: &str) -> bool { let mut current: IntSet = IntSet::default(); current.insert(self.start_state); @@ -300,7 +301,7 @@ impl FastAutomaton { /// Returns the automaton's DOT representation. 
#[inline] - pub fn as_dot(&self) -> String { + pub fn to_dot(&self) -> String { format!("{self}") } @@ -353,10 +354,10 @@ mod tests { } #[test] - fn get_condition_safe_on_unknown_state() { + fn condition_safe_on_unknown_state() { let a = FastAutomaton::new_total(); - assert!(a.get_condition(999, 0).is_none()); - assert!(a.get_condition(0, 999).is_none()); + assert!(a.condition(999, 0).is_none()); + assert!(a.condition(0, 999).is_none()); } #[test] diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs index 9af1607..34ee4ae 100644 --- a/src/fast_automaton/operation/concat.rs +++ b/src/fast_automaton/operation/concat.rs @@ -13,6 +13,7 @@ impl FastAutomaton { } /// Computes the concatenation of all automata in the given iterator. + #[tracing::instrument(level = "debug", skip_all)] pub fn concat_all<'a, I: IntoIterator>( automata: I, ) -> Result { @@ -61,7 +62,7 @@ impl FastAutomaton { let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), + other.number_of_states(), BuildHasherDefault::default(), ); @@ -154,13 +155,13 @@ impl FastAutomaton { if other.is_empty() { return 1; } else if other.is_empty_string() { - return self.get_number_of_states(); + return self.number_of_states(); } if self.is_empty() { return 1; } else if self.is_empty_string() { - return other.get_number_of_states(); + return other.number_of_states(); } // Determine if we are forced to create a new state to avoid unintended loops @@ -171,8 +172,8 @@ impl FastAutomaton { .cloned() .any(|s| self.out_degree(s) > 0); - let v1 = self.get_number_of_states(); - let v2 = other.get_number_of_states(); + let v1 = self.number_of_states(); + let v2 = other.number_of_states(); // Apply the heuristic if start_state_and_accept_states_not_mergeable { @@ -492,7 +493,7 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert_eq!(1, 
automaton.get_number_of_states()); + assert_eq!(1, automaton.number_of_states()); Ok(()) } @@ -503,7 +504,7 @@ mod tests { .to_automaton() .unwrap(); automaton.print_dot(); - assert_eq!(3, automaton.get_number_of_states()); + assert_eq!(3, automaton.number_of_states()); Ok(()) } @@ -560,7 +561,7 @@ mod tests { // Execute the actual mutation actual_concat.concat_mut(a2).unwrap(); - let actual_states = actual_concat.get_number_of_states(); + let actual_states = actual_concat.number_of_states(); let heuristic_states = a1.concat_state_count_heuristic(a2); assert_eq!( diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 1099d45..89f7fa7 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -18,17 +18,18 @@ impl FastAutomaton { } /// Determinizes the automaton and returns the result. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { return Ok(Cow::Borrowed(self)); } let execution_profile = ExecutionProfile::get(); - let bases = self.get_spanning_bases()?; + let bases = self.spanning_bases()?; - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); + let mut worklist = VecDeque::with_capacity(self.number_of_states()); - let map_capacity = (self.get_number_of_states() as f64 / 0.75).ceil() as usize; + let map_capacity = (self.number_of_states() as f64 / 0.75).ceil() as usize; let mut new_states = AHashMap::with_capacity(map_capacity); let mut accept_states = BitSet::new(); @@ -98,7 +99,7 @@ mod tests { use crate::regex::RegularExpression; use regex_charclass::char::Char; - // Regression: subset construction iterates `get_spanning_bases`, which used + // Regression: subset construction iterates `spanning_bases`, which used // to omit the spanning set's "rest" range. 
A transition whose condition // lies in the rest range was therefore silently dropped, so determinizing a // non-deterministic automaton that uses the rest range produced a DFA with @@ -110,7 +111,7 @@ mod tests { CharRange::new_from_range(c..=c) }; let ss = SpanningSet::compute_spanning_set(&[rng('a'), rng('b')]); - let rest = ss.get_rest().clone(); + let rest = ss.rest().clone(); let mut a = FastAutomaton::new_empty(); a.apply_new_spanning_set(&ss).unwrap(); @@ -153,11 +154,11 @@ mod tests { .unwrap() .to_automaton() .unwrap(); - println!("States Before: {}", automaton.get_number_of_states()); + println!("States Before: {}", automaton.number_of_states()); let deterministic_automaton = automaton.determinize().unwrap(); println!( "States After: {}", - deterministic_automaton.get_number_of_states() + deterministic_automaton.number_of_states() ); assert!(deterministic_automaton.is_deterministic()); //deterministic_automaton.print_dot(); diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs index ee865f2..b5410c6 100644 --- a/src/fast_automaton/operation/difference.rs +++ b/src/fast_automaton/operation/difference.rs @@ -13,16 +13,16 @@ impl FastAutomaton { let crash_state = self.new_state(); let mut transitions_to_crash_state: IntMap = IntMap::with_capacity_and_hasher( - self.get_number_of_states(), + self.number_of_states(), BuildHasherDefault::default(), ); - let mut ranges = Vec::with_capacity(self.get_number_of_states()); + let mut ranges = Vec::with_capacity(self.number_of_states()); for from_state in self.states() { let mut new_condition = Condition::empty(&self.spanning_set); for (condition, _) in self.transitions_from(from_state) { new_condition = new_condition.union(condition); - ranges.push(condition.to_range(self.get_spanning_set())?); + ranges.push(condition.to_range(self.spanning_set())?); } new_condition = new_condition.complement(); @@ -32,7 +32,7 @@ impl FastAutomaton { for (from_state, condition) in 
&transitions_to_crash_state { self.add_transition(*from_state, crash_state, condition); - ranges.push(condition.to_range(self.get_spanning_set())?); + ranges.push(condition.to_range(self.spanning_set())?); } let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); @@ -51,6 +51,7 @@ impl FastAutomaton { /// unless the execution profile disables implicit determinization, in /// which case [`EngineError::DeterministicAutomatonRequired`] is /// returned. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] pub fn complement(&mut self) -> Result<(), EngineError> { if !self.deterministic { *self = self.determinize_implicit()?.into_owned(); @@ -74,6 +75,7 @@ impl FastAutomaton { /// If `other` is non-deterministic, it is determinized first, unless /// the execution profile disables implicit determinization, in which /// case [`EngineError::DeterministicAutomatonRequired`] is returned. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] pub fn difference(&self, other: &FastAutomaton) -> Result { let mut complement = other.determinize_implicit()?.into_owned(); complement.complement()?; diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 39ec050..c89e623 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -16,6 +16,7 @@ impl FastAutomaton { } /// Computes the intersection of all automata in the given iterator. + #[tracing::instrument(level = "debug", skip_all)] pub fn intersection_all<'a, I: IntoIterator>( automata: I, ) -> Result { @@ -36,6 +37,7 @@ impl FastAutomaton { /// /// Only available with the `parallel` feature (enabled by default). 
#[cfg(feature = "parallel")] + #[tracing::instrument(level = "debug", skip_all)] pub fn intersection_all_par<'a, I: IntoParallelIterator>( automata: I, ) -> Result { @@ -81,9 +83,9 @@ impl FastAutomaton { let mut new_automaton = FastAutomaton::new_empty(); let mut worklist = - VecDeque::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + VecDeque::with_capacity(self.number_of_states() + other.number_of_states()); let mut new_states: AHashMap<(usize, usize), (usize, usize, usize), _> = - AHashMap::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + AHashMap::with_capacity(self.number_of_states() + other.number_of_states()); let initial_pair = ( new_automaton.start_state, @@ -132,6 +134,7 @@ impl FastAutomaton { } /// Returns `true` if the two automata have a non-empty intersection. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), other_states = other.number_of_states()))] pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); @@ -149,9 +152,9 @@ impl FastAutomaton { let mut new_automaton = FastAutomaton::new_empty(); let mut worklist = - VecDeque::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + VecDeque::with_capacity(self.number_of_states() + other.number_of_states()); let mut new_states: AHashMap<(usize, usize), (usize, usize, usize), _> = - AHashMap::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + AHashMap::with_capacity(self.number_of_states() + other.number_of_states()); let initial_pair = ( new_automaton.start_state, diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs index 75f54bf..13cbd63 100644 --- a/src/fast_automaton/operation/minimize.rs +++ b/src/fast_automaton/operation/minimize.rs @@ -9,6 +9,7 @@ impl FastAutomaton { /// unless the [`ExecutionProfile`] disables implicit 
determinization, in /// which case [`EngineError::DeterministicAutomatonRequired`] is /// returned. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic(), minimal = self.is_minimal()))] pub fn minimize(&mut self) -> Result<(), EngineError> { // The `minimal` flag is conservatively cleared on every mutation, so // it can be trusted here; this also keeps the @@ -35,7 +36,7 @@ impl FastAutomaton { let max_states = self.transitions.len(); let all_states: IntSet = self.states().collect(); - let accept_states: IntSet = self.get_accept_states().iter().cloned().collect(); + let accept_states: IntSet = self.accept_states().iter().cloned().collect(); let non_accept_states: IntSet = all_states.difference(&accept_states).cloned().collect(); @@ -52,7 +53,7 @@ impl FastAutomaton { let mut worklist: Vec = (0..partitions.len()).collect(); let mut in_worklist: Vec = vec![true; max_states]; - let bases = self.get_spanning_bases()?; + let bases = self.spanning_bases()?; let mut inverse_transitions: Vec> = vec![Vec::new(); max_states]; for to_state in self.states() { @@ -61,7 +62,7 @@ impl FastAutomaton { } } - let mut x = IntSet::with_capacity(self.get_number_of_states()); + let mut x = IntSet::with_capacity(self.number_of_states()); let mut intersection_states: Vec> = vec![Vec::new(); max_states]; let mut touched_partitions: Vec = Vec::with_capacity(max_states); @@ -158,8 +159,8 @@ impl FastAutomaton { let mut representatives = Vec::with_capacity(partitions.len()); for partition in partitions { - let representative = if partition.contains(&self.get_start_state()) { - self.get_start_state() + let representative = if partition.contains(&self.start_state()) { + self.start_state() } else { *partition .iter() diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 1b59ce0..9b558c6 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ 
-16,7 +16,7 @@ impl FastAutomaton { /// automaton collapses to the canonical empty automaton. pub fn remove_dead_states(&mut self) { if !self.is_empty() { - let live_states = self.get_live_states(); + let live_states = self.live_states(); let mut dead_states = IntSet::default(); for from_state in self.states() { @@ -46,8 +46,8 @@ mod tests { .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_live_states().len()); + assert_eq!(3, intersection.number_of_states()); + assert_eq!(3, intersection.live_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs index 9f65f37..7c2198d 100644 --- a/src/fast_automaton/operation/repeat.rs +++ b/src/fast_automaton/operation/repeat.rs @@ -2,6 +2,7 @@ use super::*; impl FastAutomaton { /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. 
+ #[tracing::instrument(level = "debug", skip(self), fields(states = self.number_of_states(), deterministic = self.is_deterministic(), min = min, max_opt = tracing::field::debug(max_opt)))] pub fn repeat(&self, min: u32, max_opt: Option) -> Result { let mut automaton = self.clone(); if let Err(error) = automaton.repeat_mut(min, max_opt) { @@ -123,7 +124,7 @@ impl FastAutomaton { } } - star.accept(star.get_start_state()); + star.accept(star.start_state()); } self.apply_model(&star); @@ -182,7 +183,7 @@ impl FastAutomaton { return 1; } - let v_original = self.get_number_of_states(); + let v_original = self.number_of_states(); if v_original == 0 { return 0; } @@ -245,7 +246,7 @@ impl FastAutomaton { let acc_out_gt_0 = self.accept_states.iter().any(|&s| self.out_degree(s) > 0); match self.repeat(0, None) { Ok(star) => { - let star_states = star.get_number_of_states(); + let star_states = star.number_of_states(); let not_mergeable = star.in_degree(star.start_state) > 0 && acc_out_gt_0; let final_concat_cost = if not_mergeable { star_states @@ -484,7 +485,7 @@ mod tests { // Execute the actual mutation (assuming repeat_mut is the core method) actual_automaton.repeat_mut(min, max_opt).unwrap(); - let actual_states = actual_automaton.get_number_of_states(); + let actual_states = actual_automaton.number_of_states(); let heuristic_states = automaton.repeat_state_count_heuristic(min, max_opt); assert_eq!( diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs index 1050aa8..9eb515b 100644 --- a/src/fast_automaton/operation/union.rs +++ b/src/fast_automaton/operation/union.rs @@ -15,6 +15,7 @@ impl FastAutomaton { } /// Computes the union of all automata in the given iterator. + #[tracing::instrument(level = "debug", skip_all)] pub fn union_all<'a, I: IntoIterator>( automata: I, ) -> Result { @@ -29,6 +30,7 @@ impl FastAutomaton { /// /// Only available with the `parallel` feature (enabled by default). 
#[cfg(feature = "parallel")] + #[tracing::instrument(level = "debug", skip_all)] pub fn union_all_par<'a, I: IntoParallelIterator>( automata: I, ) -> Result { @@ -203,7 +205,7 @@ impl FastAutomaton { let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), + other.number_of_states(), BuildHasherDefault::default(), ); @@ -241,13 +243,13 @@ impl FastAutomaton { fn union_state_count_heuristic(&self, other: &FastAutomaton) -> usize { // Edge cases if other.is_empty() || self.is_total() { - return self.get_number_of_states(); + return self.number_of_states(); } else if other.is_total() || self.is_empty() { - return other.get_number_of_states(); + return other.number_of_states(); } - let v1 = self.get_number_of_states(); - let v2 = other.get_number_of_states(); + let v1 = self.number_of_states(); + let v2 = other.number_of_states(); let self_in = self.in_degree(self.start_state); let other_in = other.in_degree(other.start_state); @@ -622,7 +624,7 @@ mod tests { let mut actual_union = a1.clone(); actual_union.union_mut(a2).unwrap(); - let actual_states = actual_union.get_number_of_states(); + let actual_states = actual_union.number_of_states(); let heuristic_states = a1.union_state_count_heuristic(a2); assert_eq!( diff --git a/src/fast_automaton/spanning_set/mod.rs b/src/fast_automaton/spanning_set/mod.rs index dc4ed61..dda6178 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -49,7 +49,7 @@ impl SpanningSet { } } - pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec { + pub(crate) fn spanning_ranges_with_rest(&self) -> Vec { if self.1.is_empty() { self.0.clone() } else { @@ -60,22 +60,22 @@ impl SpanningSet { } /// Returns an iterator over the explicit (non-rest) ranges in the spanning set. 
- pub fn get_spanning_ranges(&self) -> Iter<'_, CharRange> { + pub fn spanning_ranges(&self) -> Iter<'_, CharRange> { self.0.iter() } /// Returns the number of explicit (non-rest) ranges in the spanning set. - pub fn get_number_of_spanning_ranges(&self) -> usize { + pub fn number_of_spanning_ranges(&self) -> usize { self.0.len() } /// Returns the explicit range at index `i`, or `None` if out of bounds. - pub fn get_spanning_range(&self, i: usize) -> Option<&CharRange> { + pub fn spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } /// Returns the "rest" range covering all characters not in any explicit range. - pub fn get_rest(&self) -> &CharRange { + pub fn rest(&self) -> &CharRange { &self.1 } diff --git a/src/lib.rs b/src/lib.rs index a406dae..b75408e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,7 +64,7 @@ pub type CharRange = RangeSet; /// assert_eq!(rep.to_pattern(), "(abc){2,4}"); /// /// // Analyze -/// assert_eq!(rep.get_length(), (Some(6), Some(12))); +/// assert_eq!(rep.length(), (Some(6), Some(12))); /// assert!(!rep.is_empty()?); /// /// // Generate examples @@ -85,6 +85,15 @@ pub type CharRange = RangeSet; /// /// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. /// +/// # Tracing +/// +/// The core operations on [`Term`], [`FastAutomaton`], and [`RegularExpression`] +/// are instrumented with [`tracing`](https://docs.rs/tracing) spans (mostly at +/// `debug` level). Install a [`tracing-subscriber`](https://docs.rs/tracing-subscriber) +/// (or any other `tracing` subscriber) in your application to observe them; if +/// no subscriber is installed, instrumentation has negligible overhead and +/// produces no output. 
+/// /// # Equality /// /// `PartialEq`/`Eq` (`==`) compare the **underlying representation**, not the @@ -212,6 +221,7 @@ impl Term { /// /// assert_eq!("abcd.+", concat.to_pattern()); /// ``` + #[tracing::instrument(level = "debug", skip_all)] pub fn concat( &self, terms: impl IntoIterator>, @@ -267,6 +277,7 @@ impl Term { /// /// assert_eq!("(abc|de|fghi)", union.to_pattern()); /// ``` + #[tracing::instrument(level = "debug", skip_all)] pub fn union( &self, terms: impl IntoIterator>, @@ -302,9 +313,7 @@ impl Term { Ok(Term::Automaton(return_automaton)) } else { - let regexes_list = self - .get_regexes(&terms) - .expect("No automaton should be here so this operation is not supposed to fail."); + let regexes_list = self.get_regexes(&terms); let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); @@ -329,6 +338,7 @@ impl Term { /// /// assert_eq!("deabc", intersection.to_pattern()); /// ``` + #[tracing::instrument(level = "debug", skip_all)] pub fn intersection( &self, terms: impl IntoIterator>, @@ -368,6 +378,7 @@ impl Term { /// /// assert_eq!("abc", difference.to_pattern()); /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] pub fn difference(&self, other: &Term) -> Result { Self::run_with_implicit_determinization(|| { let minuend_automaton = self.to_automaton()?; @@ -393,6 +404,7 @@ impl Term { /// assert!(term.intersection(&[complement.clone()]).unwrap().is_empty().unwrap()); /// assert!(term.union(&[complement]).unwrap().is_total().unwrap()); /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] pub fn complement(&self) -> Result { Self::run_with_implicit_determinization(|| { // `FastAutomaton::complement` determinizes `self` itself. 
@@ -421,6 +433,7 @@ impl Term { /// assert_eq!("(abc){3,5}", term.repeat(3..6).unwrap().to_pattern()); /// assert_eq!("(abc){0,2}", term.repeat(..=2).unwrap().to_pattern()); /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), min = tracing::field::Empty, max = tracing::field::Empty))] pub fn repeat(&self, range: impl RangeBounds) -> Result { let min = match range.start_bound() { Bound::Included(&n) => n, @@ -432,6 +445,9 @@ impl Term { Bound::Excluded(&n) => Some(n.saturating_sub(1)), Bound::Unbounded => None, }; + let span = tracing::Span::current(); + span.record("min", min); + span.record("max", tracing::field::debug(max_opt)); match self { Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( regular_expression.repeat(min, max_opt), @@ -470,6 +486,7 @@ impl Term { /// let batch = term.generate_strings(2, 2).unwrap(); /// assert_eq!(2, batch.len()); // ["abcde", "abcabc"] /// ``` + #[tracing::instrument(level = "debug", skip(self), fields(self_deterministic = self.is_deterministic(), limit = limit, offset = offset))] pub fn generate_strings( &self, limit: usize, @@ -542,6 +559,7 @@ impl Term { /// assert!(dfa.is_deterministic()); /// assert!(term.equivalent(&dfa).unwrap()); /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] pub fn determinize(&self) -> Result { let automaton = self.to_automaton()?; let determinized = automaton.determinize()?.into_owned(); @@ -561,6 +579,7 @@ impl Term { /// assert!(minimal.is_minimal()); /// assert!(term.equivalent(&minimal).unwrap()); /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), self_minimal = self.is_minimal()))] pub fn minimize(&self) -> Result { Self::run_with_implicit_determinization(|| { let mut automaton = self.to_automaton()?.into_owned(); @@ -581,14 +600,15 @@ impl Term { /// /// 
assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn equivalent(&self, term: &Term) -> Result { - if self == term { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] + pub fn equivalent(&self, other: &Term) -> Result { + if self == other { return Ok(true); } Self::run_with_implicit_determinization(|| { let automaton_1 = self.to_automaton()?; - let automaton_2 = term.to_automaton()?; + let automaton_2 = other.to_automaton()?; automaton_1.equivalent(&automaton_2) }) } @@ -605,14 +625,15 @@ impl Term { /// /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn subset(&self, term: &Term) -> Result { - if self == term { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] + pub fn subset(&self, other: &Term) -> Result { + if self == other { return Ok(true); } Self::run_with_implicit_determinization(|| { let automaton_1 = self.to_automaton()?; - let automaton_2 = term.to_automaton()?; + let automaton_2 = other.to_automaton()?; automaton_1.subset(&automaton_2) }) } @@ -632,6 +653,7 @@ impl Term { /// assert!(term.matches("abcdef").unwrap()); /// assert!(!term.matches("xyzabc").unwrap()); /// ``` + #[tracing::instrument(level = "debug", skip(self, input), fields(self_deterministic = self.is_deterministic(), input_len = input.len()))] pub fn matches(&self, input: &str) -> Result { Ok(self.to_automaton()?.is_match(input)) } @@ -726,10 +748,10 @@ impl Term { /// matched). `None` for the maximum means the language is infinite /// (unbounded match length). 
#[must_use] - pub fn get_length(&self) -> (Option, Option) { + pub fn length(&self) -> (Option, Option) { match self { - Term::RegularExpression(regex) => regex.get_length(), - Term::Automaton(automaton) => automaton.get_length(), + Term::RegularExpression(regex) => regex.length(), + Term::Automaton(automaton) => automaton.length(), } } @@ -738,11 +760,12 @@ impl Term { /// The exact count is represented as `u32`. If the exact count exceeds /// `u32::MAX`, the result is `Cardinality::BigInteger` rather than a /// truncated value. Infinite languages return `Cardinality::Infinite`. - pub fn get_cardinality(&self) -> Result, EngineError> { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn cardinality(&self) -> Result, EngineError> { match self { - Term::RegularExpression(regex) => Ok(regex.get_cardinality()), + Term::RegularExpression(regex) => Ok(regex.cardinality()), Term::Automaton(automaton) => { - Self::run_with_implicit_determinization(|| automaton.get_cardinality()) + Self::run_with_implicit_determinization(|| automaton.cardinality()) } } } @@ -750,7 +773,7 @@ impl Term { /// Returns `true` if the term matches a finite number of strings. /// /// A finite language is one with no unbounded repetition (`*`, `+`, ...). - /// Convenience over [`get_cardinality`](Self::get_cardinality) when only the + /// Convenience over [`cardinality`](Self::cardinality) when only the /// finite/infinite distinction matters. /// /// # Examples @@ -762,7 +785,7 @@ impl Term { /// assert!(!Term::from_pattern("a+").unwrap().is_finite().unwrap()); /// ``` pub fn is_finite(&self) -> Result { - Ok(!matches!(self.get_cardinality()?, Cardinality::Infinite)) + Ok(!matches!(self.cardinality()?, Cardinality::Infinite)) } /// Converts the term to a [`FastAutomaton`]. 
@@ -770,6 +793,7 @@ impl Term { /// Returns a [`Cow`]: borrows the automaton when the term is already /// automaton-backed, and allocates a new one when converting from a /// [`RegularExpression`]. + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), @@ -783,6 +807,7 @@ impl Term { /// regex-backed, and allocates a new one when converting from a /// [`FastAutomaton`] via state elimination. #[must_use] + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] pub fn to_regex(&self) -> Cow<'_, RegularExpression> { match self { Term::RegularExpression(regex) => Cow::Borrowed(regex), @@ -830,14 +855,14 @@ impl Term { Ok(automaton_list) } - fn get_regexes<'a>(&'a self, terms: &[&'a Term]) -> Option>> { + fn get_regexes<'a>(&'a self, terms: &[&'a Term]) -> Vec> { let mut regex_list = Vec::with_capacity(terms.len() + 1); regex_list.push(self.to_regex()); let mut terms_regexes = terms.iter().map(|a| a.to_regex()).collect::>(); regex_list.append(&mut terms_regexes); - Some(regex_list) + regex_list } } @@ -903,17 +928,17 @@ mod tests { .unwrap() ); - println!("term: {}", term.to_automaton().unwrap().as_dot()); + println!("term: {}", term.to_automaton().unwrap().to_dot()); if let Term::Automaton(complement) = &complement { - println!("complement: {}", complement.as_dot()); + println!("complement: {}", complement.to_dot()); } let union = term.union(&[complement]).unwrap(); if let Term::Automaton(union) = &union { - println!("{}", union.as_dot()); + println!("{}", union.to_dot()); let union = union.determinize().unwrap(); - println!("{}", union.as_dot()); + println!("{}", union.to_dot()); } assert!(union.is_total().unwrap()); diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index 9a143df..5e2ab8b 100644 --- 
a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -8,7 +8,7 @@ mod number_of_states; impl RegularExpression { /// Returns the minimum and maximum length of possible matched strings. #[must_use] - pub fn get_length(&self) -> (Option, Option) { + pub fn length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { if range.is_empty() { @@ -17,7 +17,7 @@ impl RegularExpression { (Some(1), Some(1)) } RegularExpression::Repetition(regex, min, max_opt) => { - let (min_length, max_length_opt) = regex.get_length(); + let (min_length, max_length_opt) = regex.length(); if let Some(min_length) = min_length { let new_min_length = min * min_length; let new_max_length = if let Some(max_length) = max_length_opt { @@ -37,7 +37,7 @@ impl RegularExpression { let mut new_max_length = Some(0); for concat_element in concat_vec { - let (min_length, max_length_opt) = concat_element.get_length(); + let (min_length, max_length_opt) = concat_element.length(); if let Some(min_length) = min_length { new_min_length += min_length; @@ -64,7 +64,7 @@ impl RegularExpression { let mut new_max_length = Some(0); for alternation_element in alternation_vec { - let (min_length, max_length_opt) = alternation_element.get_length(); + let (min_length, max_length_opt) = alternation_element.length(); if let Some(min_length) = min_length { new_min_length = cmp::min(new_min_length, min_length); @@ -87,7 +87,7 @@ impl RegularExpression { } /// Returns the cardinality of the regular expression (i.e., the number of possible matched strings). 
- pub fn get_cardinality(&self) -> Cardinality { + pub fn cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); } else if self.is_total() { @@ -97,7 +97,7 @@ impl RegularExpression { RegularExpression::Character(range) => Cardinality::Integer(range.get_cardinality()), RegularExpression::Repetition(regular_expression, min, max_opt) => { if let Some(max) = max_opt { - let regex_cardinality = regular_expression.get_cardinality(); + let regex_cardinality = regular_expression.cardinality(); if let Cardinality::Integer(cardinality) = regex_cardinality { let mut cardinality_temp: u32 = 0; for i in *min..*max + 1 { @@ -122,7 +122,7 @@ impl RegularExpression { RegularExpression::Concat(concat) => { let mut cardinality: u32 = 1; for concat_element in concat { - let element_cardinality = concat_element.get_cardinality(); + let element_cardinality = concat_element.cardinality(); if let Cardinality::Integer(element_cardinality) = element_cardinality { if let Some(mult) = cardinality.checked_mul(element_cardinality) { cardinality = mult; @@ -138,7 +138,7 @@ impl RegularExpression { RegularExpression::Alternation(alternation) => { let mut cardinality: u32 = 0; for alternation_element in alternation { - let element_cardinality = alternation_element.get_cardinality(); + let element_cardinality = alternation_element.cardinality(); if let Cardinality::Integer(element_cardinality) = element_cardinality { if let Some(add) = cardinality.checked_add(element_cardinality) { cardinality = add; @@ -177,13 +177,13 @@ mod tests { ); assert_eq!( - FastAutomaton::new_empty().get_length(), - RegularExpression::new_empty().get_length() + FastAutomaton::new_empty().length(), + RegularExpression::new_empty().length() ); assert_eq!( - FastAutomaton::new_total().get_length(), - RegularExpression::new_total().get_length() + FastAutomaton::new_total().length(), + RegularExpression::new_total().length() ); Ok(()) } @@ -192,12 +192,12 @@ mod tests { println!("{}", 
regex); let regex = RegularExpression::new(regex).unwrap(); - let (min, max_opt) = regex.get_length(); + let (min, max_opt) = regex.length(); let automaton = regex.to_automaton().unwrap(); //automaton.to_dot(); - let (min_automaton_opt, max_automaton_opt) = automaton.get_length(); + let (min_automaton_opt, max_automaton_opt) = automaton.length(); assert_eq!((min_automaton_opt, max_automaton_opt), (min, max_opt)); } @@ -227,13 +227,13 @@ mod tests { println!("{}", regex); let regex = RegularExpression::new(regex).unwrap(); - let cardinality = regex.get_cardinality(); + let cardinality = regex.cardinality(); let automaton = regex.to_automaton().unwrap(); - // `get_cardinality` returns `Infinite` for cyclic automata without + // `cardinality` returns `Infinite` for cyclic automata without // determinizing and only determinizes the finite (acyclic) // non-deterministic ones internally. - let expected = automaton.get_cardinality().unwrap(); + let expected = automaton.cardinality().unwrap(); assert_eq!(expected, cardinality); } diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index d91a115..90d41ca 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -349,7 +349,7 @@ mod tests { let estimate = regex.get_number_of_states_in_nfa(); assert!(estimate >= 1, "state estimate of {regex} must be >= 1"); let automaton = regex.to_automaton().unwrap(); - assert!(automaton.get_number_of_states() >= 1); + assert!(automaton.number_of_states() >= 1); } } @@ -363,6 +363,6 @@ mod tests { let automaton = regex.to_automaton().unwrap(); - assert_eq!(automaton.get_number_of_states(), number_of_states_in_nfa); + assert_eq!(automaton.number_of_states(), number_of_states_in_nfa); } } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index 8a20183..46367c7 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -10,6 +10,7 @@ impl RegularExpression { } /// Parses the provided pattern and returns 
the resulting [`RegularExpression`]. If `simplify` is `true`, the expression is simplified during parsing. + #[tracing::instrument(level = "debug", skip(pattern), fields(pattern_len = pattern.len()))] pub fn parse(pattern: &str, simplify: bool) -> Result { if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); diff --git a/src/regex/mod.rs b/src/regex/mod.rs index bcd4575..228e5e4 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -172,6 +172,7 @@ impl RegularExpression { } /// Converts the regular expression to an equivalent [`FastAutomaton`]. + #[tracing::instrument(level = "trace", skip_all)] pub fn to_automaton(&self) -> Result { ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 4861aa5..f32e5be 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,13 +1,14 @@ use super::*; impl RegularExpression { - /// Returns a regular expression that is the concatenation of all expressions in `patterns`. + /// Returns a regular expression that is the concatenation of all expressions in `regexes`. + #[tracing::instrument(level = "trace", skip_all)] pub fn concat_all<'a, I: IntoIterator>( - patterns: I, + regexes: I, ) -> RegularExpression { let mut result = RegularExpression::new_empty_string(); - for other in patterns { + for other in regexes { result = result.concat(other, true); } @@ -15,6 +16,7 @@ impl RegularExpression { } /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. 
+ #[tracing::instrument(level = "trace", skip(self, other), fields(append_back = append_back))] pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs index 363e00d..15da7e7 100644 --- a/src/regex/operation/repeat.rs +++ b/src/regex/operation/repeat.rs @@ -6,6 +6,7 @@ impl RegularExpression { /// When `max_opt` is below `min` there is no valid repetition count and /// the result is the empty language, consistently with /// [`FastAutomaton::repeat`](crate::fast_automaton::FastAutomaton::repeat). + #[tracing::instrument(level = "trace", skip(self), fields(min = min, max_opt = tracing::field::debug(max_opt)))] pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { if self.is_total() { return RegularExpression::new_total(); diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index 6d9ed5f..be02823 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -2,6 +2,7 @@ use super::*; impl RegularExpression { /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. + #[tracing::instrument(level = "trace", skip_all)] pub fn simplify(&self) -> Self { match self { RegularExpression::Character(..) => self.clone(), diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index a8096f5..bcc6e37 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -4,17 +4,19 @@ use super::*; impl RegularExpression { /// Returns a regular expression matching the union of `self` and `other`. + #[tracing::instrument(level = "trace", skip_all)] pub fn union(&self, other: &RegularExpression) -> RegularExpression { Self::union_all([self, other]) } - /// Returns a regular expression that is the union of all expressions in `patterns`. 
+ /// Returns a regular expression that is the union of all expressions in `regexes`. + #[tracing::instrument(level = "trace", skip_all)] pub fn union_all<'a, I: IntoIterator>( - patterns: I, + regexes: I, ) -> RegularExpression { let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); - for other in patterns { + for other in regexes { result = result.union_(other); if result.is_total() { diff --git a/tests/proptest_strategies.rs b/tests/proptest_strategies.rs index 5f638cf..927ad7d 100644 --- a/tests/proptest_strategies.rs +++ b/tests/proptest_strategies.rs @@ -578,13 +578,13 @@ proptest! { assert_set_ops_membership(&a, &b, &probes_over(&['a', 'b', 'c', 'd'], 3))?; } - /// `get_length` and `get_cardinality` agree with brute-force enumeration. + /// `length` and `cardinality` agree with brute-force enumeration. /// The probes cover *every* string up to length 4, so they are exactly /// the language whenever the maximum length is ≤ 4, and a complete /// census of its short strings otherwise. #[test] fn length_cardinality_match_brute_force(a in arb_nfa()) { - let (min, max) = a.get_length(); + let (min, max) = a.length(); let matched_lengths: Vec = probes() .iter() .filter(|s| a.is_match(s)) @@ -613,7 +613,7 @@ proptest! { matched_lengths.iter().max().copied(), "max length disagrees with enumeration" ); - if let Some(cardinality) = bounded(|| a.get_cardinality()) { + if let Some(cardinality) = bounded(|| a.cardinality()) { prop_assert_eq!( cardinality, Cardinality::Integer(matched_lengths.len() as u32), @@ -622,7 +622,7 @@ proptest! { } } else if max.is_none() && min.is_some() - && let Some(cardinality) = bounded(|| a.get_cardinality()) + && let Some(cardinality) = bounded(|| a.cardinality()) { // A cycle on an accepting path means infinitely many strings. 
prop_assert_eq!( @@ -845,7 +845,7 @@ mod inspect { } else if m.is_total() { // exact on a DFA LangClass::Total - } else if m.get_length().1.is_some() { + } else if m.length().1.is_some() { LangClass::Finite } else { LangClass::Infinite @@ -859,10 +859,10 @@ mod inspect { /// *distinct* languages instead of distinct syntax trees. fn language_key(m: &FastAutomaton) -> String { use std::fmt::Write; - let ss = m.get_spanning_set(); - let mut order = vec![m.get_start_state()]; + let ss = m.spanning_set(); + let mut order = vec![m.start_state()]; let mut ids = std::collections::HashMap::new(); - ids.insert(m.get_start_state(), 0usize); + ids.insert(m.start_state(), 0usize); let mut key = String::new(); let mut i = 0; while i < order.len() { @@ -920,13 +920,7 @@ mod inspect { for s in a.states_vec() { for (cond, _) in a.transitions_from_vec(s) { edges += 1; - if cond - .get_binary_representation() - .iter() - .filter(|&&b| b) - .count() - > 1 - { + if cond.binary_representation().iter().filter(|&&b| b).count() > 1 { multi_base_edges += 1; } } @@ -937,7 +931,7 @@ mod inspect { multi_base_edges, deterministic: a.is_deterministic(), class: classify(&m), - minimal_states: m.get_number_of_states(), + minimal_states: m.number_of_states(), accepts_empty_string: m.is_match(""), key: language_key(&m), } diff --git a/tests/readme_examples.rs b/tests/readme_examples.rs index 331c071..2fc5807 100644 --- a/tests/readme_examples.rs +++ b/tests/readme_examples.rs @@ -54,10 +54,10 @@ fn readme_regular_expression_example() -> Result<(), EngineError> { let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?; // How long can matching ids get? Size your database column accordingly. - assert_eq!(pattern.get_length(), (Some(13), Some(15))); + assert_eq!(pattern.length(), (Some(13), Some(15))); // How many distinct ids does the pattern allow? 
- assert_eq!(pattern.get_cardinality(), Cardinality::Integer(111_000_000)); + assert_eq!(pattern.cardinality(), Cardinality::Integer(111_000_000)); // The AST is a plain enum: walk it to lint patterns, e.g. reject // validation rules that accept unboundedly long input. diff --git a/tests/state_elimination_quality.rs b/tests/state_elimination_quality.rs index 5a99b56..789204c 100644 --- a/tests/state_elimination_quality.rs +++ b/tests/state_elimination_quality.rs @@ -59,12 +59,8 @@ fn measure_state_elimination_quality() { } println!("=== state elimination quality over {count} patterns ==="); - println!( - "NFA: total_complexity = {total_complexity_nfa:.3}, total_len = {total_len_nfa}" - ); - println!( - "DFA: total_complexity = {total_complexity_dfa:.3}, total_len = {total_len_dfa}" - ); + println!("NFA: total_complexity = {total_complexity_nfa:.3}, total_len = {total_len_nfa}"); + println!("DFA: total_complexity = {total_complexity_dfa:.3}, total_len = {total_len_dfa}"); println!( "SUM: total_complexity = {:.3}, total_len = {}", total_complexity_nfa + total_complexity_dfa,