diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7730802..929d3d3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -2,23 +2,40 @@ name: Rust on: push: - branches: [ "main" ] + branches: ["main"] pull_request: - branches: [ "main" ] + branches: ["main"] env: CARGO_TERM_COLOR: always + RUSTDOCFLAGS: -D warnings jobs: build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Format + run: cargo fmt --all -- --check + - name: Build + run: cargo build --verbose + - name: Test & Lint + run: | + cargo test + cargo clippy -- -D warnings + - name: Test & Lint (no default features) + run: | + cargo test --no-default-features + cargo clippy --no-default-features -- -D warnings + - name: Docs + run: cargo doc --no-deps + audit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Build - run: cargo build --all-features --verbose - - name: Test & Lint - run: | - cargo test --all-features - cargo clippy --all-features + - uses: actions/checkout@v4 + - uses: rustsec/audit-check@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index d01bd1a..bf7ff1c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + +# cargo mutants output +mutants.out*/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..859a95c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,105 @@ +# Changelog + +All notable changes to this crate are documented in this file. 
+ +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] - upcoming 1.0.0 + +This is a major redesign of the public API around the `Term` enum (wrapping +either a `RegularExpression` or a `FastAutomaton`), which dispatches each +operation to the cheaper representation when possible. See the crate-level +docs for the architecture. Almost the entire public surface changed; highlights +below. + +### Added +- New `Term` constructors and conversions: `new_empty`, `new_total`, + `new_empty_string`, `from_pattern`, `from_regex(RegularExpression)`, + `from_automaton(FastAutomaton)`, plus `From<RegularExpression>`, + `From<FastAutomaton>`, `FromStr`, `Display`, and `Default` (= `new_empty`). +- New `Term` operations: `concat`, `complement`, `determinize`, `minimize`, + `matches`, `is_deterministic`, `is_minimal`, `is_finite`, `to_pattern`, and + `iter_strings` (lazy `StringGenerator` iterator). +- `to_regex`/`to_automaton` now return `Cow` to avoid unnecessary cloning. +- `FastAutomaton` gained corresponding low-level constructors/operations + (`new_empty`, `new_total`, `new_empty_string`, `determinize`, `minimize` + using Hopcroft's algorithm, `is_minimal`, `unaccept`, `print_dot`, + `try_add_transition`) and inspection helpers (`states`, `direct_states`, + `transitions_from`, `transitions_to_vec`, `has_transition`, ...). +- New `EngineError` variants: `InvalidRepetitionBounds`, + `IncompatibleSpanningSet`, `DeterministicAutomatonRequired`; the enum is + now `#[non_exhaustive]`. +- `tracing` instrumentation on the core `Term`, `FastAutomaton`, and + `RegularExpression` operations (concat, union, intersection, difference, + complement, repeat, determinize, minimize, equivalence/subset checks, + string generation, conversions). No-op unless a `tracing` subscriber is + installed. 
+- Parallel (Rayon-backed) variants of union/intersection for >3 operands, + gated behind the default-on `parallel` feature, with sequential fallbacks + for `--no-default-features`. +- `cargo fmt --check`, `cargo clippy -- -D warnings`, `cargo doc --no-deps` + (with `RUSTDOCFLAGS=-D warnings`), and a dependency vulnerability audit + (`rustsec/audit-check`) to CI. + +### Changed +- `ExecutionProfile` redesigned as an immutable, thread-local-aware config + built via the new `ExecutionProfileBuilder`, governing execution timeouts, + state-count limits, and an `implicit_determinization` toggle. +- `union`/`intersection`/`concat` now take + `impl IntoIterator<Item = impl Borrow<Term>>` instead of `&[Term]`, so + `&[a, b]`, `[&a, &b]`, and `Vec<Term>` all work without cloning. +- `repeat` now takes `impl RangeBounds` (e.g. `3..6`, `..=2`) instead of + explicit min/max parameters. +- `generate_strings` now takes `(limit, offset)` for pagination instead of a + single `count`. +- `is_empty`, `is_total`, and `is_empty_string` now return + `Result<bool, EngineError>` instead of `bool`. +- `are_equivalent`/`is_subset_of` renamed to `equivalent`/`subset`. +- `subtraction` renamed to `difference`, kept single-operand by design. +- Renamed `FastAutomaton::as_dot` to `to_dot` (old printing `to_dot` is now + `print_dot`), matching the crate's `to_*` convention for allocating + conversions (`to_pattern`, `to_regex`, `to_automaton`, `to_range`). 
+- Renamed `get_*` accessors to drop the `get_` prefix, per the Rust API + Guidelines' C-GETTER convention: `Term::get_length` to `length`, + `Term::get_cardinality` to `cardinality`, `FastAutomaton::get_length` to + `length`, `FastAutomaton::get_cardinality` to `cardinality`, + `FastAutomaton::get_number_of_states` to `number_of_states`, + `FastAutomaton::get_condition` to `condition`, + `FastAutomaton::get_start_state` to `start_state`, + `FastAutomaton::get_accept_states` to `accept_states`, + `FastAutomaton::get_spanning_set` to `spanning_set`, + `FastAutomaton::get_live_states` to `live_states`, + `FastAutomaton::get_spanning_bases` to `spanning_bases`, + `RegularExpression::get_length` to `length`, + `RegularExpression::get_cardinality` to `cardinality`, + `SpanningSet::get_spanning_ranges` to `spanning_ranges`, + `SpanningSet::get_number_of_spanning_ranges` to + `number_of_spanning_ranges`, `SpanningSet::get_spanning_range` to + `spanning_range`, `SpanningSet::get_rest` to `rest`, + `Condition::get_cardinality` to `cardinality`, + `Condition::get_binary_representation` to `binary_representation`, + `ConditionConverter::get_from_spanning_set`/`get_to_spanning_set` to + `from_spanning_set`/`to_spanning_set`. +- Edition bumped to 2024 and `Cargo.toml` metadata (`description`, + `categories`) updated. + +### Removed +- The `serde` feature and all serialization, FAIR (base85) encoding, + encryption, and compression support (`serde`, `ciborium`, `z85`, + `aes-gcm-siv`, `sha2`, `flate2` dependencies). +- `Term::get_details` and the `Details` type. +- The `tokenizer` module. +- Unused `log`, `rand`, and `lazy_static` dependencies, and the `regex` + crate dependency (now dev-only, used by integration tests). +- `EngineError` variants `AutomatonShouldBeDeterministic`, `TooMuchTerms`, + `ConditionIndexOutOfBound`, `TokenError`, and the `is_server_error` method. +- The `max_number_of_terms` execution-profile limit (no longer enforced). 
+ +## Earlier releases + +Releases prior to 1.0.0 (`v0.1.0` through `v0.3.1`) predate this changelog; +see the [GitHub tags](https://github.com/RegexSolver/regexsolver/tags) and +commit history for details. + +[Unreleased]: https://github.com/RegexSolver/regexsolver/compare/v0.3.1...HEAD diff --git a/Cargo.toml b/Cargo.toml index cd03087..31f95f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,50 +1,34 @@ [package] name = "regexsolver" -version = "0.3.1" -edition = "2021" +version = "1.0.0" +edition = "2024" authors = ["Alexandre van Beurden"] repository = "https://github.com/RegexSolver/regexsolver" license = "MIT" keywords = ["automaton", "intersection", "union", "difference", "regex"] -description = "Manipulate regex and automaton as if they were sets." +categories = ["text-processing", "mathematics", "algorithms"] +description = "High-performance Rust library for building, combining, and analyzing regular expressions and finite automata" readme = "README.md" [dependencies] -serde = { version = "1.0", features = ["derive"], optional = true } -ciborium = { version = "0.2.2", optional = true } -z85 = { version = "3.0.5", optional = true } -aes-gcm-siv = { version = "0.11.1", optional = true } -sha2 = { version = "0.10.8", optional = true } -flate2 = { version = "1.0.30", features = [ - "zlib-ng", -], default-features = false, optional = true } +tracing = "0.1" nohash-hasher = "0.2" ahash = "0.8.11" -log = "0.4.21" -rand = "0.8.5" -lazy_static = "1.4.0" -regex = "1.10.3" regex-syntax = "0.8.5" regex-charclass = { version = "1.0.3" } +rayon = { version = "1.10.0", optional = true } +bit-set = "0.8.0" +indexmap = "2.13.0" + +[features] +default = ["parallel"] +parallel = ["dep:rayon"] [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } -env_logger = "0.11.3" -serde_json = "1.0.114" - - -[features] -default = ["serde"] -serde = [ - "regex-charclass/serde", - "dep:serde", - "dep:ciborium", - "dep:z85", - "dep:aes-gcm-siv", - "dep:sha2", - 
"dep:flate2", -] +proptest = "1" +regex = "1.10.3" [[bench]] -name = "my_benchmark" -harness = false \ No newline at end of file +name = "operations" +harness = false diff --git a/README.md b/README.md index dcb0b47..6a7e523 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,241 @@ # RegexSolver [![Crates.io Version](https://img.shields.io/crates/v/regexsolver)](https://crates.io/crates/regexsolver) +[![docs.rs](https://img.shields.io/docsrs/regexsolver)](https://docs.rs/regexsolver) +[![CI](https://github.com/RegexSolver/regexsolver/actions/workflows/rust.yml/badge.svg)](https://github.com/RegexSolver/regexsolver/actions/workflows/rust.yml) +[![License: MIT](https://img.shields.io/crates/l/regexsolver)](LICENSE) -This repository contains the code of [RegexSolver](https://regexsolver.com/) engine. +The `regex` crate tells you whether a *string* matches a pattern. **RegexSolver treats patterns as the sets of strings they match** — so you can intersect, subtract, compare, complement, and enumerate them, and get the result back as a regex. -For more information, you can check the library's [documentation](https://docs.rs/regexsolver/latest/regexsolver/). +```rust +use regexsolver::Term; -If you want to use this library with other programming languages, we provide a wide range of wrappers: +let a: Term = "(ab|xy){2}".parse()?; +let b: Term = ".*xy".parse()?; -- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) -- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) -- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) +// Which strings match BOTH patterns? Get the answer as a regex: +let both = a.intersection([&b])?; +assert_eq!(both.to_pattern(), "(ab|xy)xy"); + +// Test a concrete string against the result (matching is anchored): +assert!(both.matches("abxy")?); + +// ...and sample them: +assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]); +``` + +## What would you use this for? 
+ +- **Safe migrations** - `old_rule.subset(&new_rule)?`: does the new validation pattern accept *everything* the old one did? +- **Test-data generation** - `term.generate_strings(100, 0)?`: produce strings matching any pattern, with pagination. +- **Rule analysis**: find shadowed or overlapping routes, firewall rules, and validators with `intersection` / `difference`. +- **Equivalence proofs** - `a.equivalent(&b)?`: show that two differently-written patterns match exactly the same strings. +- **Pattern simplification**: every operation returns a `Term` you can turn back into a regex pattern with `to_pattern()`. + +Under the hood, every pattern compiles to a finite automaton: -For more information about how to use the wrappers, you can refer to our [getting started guide](https://docs.regexsolver.com/getting-started.html). +

the minimal automaton of (ab|cd)*

+

(ab|cd)* compiled to its minimal automaton, generated with this library's to_dot()

-## Installation +## Try it + +```bash +git clone https://github.com/RegexSolver/regexsolver && cd regexsolver + +# How do two patterns relate? (equivalence, subsets, intersection, differences) +cargo run --example relate -- "(ab|xy){2}" ".*xy" + +# Generate n sample strings matching a pattern +cargo run --example generate -- "[a-z]{2}[0-9]" 20 +``` -Add the following line in your `Cargo.toml`: +Or in your own project: + +```bash +cargo add regexsolver +``` + +By default the `parallel` feature is enabled: unions/intersections of more than 3 operands and parts of the automaton-to-regex conversion run on [rayon](https://crates.io/crates/rayon). Disable it for a leaner dependency tree on single-threaded workloads: ```toml -[dependencies] -regexsolver = "0.3" +regexsolver = { version = "1", default-features = false } ``` -## Examples +## Semantics in 30 seconds + +RegexSolver implements **pure regular languages**, which differs from typical regex engines in two ways: + +- **Everything is anchored**: `abc` matches the string "abc", not "xabc" or "abcx". Patterns describe *whole strings*. +- **`.` matches any character**, including line feed (`\n`). + +The rest follows from regular-language theory: + +- **Backreferences** (`\1`, `\2`, ...) go beyond regular languages and return an error, as do **lookahead/lookbehind** assertions (`(?=...)`, `(?<=...)`). +- **All quantifiers are greedy**: ungreedy markers (`*?`, `+?`, `??`) are ignored, because as *sets of strings* `a*` and `a*?` are the same language. +- **The empty language** (matches no string at all) is written `[]` (empty character class). This is distinct from the empty string `""`. + +RegexSolver is based on the [regex-syntax](https://docs.rs/regex-syntax/0.8.5/regex_syntax/) library for parsing patterns. Unsupported features are parsed but ignored; they do not raise an error unless they affect semantics that cannot be represented (e.g., backreferences). 
This allows for some flexibility in writing regular expressions, but it is important to be aware of the unsupported features to avoid unexpected behavior. -### Union +## A tour of the API + +[`Term`](https://docs.rs/regexsolver/latest/regexsolver/enum.Term.html) is the type you'll interact with: it wraps either a regular expression or an automaton and picks the best representation for each operation. The essentials: + +| Method | Description | +| -------- | ------- | +| `Term::from_pattern(pattern)` | Parses a pattern into a term. | +| `intersection(&self, terms)` / `union(&self, terms)` | Set operations over any number of terms. | +| `difference(&self, other)` / `complement(&self)` | What `self` matches and `other` doesn't / everything `self` doesn't match. | +| `concat(&self, terms)` / `repeat(&self, range)` | Sequence and repeat languages; `range` is any Rust range expression (`2..=5`, `1..`, `..3`, ...). | +| `equivalent(&self, other)` / `subset(&self, other)` | Compare languages. | +| `is_empty()` / `is_total()` / `length()` / `cardinality()` | Analyze a language: matches nothing? everything? string lengths? how many strings? | +| `generate_strings(limit, offset)` | Enumerate matching strings eagerly (call `minimize()` once first when paginating). | +| `iter_strings()` | Lazy iterator equivalent; computes the automaton once and yields strings in batches. | +| `to_pattern()` / `to_automaton()` / `to_regex()` | Convert back out. | + +All fallible operations return `Result<_, EngineError>`. + +### Building automata by hand + +`FastAutomaton` is used to directly build, manipulate and analyze automata. To convert an automaton to a `RegularExpression` the method `to_regex()` can be used. 
+ +States are created with `new_state()` and transitions with `add_transition_from_range`, which labels the transition with a plain `CharRange`: ```rust -use regexsolver::Term; +use regexsolver::CharRange; +use regexsolver::fast_automaton::FastAutomaton; +use regex_charclass::char::Char; + +// Build an automaton matching "[a-c][0-9]*" by hand: +let mut automaton = FastAutomaton::new_empty(); +let s1 = automaton.new_state(); +automaton.accept(s1); + +let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c')); +let digits = CharRange::new_from_range(Char::new('0')..=Char::new('9')); +automaton.add_transition_from_range(0, s1, &a_to_c)?; +automaton.add_transition_from_range(s1, s1, &digits)?; + +assert!(automaton.is_match("b42")); +assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*"); +``` + +Internally, transition labels are bitvector `Condition`s over the automaton's `SpanningSet` of disjoint character ranges, that is what makes label union/intersection/complement O(1) ([article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation)). `add_transition_from_range` maintains that representation for you; for full manual control over conditions and spanning sets, see the [`add_transition` documentation](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html#method.add_transition). -let term1 = Term::from_regex("abc").unwrap(); -let term2 = Term::from_regex("de").unwrap(); -let term3 = Term::from_regex("fghi").unwrap(); +Everything `Term` does is also available directly on [`FastAutomaton`](https://docs.rs/regexsolver/latest/regexsolver/fast_automaton/struct.FastAutomaton.html), including `determinize`, `minimize`, the set operations, `equivalent`/`subset`, the analyses, `generate_strings`, `to_regex`, plus low-level construction (`new_state`, `accept`, `add_epsilon_transition`, ...) and inspection (`states`, `transitions_from`, `to_dot`, ...). 
-let union = term1.union(&[term2, term3]).unwrap(); +### Working with patterns as ASTs -if let Term::RegularExpression(regex) = union { - println!("{}", regex.to_string()); // (abc|de|fghi) +`RegularExpression` is the parsed pattern itself: a plain AST enum (`Character` / `Repetition` / `Concat` / `Alternation`) you can analyze and walk directly. Set operations like intersection and difference live on `FastAutomaton` (or, more conveniently, on `Term`); convert with `to_automaton()`. + +```rust +use regexsolver::cardinality::Cardinality; +use regexsolver::regex::RegularExpression; + +// A validation pattern for an order id, e.g. "ORD-2024-12345". +let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?; + +// How long can matching ids get? Size your database column accordingly. +assert_eq!(pattern.length(), (Some(13), Some(15))); + +// How many distinct ids does the pattern allow? +assert_eq!(pattern.cardinality(), Cardinality::Integer(111_000_000)); + +// The AST is a plain enum: walk it to lint patterns, e.g. reject +// validation rules that accept unboundedly long input. +fn has_unbounded_repetition(regex: &RegularExpression) -> bool { + match regex { + RegularExpression::Character(_) => false, + RegularExpression::Repetition(inner, _, max) => { + max.is_none() || has_unbounded_repetition(inner) + } + RegularExpression::Concat(parts) => parts.iter().any(has_unbounded_repetition), + RegularExpression::Alternation(parts) => parts.iter().any(has_unbounded_repetition), + } } +assert!(!has_unbounded_repetition(&pattern)); +assert!(has_unbounded_repetition(&RegularExpression::new(".*@example\\.com")?)); ``` -### Intersection +The variants are freely constructible too; a hand-built repetition whose maximum is below its minimum denotes no valid language and is rejected with `EngineError::InvalidRepetitionBounds` when converted by `to_automaton()`. 
+ +Parsing (`new`, `parse`), the simplifying combinators (`concat`, `union`, `repeat`, `simplify`) and the analyses (`length`, `cardinality`, `evaluate_complexity`) are documented on [`RegularExpression`](https://docs.rs/regexsolver/latest/regexsolver/regex/enum.RegularExpression.html). + +## Bound Execution + +Automaton operations can blow up on adversarial inputs, so the engine is built to run untrusted patterns safely: a thread-local `ExecutionProfile` caps runtime and state explosion, and controls when the engine may determinize or minimize on its own. Hitting a limit returns a specific `EngineError` instead of hanging or panicking. + +### Time-Bounded Execution ```rust -use regexsolver::Term; +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_regex("(abc|de){2}").unwrap(); -let term2 = Term::from_regex("de.*").unwrap(); -let term3 = Term::from_regex(".*abc").unwrap(); +let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*")?; -let intersection = term1.intersection(&[term2, term3]).unwrap(); +let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(5) // limit in milliseconds + .build(); -if let Term::RegularExpression(regex) = intersection { - println!("{}", regex.to_string()); // deabc -} +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000).unwrap_err()); +}); ``` -### Difference/Subtraction +### State-Limited Execution ```rust -use regexsolver::Term; +use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; -let term1 = Term::from_regex("(abc|de)").unwrap(); -let term2 = Term::from_regex("de").unwrap(); +let term1 = Term::from_pattern(".*abcdef.*")?; +let term2 = Term::from_pattern(".*defabc.*")?; -let subtraction = term1.subtraction(&term2).unwrap(); +let execution_profile = 
ExecutionProfileBuilder::new() + .max_number_of_states(5) // we set the limit + .build(); -if let Term::RegularExpression(regex) = subtraction { - println!("{}", regex.to_string()); // abc -} +// We run the operation with the defined limitation +execution_profile.run(|| { + assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +}); ``` + +### Disabling Implicit Determinization + +`FastAutomaton` operations that require a deterministic automaton (`minimize`, `complement`, `difference`, `equivalent`, `subset`, `cardinality`, ...) determinize a non-deterministic input on their own by default. Since subset construction can blow up exponentially, this can be disabled: those operations then return `EngineError::DeterministicAutomatonRequired` instead, and determinization only happens through an explicit `determinize()` call. Deterministic inputs are always accepted, and the whole `Term` API keeps working since that layer manages the underlying representation itself, so its determinizations count as explicit. + +```rust +use regexsolver::execution_profile::ExecutionProfileBuilder; +use regexsolver::error::EngineError; + +let execution_profile = ExecutionProfileBuilder::new() + .implicit_determinization(false) // default is true + .build(); + +// `nfa` is any non-deterministic FastAutomaton +execution_profile.run(|| { + assert_eq!(EngineError::DeterministicAutomatonRequired, nfa.clone().minimize().unwrap_err()); + + // Determinizing explicitly is always allowed. + let mut dfa = nfa.determinize().unwrap().into_owned(); + assert!(dfa.minimize().is_ok()); +}); +``` + +## How it works + +- Patterns are parsed with [regex-syntax](https://docs.rs/regex-syntax/latest/regex_syntax/) and simplified into a small regular-expression AST; set operations run on finite automata; results convert back to patterns via state elimination. 
+- Transition labels are bitvectors over a per-automaton "spanning set" of disjoint character ranges, making label union/intersection/complement O(1): see [Optimizing Automaton Representation with Transition Conditions](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). +- Correctness is cross-validated against the `regex` crate and exercised by property-based tests over randomly generated automata and expressions, with brute-force oracles for the analyses. + +## Cross-Language Support + +If you want to use this library with other programming languages, we provide a wide range of wrappers: +- [regexsolver-java](https://github.com/RegexSolver/regexsolver-java) +- [regexsolver-js](https://github.com/RegexSolver/regexsolver-js) +- [regexsolver-python](https://github.com/RegexSolver/regexsolver-python) + +For more information about how to use the wrappers, you can refer to our [guide](https://docs.regexsolver.com/getting-started.html). + +## License + +This project is licensed under the MIT License. 
diff --git a/assets/automaton.svg b/assets/automaton.svg new file mode 100644 index 0000000..dabacdd --- /dev/null +++ b/assets/automaton.svg @@ -0,0 +1,70 @@ + + + + + + +Automaton + + + +1 + +1 + + + +2 + + +2 + + + +1->2 + + +b + + + +2->1 + + +a + + + +3 + +3 + + + +2->3 + + +c + + + +initial + + + +initial->2 + + + + + +3->2 + + +d + + + diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs deleted file mode 100644 index f2f9fdc..0000000 --- a/benches/my_benchmark.rs +++ /dev/null @@ -1,87 +0,0 @@ -use ahash::AHashSet; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use regexsolver::{fast_automaton::FastAutomaton, regex::RegularExpression}; - -fn parse_regex(regex: &str) -> RegularExpression { - RegularExpression::new(regex).unwrap() -} - -fn to_regex(automaton: &FastAutomaton) -> RegularExpression { - automaton.to_regex().unwrap() -} - -fn determinize(automaton: &FastAutomaton) -> FastAutomaton { - automaton.determinize().unwrap() -} - -fn intersection(automaton_1: &FastAutomaton, automaton_2: &FastAutomaton) -> FastAutomaton { - automaton_1.intersection(automaton_2).unwrap() -} - -fn generate_strings(automaton: &FastAutomaton) -> AHashSet { - automaton.generate_strings(2000).unwrap() -} - -fn criterion_benchmark(c: &mut Criterion) { - { - c.bench_function("parse_regex", |b| { - b.iter(|| parse_regex(black_box("a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}"))) - }); - } - - { - let input_regex = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){2,3}").unwrap(); - let input_automaton = input_regex.to_automaton().unwrap(); - - c.bench_function("to_regex", |b| { - b.iter(|| to_regex(black_box(&input_automaton))) - }); - } - - { - let input_regex = RegularExpression::new( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - ) - .unwrap(); - let input_automaton = input_regex.to_automaton().unwrap(); - - c.bench_function("determinize", |b| { - b.iter(|| determinize(black_box(&input_automaton))) - }); - } - - 
/*{ - let input_regex = RegularExpression::new("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5}").unwrap(); - let input_automaton = input_regex.to_automaton().unwrap(); - - c.bench_function("test_determinize", |b| { - b.iter(|| determinize(black_box(&input_automaton))) - }); - }*/ - - { - let automaton1 = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") - .unwrap() - .to_automaton().unwrap(); - let automaton2 = RegularExpression::new("a(bcfe|mkv|opr)*(abc){2,4}") - .unwrap() - .to_automaton().unwrap(); - - c.bench_function("intersection", |b| { - b.iter(|| intersection(black_box(&automaton1), black_box(&automaton2))) - }); - } - - { - let automaton = RegularExpression::new("a(bcfe|bcdg|mkv)*(abc){1,3}") - .unwrap() - .to_automaton().unwrap(); - - c.bench_function("generate_strings", |b| { - b.iter(|| generate_strings(black_box(&automaton))) - }); - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/benches/operations.rs b/benches/operations.rs new file mode 100644 index 0000000..46ebc36 --- /dev/null +++ b/benches/operations.rs @@ -0,0 +1,285 @@ +//! Benchmarks covering the main operation families of the library. +//! +//! Inputs come in named sizes so numbers stay comparable across versions: +//! +//! * `small` / `medium` / `large` — realistic patterns of increasing size. +//! * `blowup_N` — the classic `(a|b)*a(a|b){N}` family whose minimal DFA has +//! 2^(N+1) states: the worst case of subset construction. +//! +//! Mutating operations (`minimize`, `complement`) are measured with +//! `iter_batched` on a fresh clone per iteration, so flag short-circuits +//! (e.g. `minimize` early-returning on an already-minimal automaton) don't +//! skew the numbers. 
+ +use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main}; +use regex_charclass::char::Char; +use regexsolver::fast_automaton::FastAutomaton; +use regexsolver::regex::RegularExpression; +use regexsolver::{CharRange, Term}; +use std::hint::black_box; + +const SMALL: (&str, &str) = ("small", "(abc|de){2}"); +const MEDIUM: (&str, &str) = ("medium", "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}"); +const LARGE: (&str, &str) = ( + "large", + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", +); + +fn automaton(pattern: &str) -> FastAutomaton { + RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap() +} + +fn dfa(pattern: &str) -> FastAutomaton { + automaton(pattern).determinize().unwrap().into_owned() +} + +/// `(a|b)*a(a|b){n}`: an n+2-state NFA whose minimal DFA has 2^(n+1) states. +fn blowup_pattern(n: usize) -> String { + format!("(a|b)*a(a|b){{{n}}}") +} + +fn bench_parse(c: &mut Criterion) { + let mut group = c.benchmark_group("parse"); + for (name, pattern) in [SMALL, MEDIUM, LARGE] { + group.bench_with_input(BenchmarkId::from_parameter(name), pattern, |b, pattern| { + b.iter(|| RegularExpression::new(black_box(pattern)).unwrap()) + }); + } + group.finish(); +} + +fn bench_to_automaton(c: &mut Criterion) { + let mut group = c.benchmark_group("to_automaton"); + for (name, pattern) in [SMALL, MEDIUM, LARGE] { + let regex = RegularExpression::new(pattern).unwrap(); + group.bench_with_input(BenchmarkId::from_parameter(name), &regex, |b, regex| { + b.iter(|| black_box(regex).to_automaton().unwrap()) + }); + } + group.finish(); +} + +fn bench_determinize(c: &mut Criterion) { + let mut group = c.benchmark_group("determinize"); + for n in [5, 10] { + let nfa = automaton(&blowup_pattern(n)); + group.bench_with_input(BenchmarkId::new("blowup", n), &nfa, |b, nfa| { + b.iter(|| black_box(nfa).determinize().unwrap().into_owned()) + }); + } + let nfa = automaton(LARGE.1); + 
group.bench_with_input(BenchmarkId::from_parameter("large"), &nfa, |b, nfa| { + b.iter(|| black_box(nfa).determinize().unwrap().into_owned()) + }); + group.finish(); +} + +fn bench_minimize(c: &mut Criterion) { + let mut group = c.benchmark_group("minimize"); + for n in [5, 10] { + let blowup_dfa = dfa(&blowup_pattern(n)); + group.bench_with_input(BenchmarkId::new("blowup", n), &blowup_dfa, |b, dfa| { + b.iter_batched( + || dfa.clone(), + |mut automaton| { + automaton.minimize().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }); + } + let large_dfa = dfa(LARGE.1); + group.bench_with_input( + BenchmarkId::from_parameter("large"), + &large_dfa, + |b, dfa| { + b.iter_batched( + || dfa.clone(), + |mut automaton| { + automaton.minimize().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }, + ); + group.finish(); +} + +fn bench_set_operations(c: &mut Criterion) { + let mut group = c.benchmark_group("set_operations"); + + let a = automaton("a(bcfe|bcdg|mkv)*(abc){1,3}"); + let b_op = automaton("a(bcfe|mkv|opr)*(abc){2,4}"); + group.bench_function("intersection", |b| { + b.iter(|| black_box(&a).intersection(black_box(&b_op)).unwrap()) + }); + group.bench_function("union", |b| { + b.iter(|| black_box(&a).union(black_box(&b_op)).unwrap()) + }); + + let minuend = automaton(".*abc.*"); + let subtrahend = automaton(".*def.*"); + group.bench_function("difference", |b| { + b.iter(|| { + black_box(&minuend) + .difference(black_box(&subtrahend)) + .unwrap() + }) + }); + + let complement_input = dfa(".*abc.*"); + group.bench_function("complement", |b| { + b.iter_batched( + || complement_input.clone(), + |mut automaton| { + automaton.complement().unwrap(); + automaton + }, + BatchSize::SmallInput, + ) + }); + + group.finish(); +} + +fn bench_decision(c: &mut Criterion) { + let mut group = c.benchmark_group("decision"); + + // Same language, structurally different automata: the `self == other` + // shortcut cannot fire, forcing the full check in both directions. 
+ let left_form = automaton("(a|b)*abc(a|b)*"); + let right_form = automaton("(a*b*)*abc(b*a*)*"); + assert_ne!(left_form, right_form); + assert!(left_form.equivalent(&right_form).unwrap()); + group.bench_function("equivalent", |b| { + b.iter(|| { + black_box(&left_form) + .equivalent(black_box(&right_form)) + .unwrap() + }) + }); + + let smaller = automaton("abc(de|fg){1,3}"); + let bigger = automaton("abc.*"); + group.bench_function("subset", |b| { + b.iter(|| black_box(&smaller).subset(black_box(&bigger)).unwrap()) + }); + + let left = automaton(".*abc.*"); + let right = automaton(".*cba.*"); + group.bench_function("has_intersection", |b| { + b.iter(|| { + black_box(&left) + .has_intersection(black_box(&right)) + .unwrap() + }) + }); + + group.finish(); +} + +fn bench_analyze(c: &mut Criterion) { + let mut group = c.benchmark_group("analyze"); + + let finite = dfa("[a-z]{1,6}"); + group.bench_function("length/finite", |b| b.iter(|| black_box(&finite).length())); + group.bench_function("cardinality/finite", |b| { + b.iter(|| black_box(&finite).cardinality().unwrap()) + }); + + let infinite = automaton(LARGE.1); + group.bench_function("length/large", |b| b.iter(|| black_box(&infinite).length())); + + group.finish(); +} + +fn bench_to_regex(c: &mut Criterion) { + let mut group = c.benchmark_group("to_regex"); + + let nfa = automaton(MEDIUM.1); + group.bench_function("nfa", |b| b.iter(|| black_box(&nfa).to_regex())); + + let medium_dfa = dfa(MEDIUM.1); + group.bench_function("dfa", |b| b.iter(|| black_box(&medium_dfa).to_regex())); + + group.finish(); +} + +fn bench_generate_strings(c: &mut Criterion) { + let mut group = c.benchmark_group("generate_strings"); + + let automaton = dfa("[a-z]{1,4}"); + group.bench_function("first_2000", |b| { + b.iter(|| black_box(&automaton).generate_strings(2000, 0).unwrap()) + }); + + // The offset fast-skips whole subtrees by counting paths. 
+ let deep = dfa("[a-z]{1,10}"); + group.bench_function("deep_offset", |b| { + b.iter(|| black_box(&deep).generate_strings(100, 1_000_000).unwrap()) + }); + + group.finish(); +} + +fn bench_construction(c: &mut Criterion) { + let mut group = c.benchmark_group("construction"); + + // A 64-transition chain over a growing alphabet: every few transitions + // extend the spanning set and re-project the existing conditions. + group.bench_function("add_transition_from_range/chain_64", |b| { + b.iter(|| { + let mut automaton = FastAutomaton::new_empty(); + let mut previous = 0; + for i in 0..64u8 { + let next = automaton.new_state(); + let character = Char::new(char::from(b'a' + (i % 26))); + let range = CharRange::new_from_range(character..=character); + automaton + .add_transition_from_range(previous, next, &range) + .unwrap(); + previous = next; + } + automaton.accept(previous); + automaton + }) + }); + + group.finish(); +} + +fn bench_end_to_end(c: &mut Criterion) { + let mut group = c.benchmark_group("end_to_end"); + + // The front-page scenario: parse two patterns, intersect, print back. + group.bench_function("intersection_to_pattern", |b| { + b.iter(|| { + let a = Term::from_pattern(black_box("(ab|xy){2}")).unwrap(); + let b_term = Term::from_pattern(black_box(".*xy")).unwrap(); + a.intersection(&[b_term]).unwrap().to_pattern() + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_parse, + bench_to_automaton, + bench_determinize, + bench_minimize, + bench_set_operations, + bench_decision, + bench_analyze, + bench_to_regex, + bench_generate_strings, + bench_construction, + bench_end_to_end, +); +criterion_main!(benches); diff --git a/examples/generate.rs b/examples/generate.rs new file mode 100644 index 0000000..9c9fa6e --- /dev/null +++ b/examples/generate.rs @@ -0,0 +1,25 @@ +//! Generate strings matching a regex pattern. +//! +//! ```text +//! cargo run --example generate -- "[a-z]{2}[0-9]" 20 +//! 
``` + +use regexsolver::Term; + +fn main() -> Result<(), Box> { + let mut args = std::env::args().skip(1); + let Some(pattern) = args.next() else { + eprintln!("Usage: cargo run --example generate -- [count]"); + std::process::exit(2); + }; + let count: usize = args.next().map(|c| c.parse()).transpose()?.unwrap_or(10); + + // Minimize once: pagination over the same minimized term yields + // disjoint, consistent pages (see `Term::generate_strings`). + let term = Term::from_pattern(&pattern)?.minimize()?; + for string in term.generate_strings(count, 0)? { + println!("{string:?}"); + } + + Ok(()) +} diff --git a/examples/relate.rs b/examples/relate.rs new file mode 100644 index 0000000..ff056aa --- /dev/null +++ b/examples/relate.rs @@ -0,0 +1,51 @@ +//! Explore how two regex patterns relate as languages. +//! +//! ```text +//! cargo run --example relate -- "(abc|de){2}" ".*xy" +//! ``` + +use regexsolver::Term; + +fn main() -> Result<(), Box> { + let mut args = std::env::args().skip(1); + let (Some(a), Some(b)) = (args.next(), args.next()) else { + eprintln!("Usage: cargo run --example relate -- "); + std::process::exit(2); + }; + + let a_term = Term::from_pattern(&a)?; + let b_term = Term::from_pattern(&b)?; + + println!("a = {a}"); + println!("b = {b}"); + println!(); + + if a_term.equivalent(&b_term)? { + println!("a and b match exactly the same strings."); + return Ok(()); + } + println!("equivalent: no"); + println!("a subset of b: {}", a_term.subset(&b_term)?); + println!("b subset of a: {}", b_term.subset(&a_term)?); + println!(); + + let intersection = a_term.intersection([&b_term])?; + if intersection.is_empty()? { + println!("a ∩ b = [] (no string matches both)"); + } else { + println!("a ∩ b = {}", intersection.to_pattern()); + println!(" e.g. {:?}", intersection.generate_strings(5, 0)?); + } + + let pattern_or_empty = |term: Term| -> Result> { + Ok(if term.is_empty()? 
{ + "[]".to_string() + } else { + term.to_pattern() + }) + }; + println!("a - b = {}", pattern_or_empty(a_term.difference(&b_term)?)?); + println!("b - a = {}", pattern_or_empty(b_term.difference(&a_term)?)?); + + Ok(()) +} diff --git a/src/cardinality/mod.rs b/src/cardinality/mod.rs index 08131e0..d2e054c 100644 --- a/src/cardinality/mod.rs +++ b/src/cardinality/mod.rs @@ -1,10 +1,5 @@ -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; - -/// Represent a number. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +/// Represents a cardinality: either a specific integer, a number too large to represent exactly, or infinite. #[derive(PartialEq, Eq, Debug, Clone)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] pub enum Cardinality { /// An infinite number. Infinite, diff --git a/src/error/mod.rs b/src/error/mod.rs index 6447ebe..df44b9d 100644 --- a/src/error/mod.rs +++ b/src/error/mod.rs @@ -1,28 +1,26 @@ use std::fmt::{self}; -use crate::tokenizer::token::TokenError; - /// An error thrown by the engine. #[derive(Debug, PartialEq, Eq)] +#[non_exhaustive] pub enum EngineError { /// Invalid character used in regex. InvalidCharacterInRegex, /// The operation took too much time. OperationTimeOutError, - /// The given automaton should be deterministic. - AutomatonShouldBeDeterministic, /// The automaton has too many states. AutomatonHasTooManyStates, - /// The regular expression can not be parsed. + /// The regular expression cannot be parsed. RegexSyntaxError(String), - /// Too many terms are used in the operation. - TooMuchTerms(usize, usize), - /// The provided range can not be built from the spanning set. + /// The provided range cannot be built from the spanning set. ConditionInvalidRange, - /// The provided index is out of bound of the condition. - ConditionIndexOutOfBound, - /// There is an error with one of the token. 
- TokenError(TokenError), + /// The repetition bounds are invalid: the maximum is below the minimum. + InvalidRepetitionBounds(u32, u32), + /// The condition does not match the spanning set it is evaluated against. + IncompatibleSpanningSet, + /// The operation requires a deterministic automaton, and implicit + /// determinization is disabled by the execution profile. + DeterministicAutomatonRequired, } impl fmt::Display for EngineError { @@ -30,33 +28,28 @@ impl fmt::Display for EngineError { match self { EngineError::InvalidCharacterInRegex => write!(f, "Invalid character used in regex."), EngineError::OperationTimeOutError => write!(f, "The operation took too much time."), - EngineError::AutomatonShouldBeDeterministic => write!(f, "The given automaton should be deterministic."), - EngineError::AutomatonHasTooManyStates => write!(f, "The automaton has too many states."), + EngineError::AutomatonHasTooManyStates => { + write!(f, "The automaton has too many states.") + } EngineError::RegexSyntaxError(err) => write!(f, "{err}."), - EngineError::TooMuchTerms(max, got) => write!(f, "Too many terms are used in this operation, the maximum allowed for your plan is {max} and you used {got}."), - EngineError::TokenError(err) => write!(f, "{err}."), - EngineError::ConditionInvalidRange => write!(f, "The provided range can not be built from the spanning set."), - EngineError::ConditionIndexOutOfBound => write!(f, "The provided index is out of bound of the condition."), + EngineError::ConditionInvalidRange => write!( + f, + "The provided range cannot be built from the spanning set." + ), + EngineError::InvalidRepetitionBounds(min, max) => write!( + f, + "The repetition maximum ({max}) is below its minimum ({min})." + ), + EngineError::IncompatibleSpanningSet => write!( + f, + "The condition does not match the spanning set it is evaluated against." 
+ ), + EngineError::DeterministicAutomatonRequired => write!( + f, + "The operation requires a deterministic automaton, and implicit determinization is disabled by the execution profile." + ), } } } impl std::error::Error for EngineError {} - -impl EngineError { - /// Determine if the error is a server error. - /// A server error should not be shown to the end user. - pub fn is_server_error(&self) -> bool { - match self { - EngineError::InvalidCharacterInRegex => false, - EngineError::OperationTimeOutError => false, - EngineError::AutomatonShouldBeDeterministic => true, - EngineError::AutomatonHasTooManyStates => false, - EngineError::RegexSyntaxError(_) => false, - EngineError::TooMuchTerms(_, _) => false, - EngineError::TokenError(_) => false, - EngineError::ConditionInvalidRange => true, - EngineError::ConditionIndexOutOfBound => true, - } - } -} diff --git a/src/execution_profile.rs b/src/execution_profile.rs index 2ae8e2b..8eeb334 100644 --- a/src/execution_profile.rs +++ b/src/execution_profile.rs @@ -1,105 +1,129 @@ -use std::{cell::RefCell, time::SystemTime}; +use std::{ + cell::RefCell, + time::{Duration, Instant}, +}; use crate::error::EngineError; -/// Hold settings about limitations and constraints of operations execution within the engine. +/// Holds settings that constrain how operations execute within the engine. /// -/// To apply the settings on the current thread you need to call the following function: -/// ``` -/// use regexsolver::execution_profile::{ExecutionProfile, ThreadLocalParams}; -/// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// -/// // Store the settings on the current thread. 
-/// ThreadLocalParams::init_profile(&execution_profile); -/// ``` -/// -/// # Examples: +/// # Examples /// /// ## Limiting the number of states /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; /// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); +/// let term1 = Term::from_pattern(".*abcdef.*").unwrap(); +/// let term2 = Term::from_pattern(".*defabc.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 1, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 10, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .max_number_of_states(5) +/// .build(); /// -/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::AutomatonHasTooManyStates, term1.intersection(&[term2]).unwrap_err()); +/// }); /// ``` /// -/// ## Limiting the number of terms +/// ## Limiting the execution time /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; +/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ExecutionProfileBuilder}, error::EngineError}; +/// use std::time::SystemTime; /// -/// let term1 = Term::from_regex(".*abc.*").unwrap(); -/// let term2 = Term::from_regex(".*def.*").unwrap(); -/// let term3 = Term::from_regex(".*hij.*").unwrap(); +/// let term = Term::from_pattern(".*abc.*cdef.*sqdsqf.*").unwrap(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1000, -/// max_number_of_terms: 2, -/// }; -/// 
ThreadLocalParams::init_profile(&execution_profile); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .execution_timeout(5) // 5ms +/// .build(); /// -/// assert_eq!(EngineError::TooMuchTerms(2,3), term1.intersection(&[term2, term3]).unwrap_err()); +/// execution_profile.run(|| { +/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(1000, 1_000_000).unwrap_err()); +/// }); /// ``` /// -/// ## Limiting the execution time +/// ## Disabling implicit determinization +/// +/// [`FastAutomaton`](crate::fast_automaton::FastAutomaton) operations that +/// require a deterministic automaton (`minimize`, `complement`, +/// `difference`, `equivalent`, `subset`, `cardinality`, ...) +/// determinize a non-deterministic input on their own by default. Since +/// subset construction can blow up exponentially, this can be disabled; +/// those operations then fail fast and determinization only happens through +/// an explicit `determinize()` call. [`Term`](crate::Term) methods are not +/// affected: that layer manages the underlying representation itself, so +/// its determinizations count as explicit. +/// /// ``` -/// use regexsolver::{Term, execution_profile::{ExecutionProfile, ThreadLocalParams}, error::EngineError}; -/// use std::time::SystemTime; +/// use regexsolver::CharRange; +/// use regexsolver::fast_automaton::FastAutomaton; +/// use regexsolver::execution_profile::ExecutionProfileBuilder; +/// use regexsolver::error::EngineError; +/// +/// // Two overlapping transitions from the start state: non-deterministic. 
+/// let mut nfa = FastAutomaton::new_empty(); +/// let s1 = nfa.new_state(); +/// let s2 = nfa.new_state(); +/// nfa.add_transition_from_range(0, s1, &CharRange::total()).unwrap(); +/// nfa.add_transition_from_range(0, s2, &CharRange::total()).unwrap(); +/// nfa.accept(s1); /// -/// let term = Term::from_regex(".*abc.*cdef.*sqdsqf.*").unwrap(); +/// let execution_profile = ExecutionProfileBuilder::new() +/// .implicit_determinization(false) +/// .build(); /// -/// let execution_profile = ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: Some(SystemTime::now()), -/// execution_timeout: 1, -/// max_number_of_terms: 50, -/// }; -/// ThreadLocalParams::init_profile(&execution_profile); +/// execution_profile.run(|| { +/// // `minimize` requires a DFA and refuses to determinize on its own. +/// assert_eq!( +/// EngineError::DeterministicAutomatonRequired, +/// nfa.clone().minimize().unwrap_err() +/// ); /// -/// assert_eq!(EngineError::OperationTimeOutError, term.generate_strings(100).unwrap_err()); +/// // Determinizing explicitly is always allowed. +/// let mut dfa = nfa.determinize().unwrap().into_owned(); +/// assert!(dfa.minimize().is_ok()); +/// }); /// ``` +#[derive(Clone, Debug)] pub struct ExecutionProfile { /// The maximum number of states that a non-determinitic finite automaton can hold, this is checked during the convertion of regular expression to automaton. - pub max_number_of_states: usize, - /// Timestamp of when the execution has started, if this value is not set the operations will never timeout. - pub start_execution_time: Option, + max_number_of_states: Option, /// The longest time in milliseconds that an operation execution can last, there are no guaranties that the exact time will be respected. - pub execution_timeout: u128, - /// The maximum number of terms that an operation can have. 
- pub max_number_of_terms: usize, + execution_timeout: Option, + /// The instant after which an [`EngineError::OperationTimeOutError`] is returned. + execution_deadline: Option, + /// Whether [`FastAutomaton`](crate::fast_automaton::FastAutomaton) + /// operations that require a deterministic automaton may determinize a + /// non-deterministic input on their own (the default). When `false`, + /// those operations return + /// [`EngineError::DeterministicAutomatonRequired`] instead, so that the + /// potentially exponential subset construction only ever happens through + /// an explicit `determinize()` call. [`Term`](crate::Term) methods + /// always work: that layer manages the representation itself. + implicit_determinization: bool, +} + +impl PartialEq for ExecutionProfile { + fn eq(&self, other: &ExecutionProfile) -> bool { + self.max_number_of_states == other.max_number_of_states + && self.execution_timeout == other.execution_timeout + && self.implicit_determinization == other.implicit_determinization + } } impl ExecutionProfile { + /// Retrieves the current thread-local execution profile. + pub fn get() -> ExecutionProfile { + ThreadLocalParams::get_execution_profile() + } + /// Assert that `execution_timeout` is not exceeded. /// - /// Return empty if `execution_timeout` is not exceeded or if `start_execution_time` is not set. + /// Return empty if `execution_timeout` is not exceeded. /// /// Return [`EngineError::OperationTimeOutError`] otherwise. 
- pub fn assert_not_timed_out(&self) -> Result<(), EngineError> { - if let Some(start) = self.start_execution_time { - let run_duration = SystemTime::now() - .duration_since(start) - .expect("Time went backwards") - .as_millis(); - - if run_duration > self.execution_timeout { + pub(crate) fn assert_not_timed_out(&self) -> Result<(), EngineError> { + if let Some(execution_deadline) = self.execution_deadline { + if Instant::now() > execution_deadline { Err(EngineError::OperationTimeOutError) } else { Ok(()) @@ -108,186 +132,454 @@ impl ExecutionProfile { Ok(()) } } + + /// Assert that `max_number_of_states` is not exceeded. + /// + /// Return empty if `max_number_of_states` is not exceeded. + /// + /// Return [`EngineError::AutomatonHasTooManyStates`] otherwise. + pub(crate) fn assert_max_number_of_states( + &self, + number_of_states: usize, + ) -> Result<(), EngineError> { + if let Some(max_number_of_states) = self.max_number_of_states + && number_of_states >= max_number_of_states + { + return Err(EngineError::AutomatonHasTooManyStates); + } + Ok(()) + } + + /// Assert that implicit determinization is allowed. + /// + /// Return empty if it is. + /// + /// Return [`EngineError::DeterministicAutomatonRequired`] otherwise. + pub(crate) fn assert_implicit_determinization_allowed(&self) -> Result<(), EngineError> { + if self.implicit_determinization { + Ok(()) + } else { + Err(EngineError::DeterministicAutomatonRequired) + } + } + + /// Returns a copy of this profile with the execution timeout set to + /// `execution_timeout_in_ms` milliseconds. Use these `with_*` methods to + /// derive a variant of an existing profile (e.g. one from + /// [`get`](Self::get)); to build one from scratch, prefer + /// [`ExecutionProfileBuilder`]. See + /// [`ExecutionProfileBuilder::execution_timeout`]. 
+ pub fn with_execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + /// Returns a copy of this profile with the maximum number of states set to + /// `max_number_of_states`. See + /// [`ExecutionProfileBuilder::max_number_of_states`]. + pub fn with_max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + /// Returns a copy of this profile with implicit determinization enabled or + /// disabled. See [`ExecutionProfileBuilder::implicit_determinization`]. + pub fn with_implicit_determinization(mut self, allowed: bool) -> Self { + self.implicit_determinization = allowed; + self + } + + /// Runs the given closure with this profile installed for the current thread, setting its start time to now. + pub fn run(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + let mut execution_profile = self.clone(); + if let Some(execution_timeout) = execution_profile.execution_timeout { + execution_profile.execution_deadline = + Some(Instant::now() + Duration::from_millis(execution_timeout)); + } + + ThreadLocalParams::set_execution_profile(&execution_profile); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } + + /// Runs the closure like [`run`](Self::run), but does not reset the start time. Use this to propagate an already-started profile to worker threads without restarting the clock. + pub fn apply(&self, f: F) -> R + where + F: FnOnce() -> R, + { + let initial_execution_profile = ThreadLocalParams::get_execution_profile(); + + ThreadLocalParams::set_execution_profile(self); + let result = f(); + ThreadLocalParams::set_execution_profile(&initial_execution_profile); + result + } } -/// Hold [`ExecutionProfile`] on the current thread. 
-/// -/// The default [`ExecutionProfile`] is the following: -/// ``` -/// use regexsolver::execution_profile::ExecutionProfile; -/// -/// ExecutionProfile { -/// max_number_of_states: 8192, -/// start_execution_time: None, -/// execution_timeout: 1500, -/// max_number_of_terms: 50, -/// }; -/// ``` -pub struct ThreadLocalParams; +pub struct ExecutionProfileBuilder { + /// The maximum number of states that a non-deterministic finite automaton can hold; this is checked during the conversion of a regular expression to an automaton. + max_number_of_states: Option, + /// The longest time in milliseconds that an operation execution can last; there are no guarantees that the exact time will be respected. + execution_timeout: Option, + /// Whether operations requiring a deterministic automaton may determinize + /// a non-deterministic input on their own. Defaults to `true`. + implicit_determinization: bool, +} +impl Default for ExecutionProfileBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ExecutionProfileBuilder { + /// Creates a builder with no limits set and implicit determinization + /// enabled (i.e. the defaults, equivalent to the ambient profile when none + /// has been installed). + pub fn new() -> Self { + Self { + max_number_of_states: None, + execution_timeout: None, + implicit_determinization: true, + } + } + + /// Sets the longest time, in milliseconds, that an operation may run before + /// it aborts with [`EngineError::OperationTimeOutError`]. Enforcement is + /// best-effort (checked between internal steps), so the exact deadline is + /// not guaranteed. Unset by default (no timeout). + pub fn execution_timeout(mut self, execution_timeout_in_ms: u64) -> Self { + self.execution_timeout = Some(execution_timeout_in_ms); + self + } + + /// Caps the number of states an automaton may reach; operations that would + /// exceed it abort with [`EngineError::AutomatonHasTooManyStates`].
This + /// bounds the exponential blow-up of conversions such as determinization. + /// Unset by default (no cap). + pub fn max_number_of_states(mut self, max_number_of_states: usize) -> Self { + self.max_number_of_states = Some(max_number_of_states); + self + } + + /// Whether [`FastAutomaton`](crate::fast_automaton::FastAutomaton) + /// operations that require a deterministic automaton may determinize a + /// non-deterministic input on their own (the default). When set to + /// `false`, those operations return + /// [`EngineError::DeterministicAutomatonRequired`] instead; explicit + /// `determinize()` calls and [`Term`](crate::Term) methods (which + /// manage the representation themselves) are always allowed. + pub fn implicit_determinization(mut self, allowed: bool) -> Self { + self.implicit_determinization = allowed; + self + } + + /// Builds the [`ExecutionProfile`]. Install it around a unit of work with + /// [`ExecutionProfile::run`]. + pub fn build(self) -> ExecutionProfile { + ExecutionProfile { + max_number_of_states: self.max_number_of_states, + execution_timeout: self.execution_timeout, + execution_deadline: None, + implicit_determinization: self.implicit_determinization, + } + } +} + +struct ThreadLocalParams; impl ThreadLocalParams { thread_local! { - static MAX_NUMBER_OF_STATES: RefCell = const { RefCell::new(8192) }; - static START_EXECUTION_TIME: RefCell> = const { RefCell::new(None) }; - static EXECUTION_TIMEOUT: RefCell = const { RefCell::new(1500) }; - static MAX_NUMBER_OF_TERMS: RefCell = const { RefCell::new(50) }; + static MAX_NUMBER_OF_STATES: RefCell> = const { RefCell::new(None) }; + static EXECUTION_DEADLINE: RefCell> = const { RefCell::new(None) }; + static EXECUTION_TIMEOUT: RefCell> = const { RefCell::new(None) }; + static IMPLICIT_DETERMINIZATION: RefCell = const { RefCell::new(true) }; } /// Store on the current thread [`ExecutionProfile`]. 
- pub fn init_profile(profile: &ExecutionProfile) { + fn set_execution_profile(profile: &ExecutionProfile) { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| { *cell.borrow_mut() = profile.max_number_of_states; }); - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| { - *cell.borrow_mut() = profile.start_execution_time; + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| { + *cell.borrow_mut() = profile.execution_deadline; }); ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| { *cell.borrow_mut() = profile.execution_timeout; }); - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| { - *cell.borrow_mut() = profile.max_number_of_terms; + ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| { + *cell.borrow_mut() = profile.implicit_determinization; }); } - pub fn get_max_number_of_states() -> usize { + fn get_max_number_of_states() -> Option { ThreadLocalParams::MAX_NUMBER_OF_STATES.with(|cell| *cell.borrow()) } - pub fn get_start_execution_time() -> Option { - ThreadLocalParams::START_EXECUTION_TIME.with(|cell| *cell.borrow()) + fn get_execution_deadline() -> Option { + ThreadLocalParams::EXECUTION_DEADLINE.with(|cell| *cell.borrow()) } - pub fn get_execution_timeout() -> u128 { + fn get_execution_timeout() -> Option { ThreadLocalParams::EXECUTION_TIMEOUT.with(|cell| *cell.borrow()) } - pub fn get_max_number_of_terms() -> usize { - ThreadLocalParams::MAX_NUMBER_OF_TERMS.with(|cell| *cell.borrow()) + fn get_implicit_determinization() -> bool { + ThreadLocalParams::IMPLICIT_DETERMINIZATION.with(|cell| *cell.borrow()) } /// Return the [`ExecutionProfile`] stored on the current thread. 
- pub fn get_execution_profile() -> ExecutionProfile { + fn get_execution_profile() -> ExecutionProfile { ExecutionProfile { max_number_of_states: Self::get_max_number_of_states(), - start_execution_time: Self::get_start_execution_time(), + execution_deadline: Self::get_execution_deadline(), execution_timeout: Self::get_execution_timeout(), - max_number_of_terms: Self::get_max_number_of_terms(), + implicit_determinization: Self::get_implicit_determinization(), } } } #[cfg(test)] mod tests { - use crate::{regex::RegularExpression, Term}; + use crate::{Term, regex::RegularExpression}; use super::*; + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } + + #[test] + fn test_execution_get() -> Result<(), String> { + let execution_profile = ExecutionProfileBuilder::new() + .execution_timeout(1000) + .max_number_of_states(8192) + .build(); + + execution_profile.run(|| { + assert_eq!(execution_profile, ExecutionProfile::get()); + }); + + Ok(()) + } + #[test] fn test_execution() -> Result<(), String> { - let execution_profile = ExecutionProfile { - max_number_of_states: 1, - start_execution_time: None, - execution_timeout: 1000, - max_number_of_terms: 10, - }; - ThreadLocalParams::init_profile(&execution_profile); - - let regex = RegularExpression::new("test").unwrap(); - - assert!(regex.to_automaton().is_err()); - assert_eq!( - EngineError::AutomatonHasTooManyStates, - regex.to_automaton().unwrap_err() - ); + ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build() + .run(|| { + let regex = RegularExpression::new("test").unwrap(); + + assert!(regex.to_automaton().is_err()); + assert_eq!( + EngineError::AutomatonHasTooManyStates, + regex.to_automaton().unwrap_err() + ); + }); Ok(()) } + /// A two-way acyclic automaton with overlapping transitions: the + /// smallest shape that is non-deterministic and reaches the + /// determinization paths of every DFA-requiring 
operation. + fn nondeterministic_automaton() -> crate::fast_automaton::FastAutomaton { + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + assert!(!a.is_deterministic()); + a + } + + #[test] + fn test_implicit_determinization_disabled() { + let nfa = nondeterministic_automaton(); + let dfa = nfa.determinize().unwrap().into_owned(); + + ExecutionProfileBuilder::new() + .implicit_determinization(false) + .build() + .run(|| { + let err = EngineError::DeterministicAutomatonRequired; + + // Every DFA-requiring operation refuses to determinize a + // non-deterministic input on its own... + assert_eq!(nfa.clone().minimize().unwrap_err(), err); + assert_eq!(nfa.clone().complement().unwrap_err(), err); + assert_eq!(dfa.difference(&nfa).unwrap_err(), err); + assert_eq!(nfa.equivalent(&dfa).unwrap_err(), err); + assert_eq!(dfa.subset(&nfa).unwrap_err(), err); + assert_eq!(nfa.cardinality().unwrap_err(), err); + + // ...but operations that work on NFAs directly are unaffected + // (difference only determinizes the subtrahend)... + assert!(nfa.difference(&dfa).is_ok()); + + // ...deterministic inputs keep working... + assert!(dfa.clone().minimize().is_ok()); + assert!(dfa.clone().complement().is_ok()); + assert!(dfa.cardinality().is_ok()); + assert!(dfa.equivalent(&dfa).is_ok()); + + // ...and explicit determinization is always allowed. + assert!(nfa.determinize().is_ok()); + }); + } + + /// The `implicit_determinization` knob targets direct `FastAutomaton` + /// usage; `Term` manages the underlying representation itself, so its + /// whole public API must keep working when the knob is off. 
+ #[test] + fn test_term_api_works_without_implicit_determinization() { + let term = Term::from_automaton(nondeterministic_automaton()); + let other = Term::from_pattern("a*").unwrap(); + + ExecutionProfileBuilder::new() + .implicit_determinization(false) + .build() + .run(|| { + // Methods that need a DFA internally determinize on Term's + // behalf (an explicit choice of the Term layer)... + assert!(term.difference(&other).is_ok()); + assert!(other.difference(&term).is_ok()); + assert!(term.complement().is_ok()); + assert!(term.equivalent(&other).is_ok()); + assert!(term.subset(&other).is_ok()); + assert!(other.subset(&term).is_ok()); + assert!(term.is_total().is_ok()); + assert!(term.cardinality().is_ok()); + assert!(term.minimize().is_ok()); + assert!(term.generate_strings(5, 0).is_ok()); + + // ...and the rest of the API never needed one. + assert!(term.concat(std::slice::from_ref(&other)).is_ok()); + assert!(term.union(std::slice::from_ref(&other)).is_ok()); + assert!(term.intersection(std::slice::from_ref(&other)).is_ok()); + assert!(term.repeat(0..=2).is_ok()); + assert!(term.is_empty().is_ok()); + assert!(term.is_empty_string().is_ok()); + let _ = term.length(); + let _ = term.to_regex(); + let _ = term.to_pattern(); + assert!(term.to_automaton().is_ok()); + + // The override is scoped: direct FastAutomaton usage stays + // gated afterwards. + assert_eq!( + nondeterministic_automaton().minimize().unwrap_err(), + EngineError::DeterministicAutomatonRequired + ); + }); + } + + #[test] + fn test_implicit_determinization_default() { + let nfa = nondeterministic_automaton(); + + // Without the profile knob the historical behavior is unchanged. 
+ assert!(nfa.clone().minimize().is_ok()); + assert!(nfa.clone().complement().is_ok()); + assert!(nfa.cardinality().is_ok()); + assert!(nfa.equivalent(&nfa.clone()).is_ok()); + } + #[test] fn test_execution_timeout_generate_strings() -> Result<(), String> { - let term = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + + let execution_timeout_in_ms = 10; + let start_time = Instant::now(); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term.generate_strings(100, 1_000_000).unwrap_err() + ); + + let run_duration = Instant::now().duration_since(start_time).as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= (execution_timeout_in_ms + 50) as u128); + }); - let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term.generate_strings(100).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); Ok(()) } #[test] fn test_execution_timeout_difference() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + let term1 = Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = 
Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz").unwrap(); + + let execution_timeout_in_ms = 0; + let start_time = Instant::now(); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.difference(&term2).unwrap_err() + ); + + let run_duration = Instant::now().duration_since(start_time).as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= (execution_timeout_in_ms + 1000) as u128); + }); - let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.difference(&term2).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); Ok(()) } - #[test] + /*#[test] fn test_execution_timeout_intersection() -> Result<(), String> { - let term1 = Term::from_regex(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); - let term2 = Term::from_regex(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term1 = 
Term::from_pattern(".*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let term2 = Term::from_pattern(".*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdsqd.*sqdsqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz.*abc.*def.*qdqd.*qsdsqdsqdz").unwrap(); + let execution_timeout_in_ms = 100; let start_time = SystemTime::now(); - let execution_profile = ExecutionProfile { - max_number_of_states: 8192, - start_execution_time: Some(start_time), - execution_timeout: 100, - max_number_of_terms: 50, - }; - ThreadLocalParams::init_profile(&execution_profile); - - assert_eq!( - EngineError::OperationTimeOutError, - term1.intersection(&[term2]).unwrap_err() - ); - - let run_duration = SystemTime::now() - .duration_since(start_time) - .expect("Time went backwards") - .as_millis(); - - println!("{run_duration}"); - assert!(run_duration <= execution_profile.execution_timeout + 50); + ExecutionProfileBuilder::new() + .execution_timeout(execution_timeout_in_ms) + .build() + .run(|| { + assert_eq!( + EngineError::OperationTimeOutError, + term1.intersection(&[term2]).unwrap_err() + ); + + let run_duration = SystemTime::now() + .duration_since(start_time) + .expect("Time went backwards") + .as_millis(); + + println!("{run_duration}"); + assert!(run_duration <= execution_timeout_in_ms + 100); + }); + Ok(()) - } + }*/ } diff --git a/src/fast_automaton/analyze/cardinality.rs b/src/fast_automaton/analyze/cardinality.rs index 04ea226..ebc1b9d 100644 --- a/src/fast_automaton/analyze/cardinality.rs +++ b/src/fast_automaton/analyze/cardinality.rs @@ -3,20 +3,51 @@ use std::hash::BuildHasherDefault; use super::*; 
impl FastAutomaton { - pub fn get_cardinality(&self) -> Option> { + /// Returns the cardinality of the automaton (i.e., the number of possible matched strings). + /// + /// Works on non-deterministic automata too: acyclic NFAs are determinized + /// internally (the only fallible step, subject to the + /// [`crate::execution_profile::ExecutionProfile`] budget, and rejected + /// with [`EngineError::DeterministicAutomatonRequired`] when the profile + /// disables implicit determinization). + /// + /// As in [`length`](Self::length), only cycles **on accepting + /// paths** make the count infinite: cycles among dead or unreachable + /// states don't add a single matched string. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] + pub fn cardinality(&self) -> Result, EngineError> { if self.is_empty() { - return Some(Cardinality::Integer(0)); - } else if self.cyclic || self.is_total() { - return Some(Cardinality::Infinite); - } else if !self.deterministic { - return None; + return Ok(Cardinality::Integer(0)); + } else if self.is_total() { + return Ok(Cardinality::Infinite); } - let topologically_sorted_states = self.topological_sorted_states(); - if topologically_sorted_states.is_none() { - return Some(Cardinality::Infinite); + // Only states on an accepting path (reachable from the start AND + // able to reach an accept) contribute strings; everything else is + // excluded from both the cycle check and the count. + let live = self.live_states(); + let relevant: IntSet = self + .forward_reachable_states() + .intersection(&live) + .copied() + .collect(); + + // A cycle among relevant states means infinitely many strings. + // `topological_sorted_states` returns `None` exactly when that + // subgraph is cyclic and needs no determinism, so this also covers + // cyclic non-deterministic inputs. 
+ let topologically_sorted_states = match self.topological_sorted_states(&relevant) { + None => return Ok(Cardinality::Infinite), + Some(states) => states, + }; + + // The finite count below assumes deterministic (single-path) + // transitions. Determinizing an automaton with a finite language + // yields one whose relevant subgraph is acyclic too, so the + // recursion takes the deterministic path on the second call. + if !self.is_deterministic() { + return self.determinize_implicit()?.cardinality(); } - let topologically_sorted_states = topologically_sorted_states.unwrap(); let len = self.transitions.len(); let mut distances: IntMap = @@ -27,20 +58,21 @@ impl FastAutomaton { let current_distance = *distances.entry(state).or_insert(0); if let Some(to_states) = self.transitions.get(state) { for (to_state, condition) in to_states { + if !relevant.contains(to_state) { + continue; + } if let Some(distance) = current_distance.checked_mul( condition - .get_cardinality(&self.spanning_set) + .cardinality(&self.spanning_set) .expect("It should be possible to get the cardinality of a condition."), - ) { - if let Some(new_distance) = - distances.get(to_state).unwrap_or(&0).checked_add(distance) - { - distances.insert(*to_state, new_distance); - continue; - } + ) && let Some(new_distance) = + distances.get(to_state).unwrap_or(&0).checked_add(distance) + { + distances.insert(*to_state, new_distance); + continue; } - return Some(Cardinality::BigInteger); + return Ok(Cardinality::BigInteger); } } } @@ -52,22 +84,33 @@ impl FastAutomaton { temp_cardinality = add; continue; } - return Some(Cardinality::BigInteger); + return Ok(Cardinality::BigInteger); } } - Some(Cardinality::Integer(temp_cardinality)) + Ok(Cardinality::Integer(temp_cardinality)) } - fn topological_sorted_states(&self) -> Option> { - let len = self.get_number_of_states(); + /// Kahn's algorithm restricted to the `relevant` subgraph (transitions + /// with empty conditions can't be taken and are ignored). 
Returns `None` + /// when that subgraph contains a cycle. + fn topological_sorted_states(&self, relevant: &IntSet) -> Option> { + let len = relevant.len(); let mut in_degree: IntMap = IntMap::with_capacity_and_hasher(len, BuildHasherDefault::default()); let mut queue = VecDeque::with_capacity(len); let mut order = Vec::with_capacity(len); - for from_state in &self.transitions_vec() { - in_degree.entry(*from_state).or_insert(0); - for to_state in self.transitions_from_state_iter(from_state) { + let successors = |from_state: State| { + self.transitions_from(from_state) + .filter(|(condition, to_state)| { + !condition.is_empty() && relevant.contains(to_state) + }) + .map(|(_, to_state)| *to_state) + }; + + for &from_state in relevant { + in_degree.entry(from_state).or_insert(0); + for to_state in successors(from_state) { *in_degree.entry(to_state).or_insert(0) += 1; } } @@ -80,7 +123,7 @@ impl FastAutomaton { while let Some(from_state) = queue.pop_front() { order.push(from_state); - for to_state in self.transitions_from_state_iter(&from_state) { + for to_state in successors(from_state) { *in_degree.entry(to_state).or_default() -= 1; if in_degree[&to_state] == 0 { @@ -96,3 +139,53 @@ impl FastAutomaton { } } } + +#[cfg(test)] +mod tests { + use crate::cardinality::Cardinality; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + // Regression (found by the brute-force enumeration proptest): the cycle + // check used to run over ALL states, so a cycle among dead states made + // the cardinality of a finite language Infinite. Only cycles on + // accepting paths count. 
+ #[test] + fn get_cardinality_ignores_dead_cycles() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.accept(0); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.add_transition(s2, s1, &cond); + // s1, s2 can't reach an accept → language is {""} only. + + assert_eq!(a.cardinality().unwrap(), Cardinality::Integer(1)); + } + + // Regression: `cardinality` used to `assert!` determinism and panic + // on acyclic NFAs (the only nondeterministic inputs that reach the finite + // count; cyclic ones return Infinite earlier). It now determinizes + // internally. + #[test] + fn get_cardinality_determinizes_acyclic_nfas() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + // Two overlapping transitions from the start: nondeterministic, but + // both lead to accepting states after exactly one character. + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + assert!(!a.is_deterministic()); + + let cardinality = a.cardinality().unwrap(); + let expected = a.determinize().unwrap().cardinality().unwrap(); + assert_eq!(cardinality, expected); + assert!(matches!(cardinality, Cardinality::Integer(n) if n > 0)); + } +} diff --git a/src/fast_automaton/analyze/equivalence.rs b/src/fast_automaton/analyze/equivalence.rs index d81294c..e005d7e 100644 --- a/src/fast_automaton/analyze/equivalence.rs +++ b/src/fast_automaton/analyze/equivalence.rs @@ -3,21 +3,27 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn is_equivalent_of(&self, other: &FastAutomaton) -> Result { + /// Returns `true` if both automata accept the same language. 
+ /// + /// Non-deterministic operands are determinized internally, unless the + /// execution profile disables implicit determinization, in which case + /// [`EngineError::DeterministicAutomatonRequired`] is returned. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] + pub fn equivalent(&self, other: &FastAutomaton) -> Result { if self.is_empty() != other.is_empty() && self.is_total() != other.is_total() { return Ok(false); } else if self == other { return Ok(true); } - let mut other_complement = other.determinize()?; + let mut other_complement = other.determinize_implicit()?.into_owned(); other_complement.complement()?; if self.has_intersection(&other_complement)? { return Ok(false); } - let mut self_complement = self.determinize()?; + let mut self_complement = self.determinize_implicit()?.into_owned(); self_complement.complement()?; Ok(!self_complement.has_intersection(other)?) 
@@ -43,26 +49,26 @@ mod tests { false, ); - let regex_1 = RegularExpression::new("cd").unwrap(); - let regex_2 = RegularExpression::new("cd").unwrap(); + let regex_1 = RegularExpression::parse("cd", false).unwrap(); + let regex_2 = RegularExpression::parse("cd", false).unwrap(); assert_equivalent(&regex_1, &regex_2, true); - let regex_1 = RegularExpression::new("test.*other").unwrap(); - let regex_2 = RegularExpression::new("test.*othew").unwrap(); + let regex_1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex_2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex_1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex_2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("[0]").unwrap(); - let regex_2 = RegularExpression::new("[01]").unwrap(); + let regex_1 = RegularExpression::parse("[0]", false).unwrap(); + let regex_2 = RegularExpression::parse("[01]", false).unwrap(); assert_equivalent(&regex_1, &regex_2, false); - let regex_1 = RegularExpression::new("(b+a+)*").unwrap(); - let regex_2 = RegularExpression::new("(b[a-b]*a)?").unwrap(); + let regex_1 = RegularExpression::parse("(b+a+)*", false).unwrap(); + let regex_2 = RegularExpression::parse("(b[a-b]*a)?", false).unwrap(); assert_equivalent(&regex_1, &regex_2, true); Ok(()) @@ -71,14 +77,11 @@ mod tests { fn assert_equivalent(regex_1: &RegularExpression, regex_2: &RegularExpression, expected: bool) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_equivalent_of(&automaton_1).unwrap()); + assert!(automaton_1.equivalent(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, 
automaton_2.is_equivalent_of(&automaton_2).unwrap()); + assert!(automaton_2.equivalent(&automaton_2).unwrap()); - assert_eq!( - expected, - automaton_1.is_equivalent_of(&automaton_2).unwrap() - ); + assert_eq!(expected, automaton_1.equivalent(&automaton_2).unwrap()); } } diff --git a/src/fast_automaton/analyze/length.rs b/src/fast_automaton/analyze/length.rs index 70eccbd..87b471d 100644 --- a/src/fast_automaton/analyze/length.rs +++ b/src/fast_automaton/analyze/length.rs @@ -1,67 +1,186 @@ use super::*; impl FastAutomaton { - pub fn get_length(&self) -> (Option, Option) { - if self.is_empty() { + /// Returns the minimum and maximum length of matched strings. + /// + /// Cycles are only treated as "language-extending" if they sit on an + /// accepting path. Cycles among dead states (states that can't reach any + /// accept) don't extend the language and therefore don't make the max + /// infinite. + /// + /// Runs in O(V + E): the minimum is a BFS distance; the maximum is a + /// longest path over the subgraph of states lying on accepting paths, + /// which is unbounded exactly when that subgraph has a cycle (any such + /// cycle can be pumped). + #[must_use] + pub fn length(&self) -> (Option, Option) { + // States that can reach an accept state. If the start state can't, + // the language is empty. + let live = self.live_states(); + if !live.contains(&self.start_state) { return (None, None); - } else if self.is_total(){ - return (Some(0), None); } + // BFS from the start over live states only; every state on an + // accepting path is live, so this loses no accepting path. BFS visits + // in non-decreasing depth, hence the first accept hit is the minimum. + // The visited set (reachable ∩ live) is exactly the subgraph relevant + // for the maximum. 
let mut min = None; - let mut is_infinite = false; - - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); - worklist.push_back((self.start_state, 0, IntSet::default())); - - while let Some(element) = worklist.pop_front() { - let state = element.0; - let length = element.1; - let mut seen = element.2; - if min.is_some() && length > min.unwrap() { - continue; - } - if self.accept_states.contains(&state) && (min.is_none() || length < min.unwrap()) { + let mut visited = IntSet::default(); + let mut worklist = VecDeque::with_capacity(self.number_of_states()); + visited.insert(self.start_state); + worklist.push_back((self.start_state, 0u32)); + while let Some((state, length)) = worklist.pop_front() { + if min.is_none() && self.accept_states.contains(&state) { min = Some(length); } - seen.insert(state); - - for to_state in self.transitions_from_state_iter(&state) { - if to_state == state || seen.contains(&to_state) { - is_infinite = true; + for (condition, to_state) in self.transitions_from(state) { + if condition.is_empty() || !live.contains(to_state) { continue; } - worklist.push_back((to_state, length + 1, seen.clone())); + if visited.insert(*to_state) { + worklist.push_back((*to_state, length + 1)); + } } } - if is_infinite || min.is_none() { - return (min, None); + // Longest path via Kahn's algorithm on the visited subgraph. In the + // acyclic case the topological order covers all visited states and + // every state's longest distance is final when it is dequeued. 
+ let mut in_degree: IntMap = IntMap::default(); + for &from in &visited { + in_degree.entry(from).or_insert(0); + for (condition, to_state) in self.transitions_from(from) { + if condition.is_empty() || !visited.contains(to_state) { + continue; + } + *in_degree.entry(*to_state).or_insert(0) += 1; + } } - let mut max = None; - - worklist.clear(); - worklist.push_back((self.start_state, 0, IntSet::default())); + let mut queue: VecDeque = in_degree + .iter() + .filter(|&(_, &degree)| degree == 0) + .map(|(&state, _)| state) + .collect(); - while let Some(element) = worklist.pop_back() { - let state = element.0; - let length = element.1; - let mut seen = element.2; - if self.accept_states.contains(&state) && (max.is_none() || length > max.unwrap()) { - max = Some(length); + let mut longest: IntMap = IntMap::default(); + longest.insert(self.start_state, 0); + let mut max = None; + let mut processed = 0usize; + while let Some(from) = queue.pop_front() { + processed += 1; + let length = *longest.get(&from).unwrap_or(&0); + if self.accept_states.contains(&from) { + max = Some(max.map_or(length, |m: u32| m.max(length))); } - seen.insert(state); - - for to_state in self.transitions_from_state_iter(&state) { - if to_state == state || seen.contains(&to_state) { - max = None; - break; + for (condition, to_state) in self.transitions_from(from) { + if condition.is_empty() || !visited.contains(to_state) { + continue; + } + longest + .entry(*to_state) + .and_modify(|l| *l = (*l).max(length + 1)) + .or_insert(length + 1); + let degree = in_degree + .get_mut(to_state) + .expect("every visited target was counted above"); + *degree -= 1; + if *degree == 0 { + queue.push_back(*to_state); } - worklist.push_back((to_state, length + 1, seen.clone())); } } + if processed != visited.len() { + // A cycle on an accepting path: matched strings can be pumped + // arbitrarily, the maximum is unbounded. 
+ return (min, None); + } + (min, max) } -} \ No newline at end of file +} + +#[cfg(test)] +mod tests { + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + + // Regression: `length` used to set `max = None` on any cycle + // reachable from start, even dead cycles among non-accepting states that + // cannot reach an accept. Such cycles don't extend the language; the + // max must remain finite. Now fixed by filtering branches to the live + // (co-reachable-from-accept) subgraph. + #[test] + fn length_handles_dead_cycle() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.accept(0); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.add_transition(s2, s1, &cond); + // s1, s2 not accepting → language is {""} only. + + let (min, max) = a.length(); + assert_eq!(min, Some(0), "min length of {{\"\"}} is 0"); + assert_eq!( + max, + Some(0), + "max length of {{\"\"}} is 0; got {max:?} (cycle is dead, shouldn't extend the language)" + ); + } + + #[test] + fn length_finite_and_infinite() { + // Chain 0 -> 1 -> 2, accepts {0, 2}: min 0, max 2. + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(s1, s2, &cond); + a.accept(0); + a.accept(s2); + assert_eq!(a.length(), (Some(0), Some(2))); + + // Live cycle 0 <-> 1, accept {1}: min 1, max unbounded. + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(s1, 0, &cond); + a.accept(s1); + assert_eq!(a.length(), (Some(1), None)); + } + + // Regression: `length` used to enumerate paths with a cloned `seen` + // set per branch (exponential time and memory on branching DAGs). 
A chain + // of diamonds has 2^k paths; the linear algorithm must handle it + // instantly. + #[test] + fn length_linear_on_branching_dag() { + const DIAMONDS: usize = 24; + + let mut a = FastAutomaton::new_empty(); + let cond = Condition::total(a.spanning_set()); + let mut current = 0; + for _ in 0..DIAMONDS { + let upper = a.new_state(); + let lower = a.new_state(); + let next = a.new_state(); + a.add_transition(current, upper, &cond); + a.add_transition(current, lower, &cond); + a.add_transition(upper, next, &cond); + a.add_transition(lower, next, &cond); + current = next; + } + a.accept(current); + + let expected = 2 * DIAMONDS as u32; + assert_eq!(a.length(), (Some(expected), Some(expected))); + } +} diff --git a/src/fast_automaton/analyze/mod.rs b/src/fast_automaton/analyze/mod.rs index 56f0884..feeb9aa 100644 --- a/src/fast_automaton/analyze/mod.rs +++ b/src/fast_automaton/analyze/mod.rs @@ -10,27 +10,159 @@ mod length; mod subset; impl FastAutomaton { - #[inline] + /// Checks if the automaton matches the empty language. + /// + /// Sound and complete: works on NFAs and non-minimal automata without + /// requiring determinization or minimization. O(V + E) worst case, with + /// O(1) fast paths for the common cases. pub fn is_empty(&self) -> bool { - self.accept_states.is_empty() + if self.accept_states.is_empty() { + return true; + } + if self.accept_states.contains(&self.start_state) { + return false; + } + if self.minimal { + // A minimal automaton with at least one accept state has a + // non-empty language (minimization prunes dead accepts). + return false; + } + + // Forward BFS from `start_state`; stop on first accept hit. 
+ let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + + while let Some(s) = worklist.pop_front() { + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + if self.accept_states.contains(to) { + return false; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + } + true } - #[inline] + /// Checks if the automaton matches all possible strings. + /// + /// Sound and complete for **deterministic** automata: a DFA's language + /// equals Σ\* iff every reachable state is accepting AND its outgoing + /// conditions union to Σ. For NFAs this is sound but conservative: + /// alternative paths may cover a character that no single reachable + /// state covers, so callers that need an exact answer on an NFA should + /// determinize first. + /// + /// O(V + E) plus one condition-union per outgoing transition. pub fn is_total(&self) -> bool { - if self.accept_states.contains(&self.start_state) { - if let Some(condition) = self.transitions[self.start_state].get(&self.start_state) { - return condition.is_total(); + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + + while let Some(s) = worklist.pop_front() { + if !self.accept_states.contains(&s) { + return false; + } + let mut covered = Condition::empty(&self.spanning_set); + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + covered = covered.union(cond); + if visited.insert(*to) { + worklist.push_back(*to); + } + } + if !covered.is_total() { + return false; } } - false + true } - pub fn get_reacheable_states(&self) -> IntSet { + /// Checks if the automaton only matches the empty string `""`. 
+ /// + /// Sound and complete on any automaton (DFA or NFA): the language equals + /// `{""}` iff start is accepting AND no state reachable from start by at + /// least one non-empty transition is, or can reach, an accept state. + /// O(V + E). + pub fn is_empty_string(&self) -> bool { + if !self.accept_states.contains(&self.start_state) { + return false; + } + + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + + // Seed with states reachable in exactly one non-empty step from start. + for (cond, to) in self.transitions_from(self.start_state) { + if cond.is_empty() { + continue; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + + while let Some(s) = worklist.pop_front() { + if self.accept_states.contains(&s) { + return false; + } + for (cond, to) in self.transitions_from(s) { + if cond.is_empty() { + continue; + } + if visited.insert(*to) { + worklist.push_back(*to); + } + } + } + true + } + + /// Returns the states reachable **from the start state** by following + /// non-empty transitions (the start state is always included). + /// + /// This is forward reachability. Contrast with [`Self::live_states`], + /// which returns the states that can **reach an accept state** + /// (co-reachability). + pub(crate) fn forward_reachable_states(&self) -> IntSet { + let mut visited = IntSet::default(); + let mut worklist = VecDeque::new(); + visited.insert(self.start_state); + worklist.push_back(self.start_state); + while let Some(s) = worklist.pop_front() { + for (condition, to_state) in self.transitions_from(s) { + if condition.is_empty() { + continue; + } + if visited.insert(*to_state) { + worklist.push_back(*to_state); + } + } + } + visited + } + + /// Returns the "live" (co-reachable) states: those that can **reach an + /// accept state** by following non-empty transitions. Computed by a reverse + /// traversal from the accept states. 
+ /// + /// This is co-reachability; note it is *not* the set of states reachable + /// from the start state. + pub fn live_states(&self) -> IntSet { let mut states_map: IntMap> = IntMap::with_capacity_and_hasher(self.transitions.len(), BuildHasherDefault::default()); - for from_state in self.transitions_iter() { - for (to_state, transition) in self.transitions_from_state_enumerate_iter(&from_state) { - if transition.is_empty() { + for from_state in self.states() { + for (condition, to_state) in self.transitions_from(from_state) { + if condition.is_empty() { continue; } match states_map.entry(*to_state) { @@ -61,9 +193,53 @@ impl FastAutomaton { live } - pub fn get_ranges(&self) -> Result, EngineError> { - self.spanning_set.get_spanning_ranges().map(|range| { - Condition::from_range(range, &self.spanning_set) - }).collect() + /// Returns one [`Condition`] per base of the spanning set, including the + /// "rest" range when it is non-empty. + /// + /// The bases must partition the whole alphabet Σ: subset construction + /// ([`determinize`](Self::determinize)) and Hopcroft partitioning + /// ([`minimize`](Self::minimize)) iterate them and would otherwise silently + /// drop transitions whose condition lies in the "rest" range. (For a + /// spanning set with an empty rest this is exactly the spanning ranges, so + /// well-formed automata are unaffected.) 
+ pub fn spanning_bases(&self) -> Result, EngineError> { + self.spanning_set + .spanning_ranges_with_rest() + .iter() + .map(|range| Condition::from_range(range, &self.spanning_set)) + .collect() + } +} + +#[cfg(test)] +mod tests { + + use crate::fast_automaton::FastAutomaton; + + #[test] + fn test_empty() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty()); + assert!(!FastAutomaton::new_empty_string().is_empty()); + assert!(FastAutomaton::new_empty().is_empty()); + + Ok(()) + } + + #[test] + fn test_empty_string() -> Result<(), String> { + assert!(!FastAutomaton::new_total().is_empty_string()); + assert!(FastAutomaton::new_empty_string().is_empty_string()); + assert!(!FastAutomaton::new_empty().is_empty_string()); + + Ok(()) + } + + #[test] + fn test_total() -> Result<(), String> { + assert!(FastAutomaton::new_total().is_total()); + assert!(!FastAutomaton::new_empty_string().is_total()); + assert!(!FastAutomaton::new_empty().is_total()); + + Ok(()) } } diff --git a/src/fast_automaton/analyze/subset.rs b/src/fast_automaton/analyze/subset.rs index 5705fc2..da9e4d0 100644 --- a/src/fast_automaton/analyze/subset.rs +++ b/src/fast_automaton/analyze/subset.rs @@ -3,14 +3,25 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { - pub fn is_subset_of(&self, other: &FastAutomaton) -> Result { + /// Returns `true` if all strings accepted by `self` are also accepted by `other`. + /// + /// A non-deterministic `other` is determinized internally, unless the + /// execution profile disables implicit determinization, in which case + /// [`EngineError::DeterministicAutomatonRequired`] is returned. 
+ #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] + pub fn subset(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_total() || self == other { return Ok(true); - } else if other.is_empty() || self.is_total() { + } else if other.is_empty() { return Ok(false); + } else if self.is_total() { + // self ⊆ other iff Σ* ⊆ other iff other = Σ*. We already failed + // the cheap `other.is_total()` check above; that check is sound + // but conservative on NFAs, so retry on the determinized form. + return Ok(other.determinize_implicit()?.is_total()); } - let mut other = other.determinize()?; + let mut other = other.determinize_implicit()?.into_owned(); other.complement()?; Ok(!self.has_intersection(&other)?) @@ -38,33 +49,33 @@ mod tests { true, ); - let regex1 = RegularExpression::new("test.*other").unwrap(); - let regex2 = RegularExpression::new("test.*othew").unwrap(); + let regex1 = RegularExpression::parse("test.*other", false).unwrap(); + let regex2 = RegularExpression::parse("test.*othew", false).unwrap(); assert_subset(&regex1, &regex2, false, false); - let regex1 = RegularExpression::new("test.{0,50}other").unwrap(); - let regex2 = RegularExpression::new("test.{0,49}other").unwrap(); + let regex1 = RegularExpression::parse("test.{0,50}other", false).unwrap(); + let regex2 = RegularExpression::parse("test.{0,49}other", false).unwrap(); assert_subset(&regex1, &regex2, false, true); - let regex1 = RegularExpression::new("(abc|def)").unwrap(); - let regex2 = RegularExpression::new("(abc|def|xyz)").unwrap(); + let regex1 = RegularExpression::parse("(abc|def)", false).unwrap(); + let regex2 = RegularExpression::parse("(abc|def|xyz)", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("[0]").unwrap(); - let regex2 = 
RegularExpression::new("[01]").unwrap(); + let regex1 = RegularExpression::parse("[0]", false).unwrap(); + let regex2 = RegularExpression::parse("[01]", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("a.*b.*c.*").unwrap(); - let regex2 = RegularExpression::new("a.*b.*").unwrap(); + let regex1 = RegularExpression::parse("a.*b.*c.*", false).unwrap(); + let regex2 = RegularExpression::parse("a.*b.*", false).unwrap(); assert_subset(&regex1, &regex2, true, false); - let regex1 = RegularExpression::new("1..").unwrap(); - let regex2 = RegularExpression::new("...").unwrap(); + let regex1 = RegularExpression::parse("1..", false).unwrap(); + let regex2 = RegularExpression::parse("...", false).unwrap(); assert_subset(&regex1, &regex2, true, false); @@ -79,18 +90,12 @@ mod tests { ) { println!("{regex_1} and {regex_2}"); let automaton_1 = regex_1.to_automaton().unwrap(); - assert_eq!(true, automaton_1.is_subset_of(&automaton_1).unwrap()); + assert!(automaton_1.subset(&automaton_1).unwrap()); let automaton_2 = regex_2.to_automaton().unwrap(); - assert_eq!(true, automaton_2.is_subset_of(&automaton_2).unwrap()); + assert!(automaton_2.subset(&automaton_2).unwrap()); - assert_eq!( - expected_1_2, - automaton_1.is_subset_of(&automaton_2).unwrap() - ); - assert_eq!( - expected_2_1, - automaton_2.is_subset_of(&automaton_1).unwrap() - ); + assert_eq!(expected_1_2, automaton_1.subset(&automaton_2).unwrap()); + assert_eq!(expected_2_1, automaton_2.subset(&automaton_1).unwrap()); } } diff --git a/src/fast_automaton/builder.rs b/src/fast_automaton/builder.rs index b6cf50b..840e41e 100644 --- a/src/fast_automaton/builder.rs +++ b/src/fast_automaton/builder.rs @@ -5,6 +5,7 @@ use crate::error::EngineError; use super::*; impl FastAutomaton { + /// Creates an automaton that matches the empty language.
#[inline] pub fn new_empty() -> Self { Self { @@ -15,86 +16,52 @@ impl FastAutomaton { removed_states: IntSet::default(), spanning_set: SpanningSet::new_empty(), deterministic: true, - cyclic: false, + minimal: true, } } + /// Creates an automaton that only matches the empty string `""`. #[inline] pub fn new_empty_string() -> Self { let mut automaton = Self::new_empty(); automaton.accept(automaton.start_state); + automaton.minimal = true; automaton } + /// Creates an automaton that matches all possible strings. #[inline] pub fn new_total() -> Self { let mut automaton: FastAutomaton = Self::new_empty(); automaton.spanning_set = SpanningSet::new_total(); automaton.accept(automaton.start_state); - automaton.add_transition_to(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.add_transition(0, 0, &Condition::total(&automaton.spanning_set)); + automaton.minimal = true; automaton } - #[inline] - pub fn make_empty(&mut self) { - self.apply_model(&Self::new_empty()) - } - - #[inline] - pub fn make_total(&mut self) { - self.apply_model(&Self::new_total()) - } - - pub fn make_from_range(range: &Range) -> Result { + /// Creates an automaton that matches one of the characters in the given [`CharRange`]. 
+ pub fn new_from_range(range: &CharRange) -> Self { let mut automaton = Self::new_empty(); if range.is_empty() { - return Ok(automaton); + return automaton; } let new_state = automaton.new_state(); - let spanning_set = SpanningSet::compute_spanning_set(&[range.clone()]); - let condition = Condition::from_range(range, &spanning_set)?; + let spanning_set = SpanningSet::compute_spanning_set(std::slice::from_ref(range)); + let condition = + Condition::from_range(range, &spanning_set).expect("The spanning set should be valid"); automaton.spanning_set = spanning_set; - automaton.add_transition_to(0, new_state, &condition); + automaton.add_transition(0, new_state, &condition); automaton.accept(new_state); - Ok(automaton) - } - - pub fn apply_new_spanning_set( - &mut self, - new_spanning_set: &SpanningSet, - ) -> Result<(), EngineError> { - if new_spanning_set == &self.spanning_set { - return Ok(()); - } - let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; - for from_state in &self.transitions_vec() { - for to_state in self.transitions_from_state(from_state) { - match self.transitions[*from_state].entry(to_state) { - Entry::Occupied(mut o) => { - o.insert(condition_converter.convert(o.get())?); - } - Entry::Vacant(_) => {} - }; - } - } - self.spanning_set = new_spanning_set.clone(); - Ok(()) - } - - #[inline] - pub fn apply_model(&mut self, model: &FastAutomaton) { - self.transitions = model.transitions.clone(); - self.start_state = model.start_state; - self.accept_states = model.accept_states.clone(); - self.removed_states = model.removed_states.clone(); - self.spanning_set = model.spanning_set.clone(); - self.deterministic = model.deterministic; - self.cyclic = model.cyclic; + automaton.minimal = true; + automaton } + /// Creates a new state and returns its identifier. 
#[inline] pub fn new_state(&mut self) -> State { + self.minimal = false; if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); *new_state @@ -104,13 +71,65 @@ impl FastAutomaton { } } + /// Marks the provided state as an accepting (final) state. #[inline] pub fn accept(&mut self, state: State) { self.assert_state_exists(state); + self.minimal = false; self.accept_states.insert(state); } - pub fn add_transition_to(&mut self, from_state: State, to_state: State, new_cond: &Condition) { + /// Marks the provided state as a non-accepting state. + #[inline] + pub fn unaccept(&mut self, state: State) { + self.assert_state_exists(state); + self.minimal = false; + self.accept_states.remove(&state); + } + + /// Creates a new transition with the given condition; the condition must follow the automaton’s current spanning set. + /// + /// If you don't want to deal with conditions and spanning sets, use + /// [`add_transition_from_range`](Self::add_transition_from_range), which + /// handles the bookkeeping for you. + /// + /// This method accepts a [`Condition`] rather than a raw character set. To build a [`Condition`], call: + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::{condition::Condition, spanning_set::SpanningSet}; + /// # let range = CharRange::total(); + /// # let spanning_set = SpanningSet::new_total(); + /// Condition::from_range(&range, &spanning_set); + /// ``` + /// where `spanning_set` is the automaton's current [`SpanningSet`]. The [`CharRange`] you pass must be fully covered by that spanning set. If it isn't, you have two options: + /// + /// 1. Merge an existing spanning set with another: + /// ```rust + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let old_set = SpanningSet::new_total(); + /// # let other_set = SpanningSet::new_total(); + /// let new_set = SpanningSet::merge(&old_set, &other_set); + /// ``` + /// + /// 2. 
Recompute from a list of ranges: + /// ```rust + /// # use regexsolver::CharRange; + /// # use regexsolver::fast_automaton::spanning_set::SpanningSet; + /// # let range_set1 = CharRange::total(); + /// # let range_set2 = CharRange::total(); + /// let new_set = SpanningSet::compute_spanning_set(&[range_set1, range_set2]); + /// ``` + /// + /// After constructing `new_set`, apply it to the automaton: + /// ```rust + /// # use regexsolver::fast_automaton::{FastAutomaton, spanning_set::SpanningSet}; + /// # let mut fast_automaton = FastAutomaton::new_total(); + /// # let new_set = SpanningSet::new_total(); + /// fast_automaton.apply_new_spanning_set(&new_set); + /// ``` + /// + /// This design allows us to perform unions, intersections, and complements of transition conditions in O(1) time, but it does add some complexity to automaton construction. For more details, you can check [this article](https://alexvbrdn.me/post/optimizing-transition-conditions-automaton-representation). + pub fn add_transition(&mut self, from_state: State, to_state: State, new_cond: &Condition) { self.assert_state_exists(from_state); if from_state != to_state { self.assert_state_exists(to_state); @@ -119,9 +138,10 @@ impl FastAutomaton { return; } + self.minimal = false; if self.deterministic { let mut deterministic = true; - for (state, condition) in self.transitions_from_state_enumerate_iter(&from_state) { + for (condition, state) in self.transitions_from(from_state) { if state == &to_state { continue; } @@ -147,22 +167,130 @@ impl FastAutomaton { }; } - pub fn add_epsilon(&mut self, from_state: State, to_state: State) { + /// Adds a transition labeled with the given character range, taking care + /// of the spanning-set bookkeeping. 
+ /// + /// This is the convenient counterpart to + /// [`add_transition`](Self::add_transition): the range is converted to a + /// [`Condition`] automatically, and when it is not exactly expressible + /// in the automaton's current spanning set, the spanning set is extended + /// and every existing condition is re-projected first. + /// + /// An empty range matches no character, so no transition is added. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::CharRange; + /// use regexsolver::fast_automaton::FastAutomaton; + /// use regex_charclass::char::Char; + /// + /// let mut automaton = FastAutomaton::new_empty(); + /// let s1 = automaton.new_state(); + /// automaton.accept(s1); + /// + /// let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c')); + /// automaton.add_transition_from_range(0, s1, &a_to_c).unwrap(); + /// + /// assert!(automaton.is_match("b")); + /// assert!(!automaton.is_match("d")); + /// ``` + pub fn add_transition_from_range( + &mut self, + from_state: State, + to_state: State, + range: &CharRange, + ) -> Result<(), EngineError> { + if range.is_empty() { + return Ok(()); + } + + // Fast path: the range is exactly expressible in the current + // spanning set. `Condition::from_range` alone cannot tell us that + // (it silently drops partially-covered bases), so round-trip the + // condition to check exactness. + if let Ok(condition) = Condition::from_range(range, &self.spanning_set) + && condition.to_range(&self.spanning_set)? == *range + { + self.add_transition(from_state, to_state, &condition); + return Ok(()); + } + + // The range is not (fully) covered: extend the spanning set, + // re-project the existing conditions, then add. 
+ let new_spanning_set = + self.spanning_set + .merge(&SpanningSet::compute_spanning_set(std::slice::from_ref( + range, + ))); + self.apply_new_spanning_set(&new_spanning_set)?; + + let condition = Condition::from_range(range, &self.spanning_set)?; + self.add_transition(from_state, to_state, &condition); + Ok(()) + } + + /// Adds a transition, but refuses if it would turn a DFA into an NFA. + /// + /// On `Err(DeterminismLost)` the automaton is left untouched; on `Ok`, + /// the transition has been added and `is_deterministic()` still holds + /// (provided it held before the call). This is the opt-in strict + /// counterpart to [`add_transition`](Self::add_transition). + pub fn try_add_transition( + &mut self, + from_state: State, + to_state: State, + new_cond: &Condition, + ) -> Result<(), super::DeterminismLost> { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + if new_cond.is_empty() { + return Ok(()); + } + if self.deterministic { + for (condition, state) in self.transitions_from(from_state) { + if *state == to_state { + continue; + } + if condition.has_intersection(new_cond) { + return Err(super::DeterminismLost); + } + } + } + self.add_transition(from_state, to_state, new_cond); + Ok(()) + } + + /// Adds an epsilon transition by eagerly folding `to_state`'s **current** + /// transitions (and acceptance) into `from_state`. + /// + /// This is a snapshot: transitions added to `to_state` *afterwards* are + /// not propagated retroactively. When building automata incrementally, + /// add epsilon transitions last. 
+ pub fn add_epsilon_transition(&mut self, from_state: State, to_state: State) { if from_state == to_state { return; } self.assert_state_exists(from_state); self.assert_state_exists(to_state); + + self.minimal = false; + if self.accept_states.contains(&to_state) { self.accept_states.insert(from_state); } - let transitions_to: Vec<_> = self.transitions_from_state_into_iter(&to_state).collect(); + let transitions_to: Vec<_> = self + .transitions_from(to_state) + .map(|(cond, to_state)| (cond.clone(), *to_state)) + .collect(); - for (state, cond) in transitions_to { + for (cond, state) in transitions_to { if self.deterministic { let mut deterministic = true; - for (s, c) in self.transitions_from_state_enumerate_iter(&from_state) { + for (c, s) in self.transitions_from(from_state) { if state == *s { continue; } @@ -188,14 +316,29 @@ impl FastAutomaton { } } + /// Removes the transition between the two provided states if it exists. + pub fn remove_transition(&mut self, from_state: State, to_state: State) { + self.assert_state_exists(from_state); + if from_state != to_state { + self.assert_state_exists(to_state); + } + + self.minimal = false; + + self.transitions_in + .entry(to_state) + .or_default() + .remove(&from_state); + self.transitions[from_state].remove(&to_state); + } + + /// Removes the state and its connected transitions; panics if it's a start state. pub fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state { - panic!( - "Can not remove the state {}, it is still used as start state.", - state - ); + panic!("Can not remove the state {state}, it is still used as start state."); } + self.minimal = false; self.accept_states.remove(&state); self.transitions_in.remove(&state); if self.transitions.len() - 1 == state { @@ -220,17 +363,16 @@ impl FastAutomaton { } } + /// Removes the given states and their connected transitions; panics if any is a start state. 
pub fn remove_states(&mut self, states: &IntSet) { self.accept_states.retain(|e| !states.contains(e)); + self.minimal = false; let mut states_to_remove = Vec::with_capacity(states.len()); for &state in states { if self.start_state == state { - panic!( - "Can not remove the state {}, it is still used as start state.", - state - ); + panic!("Can not remove the state {state}, it is still used as start state."); } if self.transitions.len() - 1 == state { self.transitions.remove(state); @@ -260,13 +402,228 @@ impl FastAutomaton { transitions.remove(state); } } + + for state in &states_to_remove { + self.transitions_in.remove(state); + } + for predecessors in self.transitions_in.values_mut() { + for state in &states_to_remove { + predecessors.remove(state); + } + } + } + + /// Recompute a minimal spanning set for the automaton and apply it. + pub fn recompute_minimal_spanning_set(&mut self) -> Result<(), EngineError> { + let mut ranges = Vec::with_capacity(self.number_of_states()); + + for state in self.states() { + for (condition, _) in self.transitions_from(state) { + ranges.push(condition.to_range(&self.spanning_set)?); + } + } + + let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); + + self.apply_new_spanning_set(&new_spanning_set) + } + + /// Applies the provided spanning set and projects all existing conditions onto it. 
+ pub fn apply_new_spanning_set( + &mut self, + new_spanning_set: &SpanningSet, + ) -> Result<(), EngineError> { + if new_spanning_set == &self.spanning_set { + return Ok(()); + } + let condition_converter = ConditionConverter::new(&self.spanning_set, new_spanning_set)?; + for &from_state in &self.states_vec() { + for to_state in self.direct_states_vec(from_state) { + match self.transitions[from_state].entry(to_state) { + Entry::Occupied(mut o) => { + o.insert(condition_converter.convert(o.get())?); + } + Entry::Vacant(_) => {} + }; + } + } + self.spanning_set = new_spanning_set.clone(); + Ok(()) + } + + #[inline] + pub(crate) fn make_empty(&mut self) { + self.apply_model(&Self::new_empty()) + } + + #[inline] + pub(crate) fn make_total(&mut self) { + self.apply_model(&Self::new_total()) + } + + #[inline] + pub(crate) fn make_empty_string(&mut self) { + self.apply_model(&Self::new_empty_string()) + } + + #[inline] + pub(crate) fn apply_model(&mut self, model: &FastAutomaton) { + self.transitions = model.transitions.clone(); + self.transitions_in = model.transitions_in.clone(); + self.start_state = model.start_state; + self.accept_states = model.accept_states.clone(); + self.removed_states = model.removed_states.clone(); + self.spanning_set = model.spanning_set.clone(); + self.deterministic = model.deterministic; + self.minimal = model.minimal; } } #[cfg(test)] mod tests { + use crate::IntSet; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; use crate::regex::RegularExpression; + fn rng(a: char, b: char) -> crate::CharRange { + use regex_charclass::char::Char; + crate::CharRange::new_from_range(Char::new(a)..=Char::new(b)) + } + + #[test] + fn small_mutators_and_queries() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + a.add_transition_from_range(0, s1, &rng('a', 'a')).unwrap(); + a.accept(s1); + + assert!(a.is_accepted(s1)); + assert!(a.has_transition(0, s1)); + 
assert!(a.condition(0, s1).is_some()); + assert_eq!(a.in_degree(s1), 1); + assert_eq!(a.out_degree(0), 1); + assert!(a.is_match("a")); + + // try_add_transition: refuses determinism-breaking additions and + // leaves the automaton untouched on Err. + let condition_a = Condition::from_range(&rng('a', 'a'), a.spanning_set()).unwrap(); + assert!(a.is_deterministic()); + assert!(a.try_add_transition(0, s2, &condition_a).is_err()); + assert!(a.is_deterministic()); + assert!(!a.has_transition(0, s2)); + // ...but accepts disjoint conditions. + let condition_not_a = condition_a.complement(); + a.try_add_transition(0, s2, &condition_not_a).unwrap(); + assert!(a.is_deterministic()); + assert!(a.has_transition(0, s2)); + + // unaccept flips membership and the language. + a.unaccept(s1); + assert!(!a.is_accepted(s1)); + assert!(!a.is_match("a")); + a.accept(s1); + assert!(a.is_match("a")); + + // remove_transition removes the edge and updates queries. + a.remove_transition(0, s1); + assert!(!a.has_transition(0, s1)); + assert!(a.condition(0, s1).is_none()); + assert_eq!(a.in_degree(s1), 0); + assert!(!a.is_match("a")); + } + + #[test] + fn add_transition_from_range_extends_the_spanning_set() { + let mut automaton = FastAutomaton::new_empty(); + let s1 = automaton.new_state(); + let s2 = automaton.new_state(); + automaton.accept(s2); + + // Both ranges extend the (initially empty) spanning set. + automaton + .add_transition_from_range(0, s1, &rng('a', 'c')) + .unwrap(); + automaton + .add_transition_from_range(s1, s2, &rng('x', 'z')) + .unwrap(); + + assert!(automaton.is_match("ax")); + assert!(automaton.is_match("cz")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("x")); + + // An exactly-covered range takes the fast path: same spanning set. 
+ let before = automaton.spanning_set().clone(); + automaton + .add_transition_from_range(0, s1, &rng('x', 'z')) + .unwrap(); + assert_eq!(&before, automaton.spanning_set()); + assert!(automaton.is_match("zx")); + + // An empty range adds nothing. + automaton + .add_transition_from_range(0, s2, &crate::CharRange::empty()) + .unwrap(); + assert!(!automaton.is_match("a")); + } + + // Regression guard: `Condition::from_range` silently drops + // partially-covered bases, so a naive "convert, merge only on error" + // implementation would truncate [a-e] to the existing [a-c] base. The + // exactness round-trip must force a spanning-set refinement instead. + #[test] + fn add_transition_from_range_is_exact_on_partial_coverage() { + let mut automaton = FastAutomaton::new_empty(); + let s1 = automaton.new_state(); + automaton.accept(s1); + + automaton + .add_transition_from_range(0, s1, &rng('a', 'c')) + .unwrap(); + // Contains the whole [a-c] base but only part of the rest. + automaton + .add_transition_from_range(0, s1, &rng('a', 'e')) + .unwrap(); + + for accepted in ["a", "b", "c", "d", "e"] { + assert!(automaton.is_match(accepted), "{accepted:?} must match"); + } + assert!(!automaton.is_match("f")); + } + + // Regression: `remove_states` used to skip the `transitions_in` cleanup + // that the single-state variant `remove_state` performs (drop entries + // keyed by removed states; purge them from surviving predecessor sets). + // Without that cleanup, `in_degree` of removed states stayed stale and + // any caller (repeat, concat, union, difference, to_regex) would see + // wrong values. 
+ #[test] + fn remove_states_cleans_transitions_in() { + let mut a = FastAutomaton::new_empty(); + let s1 = a.new_state(); + let s2 = a.new_state(); + let cond = Condition::total(a.spanning_set()); + a.add_transition(0, s1, &cond); + a.add_transition(0, s2, &cond); + a.accept(s1); + a.accept(s2); + + assert_eq!(a.in_degree(s1), 1); + assert_eq!(a.in_degree(s2), 1); + + let mut to_remove = IntSet::default(); + to_remove.insert(s1); + a.remove_states(&to_remove); + + // After removing s1, its in_degree should report 0 (or, equivalently, + // queries on a removed state should be a clean no-op). Before the fix + // it still reported the pre-removal count. + assert_eq!(a.in_degree(s1), 0, "in_degree of removed state should be 0"); + assert_eq!(a.in_degree(s2), 1); + } + #[test] fn test_regex_build_deterministic_automaton() -> Result<(), String> { assert_regex_build_deterministic_automaton("...", true); @@ -278,10 +635,10 @@ mod tests { } fn assert_regex_build_deterministic_automaton(regex: &str, deterministic: bool) { - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - assert_eq!(deterministic, automaton.is_determinitic()); + assert_eq!(deterministic, automaton.is_deterministic()); } } diff --git a/src/fast_automaton/condition/converter.rs b/src/fast_automaton/condition/converter.rs index 89bb123..f371a7b 100644 --- a/src/fast_automaton/condition/converter.rs +++ b/src/fast_automaton/condition/converter.rs @@ -15,7 +15,12 @@ pub struct ConditionConverter<'a, 'b> { impl<'a, 'b> ConditionConverter<'a, 'b> { /// Build a converter to project [`Condition`] from `from_spanning_set` to `to_spanning_set`. /// - /// Currently this method does not check that the provided [`SpanningSet`] are actually convertible.
+ /// Two directions are legitimate: refinement (a merged spanning set + /// before a binary operation) and coarsening (a recomputed minimal + /// spanning set, where bases no transition uses fold into the rest). The + /// pair is therefore not validated here; instead [`convert`](Self::convert) + /// asserts in debug builds that each projection preserves the + /// condition's character range. pub fn new( from_spanning_set: &'a SpanningSet, to_spanning_set: &'b SpanningSet, @@ -23,7 +28,7 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { let mut to_base_map = IntMap::with_capacity(to_spanning_set.spanning_ranges_with_rest_len()); for (i, base) in to_spanning_set - .get_spanning_ranges_with_rest() + .spanning_ranges_with_rest() .into_iter() .enumerate() { @@ -31,8 +36,8 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { } let mut equivalence_map: Vec> = - Vec::with_capacity(from_spanning_set.get_number_of_spanning_ranges() + 1); - for from_base in from_spanning_set.get_spanning_ranges_with_rest().iter() { + Vec::with_capacity(from_spanning_set.number_of_spanning_ranges() + 1); + for from_base in from_spanning_set.spanning_ranges_with_rest().iter() { let mut index = Vec::with_capacity(1); for (i, to_base) in &to_base_map { if from_base == to_base || from_base.has_intersection(to_base) { @@ -54,61 +59,73 @@ impl<'a, 'b> ConditionConverter<'a, 'b> { /// Project the given [`Condition`] from `from_spanning_set` to `to_spanning_set`. /// - /// If `from_spanning_set` is not convertible to `to_spanning_set` or if the given [`Condition`] is not based on `from_spanning_set`, - /// the resulting [`Condition`] will not have any relevance. + /// Returns [`EngineError::IncompatibleSpanningSet`] if the given + /// [`Condition`] was not built over `from_spanning_set`. 
pub fn convert(&self, condition: &Condition) -> Result<Condition, EngineError> { + if condition.0.len() != self.from_spanning_set.spanning_ranges_with_rest_len() { + return Err(EngineError::IncompatibleSpanningSet); + } let mut new_condition = Condition::empty(self.to_spanning_set); for (from_index, to_indexes) in self.equivalence_map.iter().enumerate() { - if let Some(has) = condition.0.get(from_index) { - if has && !to_indexes.is_empty() { - to_indexes.iter().for_each(|&to_index| { - new_condition.0.set(to_index, true); - }); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + if condition.0.get(from_index) && !to_indexes.is_empty() { + to_indexes.iter().for_each(|&to_index| { + new_condition.0.set(to_index, true); + }); } } + // The one invariant every legitimate use (refining and coarsening + // alike) must uphold: the projection denotes the same character set. + // A violation means a condition referenced a base the target spanning + // set cannot express, causing silent language corruption in release. + debug_assert_eq!( + condition + .to_range(self.from_spanning_set) + .expect("the length was checked above"), + new_condition + .to_range(self.to_spanning_set) + .expect("the condition was built over the target spanning set"), + "the projection changed the condition's character range" + ); + Ok(new_condition) } /// Returns `from_spanning_set`. - pub fn get_from_spanning_set(&self) -> &'a SpanningSet { + pub fn from_spanning_set(&self) -> &'a SpanningSet { self.from_spanning_set } /// Returns `to_spanning_set`.
- pub fn get_to_spanning_set(&self) -> &'b SpanningSet { + pub fn to_spanning_set(&self) -> &'b SpanningSet { self.to_spanning_set } } #[cfg(test)] mod tests { + use crate::CharRange; use regex_charclass::{char::Char, irange::range::AnyRange}; - use crate::Range; - use super::*; - fn get_from_spanning_set() -> SpanningSet { + fn from_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) } - fn get_to_spanning_set() -> SpanningSet { + fn to_spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\0')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), + CharRange::new_from_range(Char::new('\0')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{20}')..=Char::new('\u{22}')), ]; SpanningSet::compute_spanning_set(&ranges) @@ -116,8 +133,8 @@ mod tests { #[test] fn test_convert() -> Result<(), String> { - let from_spanning_set = get_from_spanning_set(); - let to_spanning_set = get_to_spanning_set(); + let from_spanning_set = from_spanning_set(); + let to_spanning_set = to_spanning_set(); let converter = ConditionConverter::new(&from_spanning_set, 
&to_spanning_set).unwrap(); @@ -127,7 +144,7 @@ mod tests { let total = Condition::total(&from_spanning_set); assert!(converter.convert(&total).unwrap().is_total()); - let range = Range::new_from_range(Char::new('\0')..=Char::new('\u{2}')); + let range = CharRange::new_from_range(Char::new('\0')..=Char::new('\u{2}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -138,7 +155,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); + let range = CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')); let condition = Condition::from_range(&range, &from_spanning_set).unwrap(); assert_eq!( range, @@ -149,7 +166,7 @@ mod tests { .unwrap() ); - let range = Range::new_from_ranges(&[ + let range = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), AnyRange::from(Char::new('\u{9}')..=Char::new('\u{9}')), ]); diff --git a/src/fast_automaton/condition/fast_bit_vec/mod.rs b/src/fast_automaton/condition/fast_bit_vec/mod.rs index bbf4376..631c7d8 100644 --- a/src/fast_automaton/condition/fast_bit_vec/mod.rs +++ b/src/fast_automaton/condition/fast_bit_vec/mod.rs @@ -7,8 +7,8 @@ pub struct FastBitVec { impl std::fmt::Display for FastBitVec { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { for i in 0..self.n { - let bit = if self.get(i).unwrap() { 1 } else { 0 }; - write!(f, "{}", bit)?; + let bit = if self.get(i) { 1 } else { 0 }; + write!(f, "{bit}")?; } Ok(()) } @@ -17,7 +17,11 @@ impl std::fmt::Display for FastBitVec { impl FastBitVec { #[inline] pub fn from_elem(n: usize, bit: bool) -> Self { - let nblocks = if n % 64 == 0 { n / 64 } else { n / 64 + 1 }; + let nblocks = if n.is_multiple_of(64) { + n / 64 + } else { + n / 64 + 1 + }; let bits = vec![if bit { !0_u64 } else { 0_u64 }; nblocks]; let mut bit_vec = FastBitVec { bits, n }; bit_vec.fix_last_block(); @@ -48,13 +52,11 @@ impl FastBitVec { } #[inline] - 
pub fn get(&self, i: usize) -> Option<bool> { - if i >= self.n { - return None; - } + pub fn get(&self, i: usize) -> bool { + assert!(i < self.n, "The provided bit index is out of bound."); let w = i / 64; let b = i % 64; - self.bits.get(w).map(|&block| (block & (1 << b)) != 0) + (self.bits[w] & (1 << b)) != 0 } #[inline] @@ -78,8 +80,22 @@ impl FastBitVec { self.fix_last_block(); } + /// The binary operations combine blocks pairwise with `zip`, which would + /// silently truncate to the shorter operand if two bitvectors built over + /// different spanning sets were ever combined, producing a wrong + /// language instead of a loud failure. Catch that in debug builds (and + /// therefore in every test run). + #[inline] + fn assert_same_len(&self, other: &Self) { + debug_assert_eq!( + self.n, other.n, + "conditions built over different spanning sets cannot be combined" + ); + } + #[inline] pub fn union(&mut self, other: &Self) { + self.assert_same_len(other); for (a, b) in self.bits.iter_mut().zip(&other.bits) { let w = *a | b; *a = w; @@ -88,6 +104,7 @@ impl FastBitVec { #[inline] pub fn intersection(&mut self, other: &Self) { + self.assert_same_len(other); for (a, b) in self.bits.iter_mut().zip(&other.bits) { let w = *a & b; *a = w; @@ -96,6 +113,7 @@ impl FastBitVec { #[inline] pub fn has_intersection(&self, other: &Self) -> bool { + self.assert_same_len(other); for (a, b) in self.bits.iter().zip(&other.bits) { if *a & b != 0 { return true; @@ -123,11 +141,11 @@ impl FastBitVec { (!0) >> ((64 - bits % 64) % 64) } - pub fn get_bits(&self) -> Vec<bool> { - let mut hot_bits = Vec::with_capacity(self.n); + pub fn bits(&self) -> Vec<bool> { + let mut bits = Vec::with_capacity(self.n); for i in 0..self.n { - hot_bits.push(self.get(i).unwrap()); + bits.push(self.get(i)); } - hot_bits + bits } } diff --git a/src/fast_automaton/condition/mod.rs b/src/fast_automaton/condition/mod.rs index da9c2b8..4ac466b 100644 --- a/src/fast_automaton/condition/mod.rs +++
b/src/fast_automaton/condition/mod.rs @@ -1,16 +1,15 @@ use std::hash::Hash; -use crate::Range; use fast_bit_vec::FastBitVec; -use regex_charclass::{char::Char, CharacterClass}; +use regex_charclass::{CharacterClass, char::Char}; -use crate::error::EngineError; +use crate::{CharRange, error::EngineError}; use super::spanning_set::SpanningSet; pub mod converter; mod fast_bit_vec; -/// Contains the condition of a transition in a [`crate::FastAutomaton`] +/// Represents the condition of a transition in a [`crate::FastAutomaton`]. #[derive(Clone, PartialEq, Eq, Debug)] pub struct Condition(FastBitVec); @@ -27,6 +26,8 @@ impl Hash for Condition { } impl Condition { + /// Returns the condition that matches no character, sized for + /// `spanning_set` (every bit cleared). #[inline] pub fn empty(spanning_set: &SpanningSet) -> Self { Self(FastBitVec::from_elem( @@ -35,6 +36,8 @@ impl Condition { )) } + /// Returns the condition that matches every character, sized for + /// `spanning_set` (every bit set). #[inline] pub fn total(spanning_set: &SpanningSet) -> Self { Self(FastBitVec::from_elem( @@ -43,7 +46,14 @@ impl Condition { )) } - pub fn from_range(range: &Range, spanning_set: &SpanningSet) -> Result { + /// Converts a [`CharRange`] to a `Condition` sized for `spanning_set`. + /// + /// Returns [`EngineError::ConditionInvalidRange`] if the range is not + /// expressible in the current spanning set (no base is fully contained in + /// `range`). In that case, extend the spanning set first with + /// [`SpanningSet::merge`] or [`SpanningSet::compute_spanning_set`], apply + /// it with [`crate::fast_automaton::FastAutomaton::apply_new_spanning_set`], then retry. 
+ pub fn from_range(range: &CharRange, spanning_set: &SpanningSet) -> Result { if range.is_empty() { return Ok(Self::empty(spanning_set)); } else if range.is_total() { @@ -52,11 +62,7 @@ impl Condition { let mut cond = Self::empty(spanning_set); - for (i, base) in spanning_set - .get_spanning_ranges_with_rest() - .iter() - .enumerate() - { + for (i, base) in spanning_set.spanning_ranges_with_rest().iter().enumerate() { if range.contains_all(base) { cond.0.set(i, true); } @@ -69,40 +75,51 @@ impl Condition { Ok(cond) } - pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { - let mut range = Range::empty(); - - for (i, base) in spanning_set - .get_spanning_ranges_with_rest() - .iter() - .enumerate() - { - if let Some(has) = self.0.get(i) { - if has { - range = range.union(base); - } - } else { - return Err(EngineError::ConditionIndexOutOfBound); + /// Converts this `Condition` back to the [`CharRange`] it represents, + /// evaluated against `spanning_set`. + /// + /// Returns [`EngineError::IncompatibleSpanningSet`] if this condition's + /// bit width does not match `spanning_set` (they were built from different + /// spanning sets). + pub fn to_range(&self, spanning_set: &SpanningSet) -> Result { + // A condition only carries meaning relative to the spanning set it + // was built from. Evaluating it against a differently-sized one used + // to panic (too short) or silently drop bits (too long). + if self.0.len() != spanning_set.spanning_ranges_with_rest_len() { + return Err(EngineError::IncompatibleSpanningSet); + } + + let mut range = CharRange::empty(); + + for (i, base) in spanning_set.spanning_ranges_with_rest().iter().enumerate() { + if self.0.get(i) { + range = range.union(base); } } Ok(range) } + /// Returns the condition matching characters in `self` or `other` (bitwise + /// OR). Both must share the same spanning set. 
#[inline] - pub fn union(&self, cond: &Condition) -> Self { + pub fn union(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - new_cond.0.union(&cond.0); + new_cond.0.union(&other.0); new_cond } + /// Returns the condition matching characters in both `self` and `other` + /// (bitwise AND). Both must share the same spanning set. #[inline] - pub fn intersection(&self, cond: &Condition) -> Self { + pub fn intersection(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - new_cond.0.intersection(&cond.0); + new_cond.0.intersection(&other.0); new_cond } + /// Returns the condition matching exactly the characters `self` does not, + /// relative to its spanning set. #[inline] pub fn complement(&self) -> Self { let mut new_cond = self.clone(); @@ -110,19 +127,26 @@ impl Condition { new_cond } + /// Returns the condition matching characters in `self` but not in `other` + /// (bitwise AND-NOT). Both must share the same spanning set. #[inline] - pub fn difference(&self, cond: &Condition) -> Self { + pub fn difference(&self, other: &Condition) -> Self { let mut new_cond = self.clone(); - let subtrahend = cond.complement(); + let subtrahend = other.complement(); new_cond.0.intersection(&subtrahend.0); new_cond } + /// Returns `true` if `self` and `other` share at least one character (their + /// intersection is non-empty). Both must share the same spanning set. #[inline] - pub fn has_intersection(&self, cond: &Condition) -> bool { - self.0.has_intersection(&cond.0) + pub fn has_intersection(&self, other: &Condition) -> bool { + self.0.has_intersection(&other.0) } + /// Returns `true` if the condition matches `character` (a Unicode scalar + /// value), evaluated against `spanning_set`. Values that are not valid + /// scalar values never match. #[inline] pub fn has_character( &self, @@ -136,23 +160,30 @@ impl Condition { } } + /// Returns `true` if the condition matches no character. 
#[inline] pub fn is_empty(&self) -> bool { self.0.empty() } + /// Returns `true` if the condition matches every character. #[inline] pub fn is_total(&self) -> bool { self.0.total() } + /// Returns the number of characters the condition matches, evaluated + /// against `spanning_set`. #[inline] - pub fn get_cardinality(&self, spanning_set: &SpanningSet) -> Result { + pub fn cardinality(&self, spanning_set: &SpanningSet) -> Result { Ok(self.to_range(spanning_set)?.get_cardinality()) } - pub fn get_bits(&self) -> Vec { - self.0.get_bits() + /// Returns the condition as a vector of bits, one per range of the spanning + /// set it was built against (the rest range first, when present). + #[inline] + pub fn binary_representation(&self) -> Vec { + self.0.bits() } } @@ -163,52 +194,95 @@ mod tests { use super::*; - fn get_spanning_set() -> SpanningSet { + fn spanning_set() -> SpanningSet { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; SpanningSet::compute_spanning_set(&ranges) } - fn get_test_cases_range() -> Vec { + fn get_test_cases_range() -> Vec { vec![ - Range::empty(), - Range::total(), - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_ranges(&[ + CharRange::empty(), + CharRange::total(), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{2}')), AnyRange::from(Char::new('\u{4}')..=Char::new('\u{6}')), ]), - 
Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ] } + // Regression: a condition evaluated against a spanning set it was not + // built from used to panic (when too short) or silently drop bits (when + // too long); it now reports the incompatibility. + #[test] + fn to_range_rejects_incompatible_spanning_set() { + let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )]); + let large = spanning_set(); + + let condition = Condition::total(&small); + assert_eq!( + condition.to_range(&large), + Err(EngineError::IncompatibleSpanningSet) + ); + + let condition = Condition::total(&large); + assert_eq!( + condition.to_range(&small), + Err(EngineError::IncompatibleSpanningSet) + ); + } + + // Regression: `ConditionConverter::convert` used to panic on a condition + // that was not built over its source spanning set. + #[test] + fn convert_rejects_incompatible_condition() { + let small = SpanningSet::compute_spanning_set(&[CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )]); + let merged = small.merge(&spanning_set()); + let converter = ConditionConverter::new(&small, &merged).unwrap(); + + let foreign = Condition::total(&merged); + assert_eq!( + converter.convert(&foreign), + Err(EngineError::IncompatibleSpanningSet) + ); + } + #[test] fn test_empty_total() -> Result<(), String> { - let spanning_set = get_spanning_set(); + let spanning_set = spanning_set(); let empty = Condition::empty(&spanning_set); //println!("{empty}"); assert!(empty.is_empty()); - assert_eq!(vec![false, false, false, false], empty.get_bits()); + assert_eq!( + vec![false, false, false, false], + empty.binary_representation() + ); let total = Condition::total(&spanning_set); //println!("{total}"); assert!(total.is_total()); - assert_eq!(vec![true, true, true, true], total.get_bits()); + assert_eq!(vec![true, true, true, true], 
total.binary_representation()); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); assert_eq!(empty, total.complement()); @@ -218,20 +292,20 @@ mod tests { let empty = Condition::empty(&spanning_set); let total = Condition::total(&spanning_set); - assert_eq!(Range::empty(), empty.to_range(&spanning_set).unwrap()); - assert_eq!(Range::total(), total.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::empty(), empty.to_range(&spanning_set).unwrap()); + assert_eq!(CharRange::total(), total.to_range(&spanning_set).unwrap()); assert_eq!( empty, - Condition::from_range(&Range::empty(), &spanning_set).unwrap() + Condition::from_range(&CharRange::empty(), &spanning_set).unwrap() ); - assert_eq!(vec![false], empty.get_bits()); + assert_eq!(vec![false], empty.binary_representation()); assert_eq!( total, - Condition::from_range(&Range::total(), &spanning_set).unwrap() + Condition::from_range(&CharRange::total(), &spanning_set).unwrap() ); - assert_eq!(vec![true], total.get_bits()); + assert_eq!(vec![true], total.binary_representation()); assert_eq!(empty, total.complement()); assert_eq!(total, empty.complement()); @@ -241,7 +315,7 @@ mod tests { #[test] fn test_from_to_range() -> Result<(), String> { - let spanning_set = get_spanning_set(); + let spanning_set = spanning_set(); for range in get_test_cases_range() { assert_range_convertion_to_range(&range, &spanning_set); @@ -251,7 +325,7 @@ mod tests { Ok(()) } - fn 
assert_range_convertion_to_range(range: &Range, spanning_set: &SpanningSet) { + fn assert_range_convertion_to_range(range: &CharRange, spanning_set: &SpanningSet) { let condition = Condition::from_range(range, spanning_set).unwrap(); let range_from_condition = condition.to_range(spanning_set).unwrap(); assert_eq!(range, &range_from_condition); @@ -263,14 +337,14 @@ mod tests { #[test] fn test_project_to() -> Result<(), String> { - let current_spanning_set = get_spanning_set(); + let current_spanning_set = spanning_set(); let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), - Range::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), - Range::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), - Range::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{1}')), + CharRange::new_from_range(Char::new('\u{2}')..=Char::new('\u{2}')), + CharRange::new_from_range(Char::new('\u{4}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{5}')..=Char::new('\u{6}')), + CharRange::new_from_range(Char::new('\u{9}')..=Char::new('\u{9}')), ]; let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); let condition_converter = @@ -295,7 +369,7 @@ mod tests { } fn assert_project_to( - range: &Range, + range: &CharRange, currently_used_spanning_set: &SpanningSet, newly_used_spanning_set: &SpanningSet, condition_converter: &ConditionConverter, @@ -320,7 +394,7 @@ mod tests { #[test] fn test_union_intersection_complement() -> Result<(), String> { - let used_characters = get_spanning_set(); + let used_characters = spanning_set(); for range_1 in get_test_cases_range() { for range_2 in get_test_cases_range() { @@ -347,20 +421,20 @@ mod tests { } fn assert_union_intersection_complement( - range_1: &Range, - range_2: &Range, + range_1: &CharRange, + range_2: &CharRange, used_characters: &SpanningSet, 
) { let condition_1 = Condition::from_range(range_1, used_characters).unwrap(); let condition_2 = Condition::from_range(range_2, used_characters).unwrap(); assert_eq!( - Condition::empty(&used_characters), + Condition::empty(used_characters), condition_1.intersection(&condition_1.complement()) ); assert_eq!( - Condition::empty(&used_characters), + Condition::empty(used_characters), condition_2.intersection(&condition_2.complement()) ); @@ -377,14 +451,14 @@ mod tests { #[test] fn test_1() -> Result<(), String> { let ranges = vec![ - Range::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), - Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), - Range::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), + CharRange::new_from_range(Char::new('\u{0}')..=Char::new('\u{9}')), + CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')), + CharRange::new_from_range(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), ]; let spanning_set = SpanningSet::compute_spanning_set(&ranges); println!("{:?}", spanning_set); - let range1 = Range::new_from_ranges(&[ + let range1 = CharRange::new_from_ranges(&[ AnyRange::from(Char::new('\u{0}')..=Char::new('\u{9}')), AnyRange::from(Char::new('\u{B}')..=Char::new('\u{63}')), AnyRange::from(Char::new('\u{65}')..=Char::new('\u{10FFFF}')), @@ -392,7 +466,7 @@ mod tests { let condition1 = Condition::from_range(&range1, &spanning_set).unwrap(); assert_eq!(range1, condition1.to_range(&spanning_set).unwrap()); - let range2 = Range::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); + let range2 = CharRange::new_from_range(Char::new('\u{B}')..=Char::new('\u{63}')); let condition2 = Condition::from_range(&range2, &spanning_set).unwrap(); assert_eq!(range2, condition2.to_range(&spanning_set).unwrap()); diff --git a/src/fast_automaton/convert/to_regex/builder/scc.rs b/src/fast_automaton/convert/to_regex/builder/scc.rs deleted file mode 100644 index 815188a..0000000 --- 
a/src/fast_automaton/convert/to_regex/builder/scc.rs +++ /dev/null @@ -1,207 +0,0 @@ -use super::*; - -impl StateEliminationAutomaton { - pub fn identify_and_apply_components(&mut self) -> Result<(), EngineError> { - let mut index = 0; - let mut stack = Vec::new(); - let mut indices = vec![-1; self.transitions.len()]; - let mut lowlink = vec![-1; self.transitions.len()]; - let mut on_stack = vec![false; self.transitions.len()]; - let mut scc = Vec::new(); - - for state in self.states_iter() { - if self.removed_states.contains(&state) { - continue; - } - if indices[state] == -1 { - self.strongconnect( - state, - &mut index, - &mut stack, - &mut indices, - &mut lowlink, - &mut on_stack, - &mut scc, - ); - } - } - - let scc = scc - .into_iter() - .filter(|states| { - let first_state = states.iter().next().unwrap(); - let self_loop = if let Some(transitions_in) = self.transitions_in.get(first_state) { - transitions_in.contains(first_state) - } else { - false - }; - states.len() != 1 || self_loop - }) - .collect::>(); - - for component in scc { - self.build_component(&component)?; - } - - self.cyclic = false; - - Ok(()) - } - - #[allow(clippy::too_many_arguments)] - fn strongconnect( - &self, - v: usize, - index: &mut usize, - stack: &mut Vec, - indices: &mut Vec, - lowlink: &mut Vec, - on_stack: &mut Vec, - scc: &mut Vec>, - ) { - indices[v] = *index as i32; - lowlink[v] = *index as i32; - *index += 1; - stack.push(v); - on_stack[v] = true; - - if let Some(neighbors) = self.transitions.get(v) { - for &w in neighbors.keys() { - if indices[w] == -1 { - self.strongconnect(w, index, stack, indices, lowlink, on_stack, scc); - lowlink[v] = lowlink[v].min(lowlink[w]); - } else if on_stack[w] { - lowlink[v] = lowlink[v].min(indices[w]); - } - } - } - - if lowlink[v] == indices[v] { - let mut component = Vec::new(); - while let Some(w) = stack.pop() { - on_stack[w] = false; - component.push(w); - if w == v { - break; - } - } - scc.push(component); - } - } - - fn 
build_component(&mut self, states: &[usize]) -> Result<(), EngineError> { - let state_set = states.iter().copied().collect::>(); - let mut start_states = IntMap::new(); - let mut accept_states = IntMap::new(); - - let mut state_elimination_automaton = StateEliminationAutomaton { - start_state: 0, // start_state is not set yet - accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(states.len()), - transitions_in: IntMap::with_capacity(states.len()), - removed_states: IntSet::new(), - cyclic: true, - }; - - let mut states_map = IntMap::with_capacity(states.len()); - for from_state in states { - if *from_state == self.accept_state { - self.accept_state = self.new_state(); - self.add_transition_to(*from_state, self.accept_state, GraphTransition::Epsilon); - } - if *from_state == self.start_state { - self.start_state = self.new_state(); - self.add_transition_to(self.start_state, *from_state, GraphTransition::Epsilon); - } - let from_state_new = *states_map - .entry(*from_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if !state_set.contains(to_state) { - accept_states - .entry(*to_state) - .or_insert_with(Vec::new) - .push((from_state_new, transition.clone())); - continue; - } - - let to_state_new = *states_map - .entry(*to_state) - .or_insert_with(|| state_elimination_automaton.new_state()); - - state_elimination_automaton.add_transition_to( - from_state_new, - to_state_new, - transition.clone(), - ); - } - - for (parent_state, transition) in self.in_transitions_vec(*from_state) { - if !state_set.contains(&parent_state) { - start_states - .entry(from_state_new) - .or_insert_with(Vec::new) - .push((parent_state, transition.clone())); - } - } - } - - for state in states { - self.remove_state(*state); - } - - for (start_state, parent_states) in &start_states { - for (parent_state, transition) in parent_states { - let 
new_parent_state = if !transition.is_empty_string() { - let new_parent_state = self.new_state(); - - self.add_transition_to(*parent_state, new_parent_state, transition.clone()); - new_parent_state - } else { - *parent_state - }; - for (target_state, accept_states_transition) in &accept_states { - let mut new_automaton = state_elimination_automaton.clone(); - - let target_state = if accept_states_transition.len() > 1 { - new_automaton.accept_state = new_automaton.new_state(); - for (accept_state, transition) in accept_states_transition { - new_automaton.add_transition_to( - *accept_state, - new_automaton.accept_state, - transition.clone(), - ); - } - *target_state - } else { - let (accept_state, transition) = - accept_states_transition.iter().next().unwrap(); - - new_automaton.accept_state = *accept_state; - if !transition.is_empty_string() { - let new_target_state = self.new_state(); - self.add_transition_to( - new_target_state, - *target_state, - transition.clone(), - ); - new_target_state - } else { - *target_state - } - }; - - new_automaton.start_state = *start_state; - - self.add_transition_to( - new_parent_state, - target_state, - GraphTransition::Graph(new_automaton), - ); - } - } - } - - Ok(()) - } -} diff --git a/src/fast_automaton/convert/to_regex/mod.rs b/src/fast_automaton/convert/to_regex/mod.rs index d9a1dd0..27723a9 100644 --- a/src/fast_automaton/convert/to_regex/mod.rs +++ b/src/fast_automaton/convert/to_regex/mod.rs @@ -1,297 +1,31 @@ -use std::{ - collections::{hash_map::Entry, VecDeque}, - fmt::Display, -}; +use super::*; -use ahash::{HashMapExt, HashSetExt}; -use log::warn; -use nohash_hasher::IntMap; - -use crate::{error::EngineError, execution_profile::ThreadLocalParams, regex::RegularExpression}; - -use super::{FastAutomaton, IntSet, Range, State}; - -mod builder; -mod transform; - -#[derive(Clone, Debug)] -enum GraphTransition { - Graph(StateEliminationAutomaton), - Weight(T), - Epsilon, -} - -impl GraphTransition { - pub fn 
is_empty_string(&self) -> bool { - matches!(self, GraphTransition::Epsilon) - } - - pub fn get_weight(&self) -> Option<&T> { - if let GraphTransition::Weight(weight) = self { - Some(weight) - } else { - None - } - } -} - -#[derive(Clone, Debug)] -struct StateEliminationAutomaton { - start_state: usize, - accept_state: usize, - transitions: Vec>>, - transitions_in: IntMap>, - removed_states: IntSet, - cyclic: bool, -} - -impl Display for StateEliminationAutomaton { - fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_graph_dot(sb, None) - } -} - -impl StateEliminationAutomaton { - //#[cfg(test)] - #[allow(dead_code)] - #[inline] - pub fn to_dot(&self) { - println!("{}", self); - } - - #[inline] - fn to_graph_dot( - &self, - sb: &mut std::fmt::Formatter<'_>, - prefix: Option<&str>, - ) -> std::fmt::Result { - let is_subgraph; - let indent; - let prefix = if let Some(prefix) = prefix { - writeln!(sb, "\tsubgraph cluster_{} {{", prefix)?; - writeln!(sb, "\t\tlabel = \"{} - cyclic={}\";", prefix, self.cyclic)?; - indent = "\t"; - is_subgraph = true; - prefix - } else { - writeln!(sb, "digraph Automaton {{")?; - writeln!(sb, "\trankdir = LR;")?; - writeln!(sb, "\tlabel = \"cyclic={}\";", self.cyclic)?; - indent = ""; - is_subgraph = false; - "" - }; - - for from_state in self.states_iter() { - let from_state_with_prefix = if is_subgraph { - format!("S{prefix}_{from_state}") - } else { - format!("S{from_state}") - }; - - write!(sb, "{indent}\t{}", from_state_with_prefix)?; - if !is_subgraph && self.accept_state == from_state { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; - } else { - writeln!(sb, "{indent}\t[shape=circle,label=\"{}\"];", from_state)?; - } - - if !is_subgraph && self.start_state == from_state { - writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state_with_prefix)?; - } - for (to_state, weight) in self.transitions_from_state_enumerate_iter(&from_state) { - 
let to_state_with_prefix = if is_subgraph { - format!("S{prefix}_{to_state}") - } else { - format!("S{to_state}") - }; - - match weight { - GraphTransition::Graph(state_elimination_automaton) => { - let subgraph_prefix = if is_subgraph { - format!("{prefix}_{from_state}_{to_state}") - } else { - format!("{from_state}_{to_state}") - }; - state_elimination_automaton.to_graph_dot(sb, Some(&subgraph_prefix))?; - writeln!(sb)?; - let subgraph_start_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.start_state - ); - writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, subgraph_start_state - )?; - - let subgraph_accept_state = format!( - "S{}_{}", - subgraph_prefix, state_elimination_automaton.accept_state - ); - writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - subgraph_accept_state, to_state_with_prefix - ) - } - GraphTransition::Weight(range) => { - writeln!( - sb, - "{indent}\t{} -> {} [label=\"{}\"]", - from_state_with_prefix, - to_state_with_prefix, - RegularExpression::Character(range.clone()) - .to_string() - .replace('\\', "\\\\") - .replace('"', "\\\"") - ) - } - GraphTransition::Epsilon => writeln!( - sb, - "{indent}\t{} -> {} [label=\"ε\"]", - from_state_with_prefix, to_state_with_prefix - ), - }?; - } - } - write!(sb, "{indent}}}") - } - - #[inline] - pub fn states_iter(&self) -> impl Iterator + '_ { - (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator)> { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) - } - - #[inline] - pub fn transitions_from_state_vec(&self, from_state: &State) -> Vec { - self.transitions[*from_state] - .keys() - .filter(|s| !self.removed_states.contains(s)) - .copied() - .collect() - } - - pub fn in_transitions_vec(&self, to_state: State) -> Vec<(State, GraphTransition)> { - let mut in_transitions 
= vec![]; - for from_state in self.transitions_in.get(&to_state).unwrap_or(&IntSet::new()) { - for (state, transition) in self.transitions_from_state_enumerate_iter(from_state) { - if to_state == *state { - in_transitions.push((*from_state, transition.clone())); - } - } - } - in_transitions - } - - pub fn states_topo_vec(&self) -> Vec { - if self.cyclic { - panic!("The graph has a cycle"); - } - - let mut in_degree: IntMap = self - .transitions_in - .iter() - .map(|(state, parents)| (*state, parents.len())) - .collect(); - - let mut worklist: VecDeque = VecDeque::new(); - for (&state, °ree) in &in_degree { - if degree == 0 { - worklist.push_back(state); - } - } - - let mut sorted_order = Vec::with_capacity(self.get_number_of_states()); - while let Some(state) = worklist.pop_front() { - sorted_order.push(state); - - if let Some(neighbors) = self.transitions.get(state) { - let neighbors = neighbors.keys(); - for &neighbor in neighbors { - if let Some(degree) = in_degree.get_mut(&neighbor) { - *degree -= 1; - if *degree == 0 { - worklist.push_back(neighbor); - } - } - } - } - } - - if sorted_order.len() == self.get_number_of_states() { - sorted_order - } else { - panic!("The graph has a cycle"); - } - } - - #[inline] - pub fn get_number_of_states(&self) -> usize { - self.transitions.len() - self.removed_states.len() - } -} +mod state_elimination; impl FastAutomaton { - /// Try to convert the current FastAutomaton to a RegularExpression. - /// If it cannot find an equivalent regex it returns None. - /// This method is still a work in progress. 
- pub fn to_regex(&self) -> Option { - if self.is_empty() { - return Some(RegularExpression::new_empty()); - } - let execution_profile = ThreadLocalParams::get_execution_profile(); - if let Ok(graph) = StateEliminationAutomaton::new(self) { - if let Ok(regex) = graph?.convert_to_regex(&execution_profile) { - let regex = regex?; - match regex.to_automaton() { - Ok(automaton) => match self.is_equivalent_of(&automaton) { - Ok(result) => { - if !result { - warn!("The automaton is not equivalent to the generated regex; automaton={}, regex={}", self, regex); - None - } else { - Some(regex) - } - } - Err(err) => { - warn!("Engine error while checking for equivalence ({}); automaton={}, regex={}", err, self, regex); - None - } - }, - Err(err) => { - if let crate::error::EngineError::RegexSyntaxError(err) = err { - warn!("The generated regex cannot be converted to automaton to be checked for equivalence ({}); automaton={}, regex={}", err, self, regex); - } - None - } - } - } else { - None - } - } else { - None - } + /// Converts the automaton to a [`RegularExpression`]. 
+ #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states()))] + pub fn to_regex(&self) -> RegularExpression { + state_elimination::convert_to_regex(self) } } #[cfg(test)] mod tests { + use ::regex::Regex; + use super::*; #[test] fn test_convert() -> Result<(), String> { + assert_convert(".*u(ab|de)"); + assert_convert(".*sf.*uif(ab|de)"); + + assert_convert("(a+|,)*"); + assert_convert("((ab)*,(cd)*)*"); + assert_convert("(a*,a*,a*)*"); + assert_convert("(a*,a*)*"); + assert_convert("(ac|ads|a)*"); assert_convert(".*sf"); assert_convert(".*sf.*uif(ab|de)"); @@ -325,47 +59,43 @@ mod tests { } fn assert_convert(regex: &str) { - let input_regex = RegularExpression::new(regex).unwrap(); + let input_regex = RegularExpression::parse(regex, false).unwrap(); println!("IN : {}", input_regex); let input_automaton = input_regex.to_automaton().unwrap(); - //input_automaton.to_dot(); - - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (non deterministic): {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); let input_automaton = input_automaton.determinize().unwrap(); - //input_automaton.to_dot(); - let output_regex = input_automaton.to_regex().unwrap(); + let output_regex = input_automaton.to_regex(); println!("OUT (deterministic) : {}", output_regex); let output_automaton = output_regex.to_automaton().unwrap(); - assert!(input_automaton.is_equivalent_of(&output_automaton).unwrap()); + assert!(input_automaton.equivalent(&output_automaton).unwrap()); } #[test] fn test_convert_after_operation_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(ab|cd)") + let automaton1 = RegularExpression::parse("(ab|cd)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = 
RegularExpression::new("ab") + let automaton2 = RegularExpression::parse("ab", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); + let result = automaton1.difference(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("cd", output_regex.to_string()); Ok(()) @@ -373,20 +103,20 @@ mod tests { #[test] fn test_convert_after_operation_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - result.to_dot(); + result.print_dot(); - let output_regex = result.to_regex().unwrap(); + let output_regex = result.to_regex(); assert_eq!("", output_regex.to_string()); Ok(()) @@ -394,72 +124,97 @@ mod tests { #[test] fn test_convert_after_operation_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() - .unwrap() - .determinize() .unwrap(); + let automaton2 = automaton2.determinize().unwrap(); - let result = automaton1.subtraction(&automaton2).unwrap(); - result.to_dot(); + let result = automaton1.difference(&automaton2).unwrap(); + result.print_dot(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); - assert_eq!("(x{3})*x{1,2}", result.to_string()); + assert_eq!("x(x{3})*x?", result.to_string()); Ok(()) } #[test] fn 
test_convert_after_operation_4() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*abc.*") + let automaton1 = RegularExpression::parse(".*abc.*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + let automaton2 = RegularExpression::parse(".*def.*", false) .unwrap() .to_automaton() .unwrap(); let result = automaton1.intersection(&automaton2).unwrap(); - let result = result.to_regex().unwrap(); + let result = result.to_regex(); assert_eq!(".*(abc.*def|def.*abc).*", result.to_string()); Ok(()) } - /*#[test] + #[test] fn test_convert_after_operation_5() -> Result<(), String> { - if std::env::var_os("RUST_LOG").is_none() { - std::env::set_var("RUST_LOG", "regexsolver=debug"); - } - env_logger::init(); - - let automaton1 = RegularExpression::new(".*abc.*") + let automaton = RegularExpression::parse(".*abc.*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new(".*def.*") + let mut automaton = automaton.determinize().unwrap().into_owned(); + + automaton.complement().unwrap(); + + let result = format!("^{}$", automaton.to_regex()); + + println!("{result}"); + + let result = Regex::new(&result).unwrap(); + + assert!(!result.is_match("abc")); + assert!(!result.is_match("2374abc012")); + + assert!(result.is_match("bc")); + assert!(result.is_match("237a4bc012")); + + Ok(()) + } + + #[test] + fn test_automaton() -> Result<(), String> { + let automaton = RegularExpression::parse("a*ba*", false) .unwrap() .to_automaton() + .unwrap(); + automaton.print_dot(); + + let automaton1 = RegularExpression::parse("(a*ba*)*", false) .unwrap() - .determinize() + .to_automaton() .unwrap(); + automaton1.print_dot(); - let result = automaton1.subtraction(&automaton2).unwrap(); - result.to_dot(); + automaton1.determinize().unwrap().print_dot(); - let result = result.to_regex().unwrap(); + // (a*b[ab]*)? 
+ // a*b+a+b+ - assert_eq!("(x{3})*x{1,2}", result.to_string()); + let automaton2 = RegularExpression::parse("(a*b[ab]*)?", false) + .unwrap() + .to_automaton() + .unwrap(); + + assert!(automaton1.equivalent(&automaton2).unwrap()); Ok(()) - }*/ + } } diff --git a/src/fast_automaton/convert/to_regex/builder/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs similarity index 52% rename from src/fast_automaton/convert/to_regex/builder/mod.rs rename to src/fast_automaton/convert/to_regex/state_elimination/builder.rs index b6c8dd5..241faa3 100644 --- a/src/fast_automaton/convert/to_regex/builder/mod.rs +++ b/src/fast_automaton/convert/to_regex/state_elimination/builder.rs @@ -1,70 +1,85 @@ -use super::*; - -mod scc; +use ahash::HashMapExt; -impl StateEliminationAutomaton { - pub fn new(automaton: &FastAutomaton) -> Result, EngineError> { - if automaton.is_empty() { - return Ok(None); - } +use super::*; - let mut state_elimination_automaton = StateEliminationAutomaton { +impl Gnfa { + pub(super) fn from_automaton(automaton: &FastAutomaton) -> Gnfa { + let mut state_elimination_automaton = Gnfa { start_state: 0, // start_state is not set yet accept_state: 0, // accept_state is not set yet - transitions: Vec::with_capacity(automaton.get_number_of_states()), - transitions_in: IntMap::with_capacity(automaton.get_number_of_states()), - removed_states: IntSet::new(), - cyclic: false, + transitions: Vec::with_capacity(automaton.number_of_states()), + transitions_in: IntMap::with_capacity(automaton.number_of_states()), + removed_states: IntSet::with_capacity(automaton.number_of_states()), + empty: false, }; - let mut states_map = IntMap::with_capacity(automaton.get_number_of_states()); + if automaton.is_empty() { + state_elimination_automaton.empty = true; + return state_elimination_automaton; + } + + let mut states_map = IntMap::with_capacity(automaton.number_of_states()); - for from_state in automaton.transitions_iter() { + for from_state in 
automaton.states() { let new_from_state = *states_map .entry(from_state) .or_insert_with(|| state_elimination_automaton.new_state()); - for (to_state, condition) in - automaton.transitions_from_state_enumerate_into_iter(&from_state) - { + for (condition, to_state) in automaton.transitions_from(from_state) { let new_to_state = *states_map - .entry(to_state) + .entry(*to_state) .or_insert_with(|| state_elimination_automaton.new_state()); - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( new_from_state, new_to_state, - GraphTransition::Weight(condition.to_range(automaton.get_spanning_set())?), + RegularExpression::Character( + condition.to_range(automaton.spanning_set()).unwrap(), + ), ); } } - state_elimination_automaton.start_state = - *states_map.get(&automaton.get_start_state()).unwrap(); // We finally set start_state + if automaton.in_degree(automaton.start_state()) == 0 { + // If the start state does not have any incoming state we just set it + state_elimination_automaton.start_state = + *states_map.get(&automaton.start_state()).unwrap(); + } else { + // If not we create a new state that will be the new start state + state_elimination_automaton.start_state = state_elimination_automaton.new_state(); + + let previous_start_state = *states_map.get(&automaton.start_state()).unwrap(); + // We add an empty string transition to the new start state + state_elimination_automaton.add_transition( + state_elimination_automaton.start_state, + previous_start_state, + RegularExpression::new_empty_string(), + ); + } - if automaton.get_accept_states().len() == 1 { - // If there is only one accept state with just set it + let accept_state = *automaton.accept_states().iter().next().unwrap(); + if automaton.accept_states().len() == 1 && automaton.out_degree(accept_state) == 0 { + // If there is only one accept state we just set it state_elimination_automaton.accept_state = *states_map - 
.get(automaton.get_accept_states().iter().next().unwrap()) + .get(automaton.accept_states().iter().next().unwrap()) .unwrap(); } else { // If not we create a new state that will be the new accept state state_elimination_automaton.accept_state = state_elimination_automaton.new_state(); - for accept_state in automaton.get_accept_states() { + for accept_state in automaton.accept_states() { let accept_state = *states_map.get(accept_state).unwrap(); // We add an empty string transition to the new accept state - state_elimination_automaton.add_transition_to( + state_elimination_automaton.add_transition( accept_state, state_elimination_automaton.accept_state, - GraphTransition::Epsilon, + RegularExpression::new_empty_string(), ); } } - state_elimination_automaton.identify_and_apply_components()?; - //state_elimination_automaton.to_dot(); - Ok(Some(state_elimination_automaton)) + + state_elimination_automaton } - pub fn new_state(&mut self) -> usize { + fn new_state(&mut self) -> usize { if let Some(new_state) = self.removed_states.clone().iter().next() { self.removed_states.remove(new_state); self.transitions_in.insert(*new_state, IntSet::new()); @@ -78,22 +93,22 @@ impl StateEliminationAutomaton { } #[inline] - pub fn has_state(&self, state: State) -> bool { + pub(super) fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } - pub fn add_transition_to( + pub(crate) fn add_transition( &mut self, from_state: State, to_state: State, - transition: GraphTransition, + transition: RegularExpression, ) { self.assert_state_exists(from_state); if from_state != to_state { @@ -106,13 +121,8 @@ impl StateEliminationAutomaton { .insert(from_state); match self.transitions[from_state].entry(to_state) { Entry::Occupied(mut o) => { - if let 
(GraphTransition::Weight(current_regex), GraphTransition::Weight(regex)) = - (o.get(), transition) - { - o.insert(GraphTransition::Weight(current_regex.union(®ex))); - } else { - panic!("Cannot add transition"); - } + let merged = transition.union(o.get()); + *o.get_mut() = merged; } Entry::Vacant(v) => { v.insert(transition); @@ -120,12 +130,11 @@ impl StateEliminationAutomaton { }; } - pub fn remove_state(&mut self, state: State) { + pub(super) fn remove_state(&mut self, state: State) { self.assert_state_exists(state); if self.start_state == state || self.accept_state == state { panic!( - "Can not remove the state {}, it is still used as start state or accept state.", - state + "Can not remove the state {state}, it is still used as start state or accept state." ); } self.transitions_in.remove(&state); @@ -150,21 +159,4 @@ impl StateEliminationAutomaton { transitions.remove(&state); } } - - pub fn remove_transition(&mut self, from_state: State, to_state: State) { - self.assert_state_exists(from_state); - if from_state != to_state { - self.assert_state_exists(to_state); - } - - if let Some(from_states) = self.transitions_in.get_mut(&to_state) { - from_states.remove(&from_state); - } - - self.transitions[from_state].remove(&to_state); - } - - pub fn get_transition(&self, from_state: State, to_state: State) -> Option<&GraphTransition> { - self.transitions.get(from_state)?.get(&to_state) - } } diff --git a/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs new file mode 100644 index 0000000..8d5f587 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/eliminate.rs @@ -0,0 +1,118 @@ +use super::*; + +impl Gnfa { + pub(super) fn convert(&mut self) -> RegularExpression { + if self.empty { + return RegularExpression::new_empty(); + } + + while let Some(state) = self.get_next_state_to_eliminate() { + self.eliminate_state(state); + } + + 
self.get_transition(self.start_state, self.accept_state) + .cloned() + .unwrap_or(RegularExpression::new_empty_string()) + } + + fn get_next_state_to_eliminate(&self) -> Option { + let states: Vec = self + .all_states_iter() + .filter(|&s| s != self.start_state && s != self.accept_state) + .collect(); + + let score_state = |state: usize| -> Option<(u128, usize)> { + let preds = self.transitions_to_vec(state); + let succs = self.transitions_from_vec(state); + + let in_deg = preds.len() as u128; + let out_deg = succs.len() as u128; + + if in_deg == 0 || out_deg == 0 { + let score = (state as u128) & 0xFF; + return Some((score, state)); + } + + let mut score: u128 = in_deg * out_deg; + + if self.has_self_loop(state) { + score = score + (score >> 1); + } + + let mut label_cost: u128 = 0; + + for (_, regex) in &preds { + label_cost += regex.evaluate_complexity() as u128; + } + for (regex, _) in &succs { + label_cost += regex.evaluate_complexity() as u128; + } + if let Some(re) = self.get_transition(state, state) { + label_cost += (re.evaluate_complexity() as u128) * 2; + } + + score = score.saturating_add(label_cost); + + let tie = (state as u128) & 0xFFFF; + Some((score.saturating_add(tie), state)) + }; + + #[cfg(feature = "parallel")] + let best = states + .into_par_iter() + .filter_map(score_state) + .reduce_with(|a, b| if a.0 < b.0 { a } else { b }); + #[cfg(not(feature = "parallel"))] + let best = states + .into_iter() + .filter_map(score_state) + .reduce(|a, b| if a.0 < b.0 { a } else { b }); + + best.map(|(_, state)| state) + } + + fn eliminate_state(&mut self, k: usize) { + if self.removed_states.contains(&k) { + return; + } + + let in_states = self + .transitions_in + .get(&k) + .unwrap() + .iter() + .cloned() + .filter(|&s| s != k) + .collect::>(); + let out_states = self.transitions[k] + .keys() + .cloned() + .filter(|&s| s != k) + .collect::>(); + + for p in in_states { + for &q in &out_states { + self.bridge(p, k, q); + } + } + + self.remove_state(k); + } + 
+ fn bridge(&mut self, p: usize, k: usize, q: usize) { + let rpk = self.get_transition(p, k); + let rkk = self.get_transition(k, k); + let rkq = self.get_transition(k, q); + + if let (Some(rpk), Some(rkq)) = (rpk, rkq) { + let mut regex = rpk.clone(); + if let Some(rkk) = rkk { + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, RegularExpression::Repetition(Box::new(rkk.clone()), 0, None)])); + regex = regex.concat(&rkk.repeat(0, None), true); + } + //regex = RegularExpression::Concat(VecDeque::from_iter(vec![regex, rkq.clone()])); + regex = regex.concat(rkq, true); + self.add_transition(p, q, regex); + } + } +} diff --git a/src/fast_automaton/convert/to_regex/state_elimination/mod.rs b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs new file mode 100644 index 0000000..023d6b1 --- /dev/null +++ b/src/fast_automaton/convert/to_regex/state_elimination/mod.rs @@ -0,0 +1,121 @@ +use super::*; + +mod builder; +mod eliminate; + +struct Gnfa { + start_state: usize, + accept_state: usize, + transitions: Vec>, + transitions_in: IntMap>, + removed_states: IntSet, + empty: bool, +} + +impl Display for Gnfa { + fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(sb, "digraph GNFA {{")?; + writeln!(sb, "\trankdir = LR;")?; + for from_state in self.all_states_iter() { + write!(sb, "\t{from_state}")?; + if self.accept_state == from_state { + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; + } else { + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; + } + + if self.start_state == from_state { + writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; + writeln!(sb, "\tinitial -> {from_state}")?; + } + for (regex, to_state) in self.transitions_from_vec(from_state) { + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{regex}\"]")?; + } + } + write!(sb, "}}") + } +} + +impl Gnfa { + fn get_transition(&self, from_state: State, to_state: State) -> Option<&RegularExpression> { + 
self.transitions.get(from_state)?.get(&to_state) + } + + #[inline] + fn all_states_iter(&self) -> impl Iterator + '_ { + (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) + } + + fn transitions_to_vec(&self, state: State) -> Vec<(State, RegularExpression)> { + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + for (condition, to_state) in self.transitions_from_vec(*from_state) { + if to_state == state { + in_transitions.push((*from_state, condition)); + break; + } + } + } + in_transitions + } + + #[inline] + fn transitions_from_vec(&self, state: State) -> Vec<(RegularExpression, State)> { + self.transitions[state] + .iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + } + + #[inline] + fn has_self_loop(&self, state: State) -> bool { + self.get_transition(state, state).is_some() + } +} + +pub(super) fn convert_to_regex(automaton: &FastAutomaton) -> RegularExpression { + let mut gnfa = Gnfa::from_automaton(automaton); + gnfa.convert() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_elimination() -> Result<(), String> { + test_correct("abc"); + test_correct(".*de"); + test_correct(".*def"); + test_correct("(a*ba*)*"); + test_correct(".*u(ab|d)"); + test_correct(".*u(ab|de)"); + Ok(()) + } + + fn test_correct(pattern: &str) { + println!("Pattern: {pattern}"); + + let automaton = RegularExpression::new(pattern) + .unwrap() + .to_automaton() + .unwrap(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + assert!(automaton.equivalent(&new_automaton).unwrap()); + + let automaton = automaton.determinize().unwrap().into_owned(); + + let regex = Gnfa::from_automaton(&automaton).convert(); + println!("-> {regex}"); + + let new_automaton = regex.to_automaton().unwrap(); + + 
assert!(automaton.equivalent(&new_automaton).unwrap()); + } +} diff --git a/src/fast_automaton/convert/to_regex/transform.rs b/src/fast_automaton/convert/to_regex/transform.rs deleted file mode 100644 index aaeca76..0000000 --- a/src/fast_automaton/convert/to_regex/transform.rs +++ /dev/null @@ -1,208 +0,0 @@ -use std::hash::BuildHasherDefault; - -use crate::execution_profile::ExecutionProfile; - -use super::*; - -impl StateEliminationAutomaton { - pub fn convert_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.cyclic { - return self.convert_graph_to_regex(execution_profile); - } - execution_profile.assert_not_timed_out()?; - - let mut regex_map: IntMap = IntMap::with_capacity_and_hasher( - self.get_number_of_states(), - BuildHasherDefault::default(), - ); - regex_map.insert(self.start_state, RegularExpression::new_empty_string()); - for from_state in self.states_topo_vec() { - let current_regex = if let Some(current_regex) = regex_map.get(&from_state) { - current_regex.clone() - } else { - RegularExpression::new_empty_string() - }; - if let Some(transitions) = self.transitions.get(from_state) { - for (to_state, transition) in transitions { - let transition_regex = match transition { - GraphTransition::Graph(graph) => { - if let Some(regex) = graph.convert_graph_to_regex(execution_profile)? 
{ - regex - } else { - return Ok(None); - } - } - GraphTransition::Weight(range) => { - RegularExpression::Character(range.clone()) - } - GraphTransition::Epsilon => RegularExpression::new_empty_string(), - }; - let new_regex = current_regex.concat(&transition_regex, true); - match regex_map.entry(*to_state) { - Entry::Occupied(mut o) => { - o.insert(new_regex.union(o.get()).simplify()); - } - Entry::Vacant(v) => { - v.insert(new_regex); - } - }; - } - } - } - - Ok(regex_map.get(&self.accept_state).cloned()) - } - - fn convert_graph_to_regex( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - execution_profile.assert_not_timed_out()?; - if let Some(regex) = self.convert_shape_dot_star(execution_profile)? { - return Ok(Some(regex)); - } else if let Some(regex) = self.convert_shape_self_loop(execution_profile)? { - return Ok(Some(regex)); - } - Ok(None) - } - - /// We try to idenfify the regex following the shape: - /// A*B - fn convert_shape_dot_star( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - if self.get_number_of_states() < 2 { - return Ok(None); - } - //self.to_dot(); - let mut dot_value = - if let Some(dot_value) = self.get_transition(self.start_state, self.start_state) { - if let Some(dot_value) = dot_value.get_weight() { - dot_value.clone() - } else { - return Ok(None); - } - } else { - return Ok(None); - }; - - for state in self.states_iter() { - if state == self.start_state { - continue; - } - let weight = if let Some(weight) = self.get_transition(state, self.start_state) { - if let Some(weight) = weight.get_weight() { - weight - } else { - return Ok(None); - } - } else if state == self.accept_state { - continue; - } else { - return Ok(None); - }; - - if !dot_value.contains_all(weight) { - return Ok(None); - } - } - - let mut graph = self.clone(); - - for (from_state, transition) in graph.in_transitions_vec(graph.start_state) { - let weight = if let Some(weight) = transition.get_weight() { - 
weight - } else { - return Ok(None); - }; - dot_value = dot_value.union(weight); - graph.remove_transition(from_state, graph.start_state); - } - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::with_capacity(graph.get_number_of_states()); - - worklist.push_back(graph.start_state); - seen.insert(self.start_state); - - while let Some(from_state) = worklist.pop_front() { - for to_state in graph.transitions_from_state_vec(&from_state) { - let transition = - if let Some(transition) = graph.get_transition(from_state, to_state) { - transition - } else { - return Ok(None); - }; - let weight = if let Some(weight) = transition.get_weight() { - weight - } else { - continue; - }; - dot_value = dot_value.union(weight); - if seen.contains(&to_state) { - if graph.accept_state != to_state || to_state == from_state { - graph.remove_transition(from_state, to_state); - } - } else { - seen.insert(to_state); - worklist.push_back(to_state); - } - } - } - - graph.add_transition_to( - self.start_state, - self.start_state, - GraphTransition::Weight(dot_value), - ); - - graph.identify_and_apply_components()?; - graph.convert_to_regex(execution_profile) - } - - /// We try to identify the regex following the shape: - /// A*B - fn convert_shape_self_loop( - &self, - execution_profile: &ExecutionProfile, - ) -> Result, EngineError> { - let mut graph = self.clone(); - - graph.accept_state = graph.new_state(); - - for (from_state, transition) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - - graph.add_transition_to(from_state, graph.accept_state, transition); - } - - graph.identify_and_apply_components()?; - - let a_part = if let Some(a_part) = graph.convert_to_regex(execution_profile)? 
{ - a_part - } else { - return Ok(None); - }; - - let mut graph = self.clone(); - - for (from_state, _) in graph.in_transitions_vec(self.start_state) { - graph.remove_transition(from_state, self.start_state); - } - - graph.identify_and_apply_components()?; - let b_part = if let Some(b_part) = graph.convert_to_regex(execution_profile)? { - b_part - } else { - return Ok(None); - }; - - let regex = a_part.repeat(0, None).concat(&b_part, true); - - Ok(Some(regex)) - } -} diff --git a/src/fast_automaton/generate.rs b/src/fast_automaton/generate.rs index 638ba11..581cffc 100644 --- a/src/fast_automaton/generate.rs +++ b/src/fast_automaton/generate.rs @@ -1,139 +1,474 @@ -use std::cmp; - -use crate::{execution_profile::ThreadLocalParams, EngineError}; -use ahash::AHashSet; +use crate::{EngineError, execution_profile::ExecutionProfile}; +use ahash::{AHashSet, RandomState}; +use indexmap::IndexSet; use super::*; +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +#[derive(Clone, Eq, PartialEq)] +struct QueueItem { + score: usize, + depth: usize, + state: usize, + ranges: Vec, + hash: u64, +} + +impl Ord for QueueItem { + fn cmp(&self, other: &Self) -> Ordering { + other + .score + .cmp(&self.score) + .then_with(|| self.depth.cmp(&other.depth)) + .then_with(|| self.state.cmp(&other.state)) + .then_with(|| self.hash.cmp(&other.hash)) + } +} + +impl PartialOrd for QueueItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} impl FastAutomaton { - pub fn generate_strings(&self, number: usize) -> Result, EngineError> { - if self.is_empty() { - return Ok(AHashSet::new()); + /// Generates up to `limit` distinct strings matched by the automaton, skipping the first `offset` strings. 
+ /// + /// Strings are only guaranteed to be distinct **within a single call**: + /// the offset fast-skips by counting paths, and in a non-deterministic + /// automaton the same string can be reached through several paths, so + /// calls with different offsets may repeat strings (or skip some). + /// [`determinize`](Self::determinize) (and ideally + /// [`minimize`](Self::minimize)) first to make pages disjoint. + #[tracing::instrument(level = "debug", skip(self), fields(states = self.number_of_states(), deterministic=self.is_deterministic(), limit=limit, offset=offset))] + pub fn generate_strings( + &self, + limit: usize, + mut offset: usize, + ) -> Result, EngineError> { + if self.is_empty() || limit == 0 { + return Ok(vec![]); } - let mut strings = AHashSet::with_capacity(cmp::min(number, 1000)); + let (_, max) = self.length(); + let max_len = max.unwrap_or(u32::MAX) as usize; - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); + let num_states = self.transitions.len(); - let mut ranges_cache: AHashMap<&Condition, Range> = - AHashMap::with_capacity(self.get_number_of_states()); + // ----------------------------------------------------------------- + // 1. 
REVERSE BFS: Precalculate exact distances to Accept State + // ----------------------------------------------------------------- + let mut incoming = vec![vec![]; num_states]; + let mut dist_q = std::collections::VecDeque::new(); + let mut dist = vec![usize::MAX; num_states]; - let mut worklist: VecDeque<(Vec, usize)> = - VecDeque::with_capacity(cmp::min(number, 1000)); - let mut visited = AHashSet::with_capacity(cmp::min(number, 1000)); + for state in self.states() { + if self.is_accepted(state as _) { + dist[state] = 0; + dist_q.push_back(state); + } + for (_cond, &to_state) in self.transitions_from(state as _) { + incoming[to_state].push(state); + } + } - worklist.push_back((vec![], self.start_state)); - while let Some((ranges, state)) = worklist.pop_front() { - if self.accept_states.contains(&state) { - if ranges.is_empty() { - strings.insert(String::new()); - } else { - let mut end = false; - let mut ranges_iter: Vec<_> = ranges.iter().map(|range| range.iter()).collect(); - while strings.len() < number { - execution_profile.assert_not_timed_out()?; - let mut string = vec![]; - for i in 0..ranges.len() { - if let Some(character) = ranges_iter[i].next() { - string.push(character); - } else { - ranges_iter[i] = ranges[i].iter(); - if i + 1 < ranges.len() { - string.push(ranges_iter[i].next().unwrap()); - } else { - end = true; - break; - } - } - } - if end { - break; - } - strings.insert(string.into_iter().map(|c| c.to_char()).collect()); + while let Some(state) = dist_q.pop_front() { + let d = dist[state]; + for &prev in &incoming[state] { + if dist[prev] == usize::MAX { + dist[prev] = d + 1; + dist_q.push_back(prev); + } + } + } + + // ----------------------------------------------------------------- + // 2. 
A* SEARCH: Find matching strings instantly + // ----------------------------------------------------------------- + let mut ranges_cache = AHashMap::with_capacity(num_states); + let mut strings = IndexSet::with_capacity_and_hasher(limit, RandomState::default()); + let mut visited = AHashSet::with_capacity(num_states); + + let mut q = BinaryHeap::new(); + let start_state = self.start_state(); + + // If the start state can't reach an accept state, exit immediately + if dist[start_state] != usize::MAX { + q.push(QueueItem { + score: dist[start_state], + depth: 0, + state: start_state, + ranges: vec![], + hash: 0u64, + }); + } + + while let Some(QueueItem { + score: _, + depth: current_depth, + state, + mut ranges, + hash: h, + }) = q.pop() + { + execution_profile.assert_not_timed_out()?; + + if self.is_accepted(state) { + if current_depth == 0 { + if offset > 0 { + offset -= 1; + } else { + strings.insert(String::new()); } + } else { + Self::ranges_to_strings( + &mut strings, + &ranges, + limit, + &mut offset, + &execution_profile, + )?; } - if strings.len() == number { + if strings.len() >= limit { break; } } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&state) { - execution_profile.assert_not_timed_out()?; - let range = match ranges_cache.entry(cond) { - Entry::Occupied(o) => o.get().clone(), - Entry::Vacant(v) => { - let range = cond.to_range(&self.spanning_set)?; - v.insert(range.clone()); - range - } - }; - if range.is_empty() { + + if current_depth >= max_len { + continue; + } + + let next_depth = current_depth + 1; + let mut valid_transitions = Vec::new(); + + for (cond, &to_state) in self.transitions_from(state) { + let to_state_usize = to_state; + + // DEAD-END PRUNING: Instantly kill paths that cannot accept + if dist[to_state_usize] == usize::MAX { continue; } - let mut new_ranges = ranges.clone(); - new_ranges.push(range); - let element = (new_ranges, *to_state); - if !visited.contains(&element) { - visited.insert(element.clone()); 
- worklist.push_back(element); + let hash = + Self::path_mix(h, Self::mix64(state as u64 ^ Self::mix64(to_state as u64))); + + if visited.insert((to_state, next_depth, hash)) { + let range = ranges_cache + .entry(cond) + .or_insert_with(|| cond.to_range(&self.spanning_set).unwrap()) + .clone(); + + valid_transitions.push((to_state_usize, range, hash)); } } + + // Vector Reuse Optimization + if let Some((last_state, last_range, last_hash)) = valid_transitions.pop() { + for (to_state, range, hash) in valid_transitions { + let mut new_ranges = ranges.clone(); + new_ranges.push(range); + q.push(QueueItem { + score: next_depth + dist[to_state], // A* Score Formula + depth: next_depth, + state: to_state, + ranges: new_ranges, + hash, + }); + } + + ranges.push(last_range); + q.push(QueueItem { + score: next_depth + dist[last_state], // A* Score Formula + depth: next_depth, + state: last_state, + ranges, + hash: last_hash, + }); + } + } + + Ok(strings.into_iter().collect()) + } + + fn ranges_to_strings( + strings: &mut IndexSet, + ranges: &Vec, + count: usize, + offset: &mut usize, + execution_profile: &ExecutionProfile, + ) -> Result<(), EngineError> { + if strings.len() >= count { + return Ok(()); + } + + let range_lengths: Vec = ranges + .iter() + .map(|r| r.get_cardinality() as usize) + .collect(); + + let mut total_combinations = 1usize; + for &len in &range_lengths { + total_combinations = total_combinations.saturating_mul(len); + } + + if *offset >= total_combinations { + *offset -= total_combinations; + return Ok(()); + } + + let mut current_str = String::with_capacity(ranges.len()); + Self::generate_combinations( + ranges, + &range_lengths, + 0, + &mut current_str, + strings, + count, + offset, + execution_profile, + ) + } + + #[allow(clippy::too_many_arguments)] + fn generate_combinations( + ranges: &Vec, + range_lengths: &[usize], + depth: usize, + current_str: &mut String, + strings: &mut IndexSet, + count: usize, + offset: &mut usize, + execution_profile: 
&ExecutionProfile, + ) -> Result<(), EngineError> { + if strings.len() >= count { + return Ok(()); + } + + if depth == ranges.len() { + if *offset > 0 { + *offset -= 1; + } else { + strings.insert(current_str.clone()); + } + return Ok(()); + } + + // Calculate combinations for the remaining suffix of ranges + let mut sub_combinations = 1usize; + for &len in &range_lengths[depth + 1..] { + sub_combinations = sub_combinations.saturating_mul(len); } - Ok(strings) + for ch in ranges[depth].clone().iter() { + execution_profile.assert_not_timed_out()?; + + // If skipping this character's subtree fits within the remaining offset + if *offset >= sub_combinations { + *offset -= sub_combinations; + continue; + } + + current_str.push(ch.to_char()); + Self::generate_combinations( + ranges, + range_lengths, + depth + 1, + current_str, + strings, + count, + offset, + execution_profile, + )?; + current_str.pop(); + + if strings.len() >= count { + break; + } + } + + Ok(()) + } + + #[inline] + fn mix64(mut x: u64) -> u64 { + // splitmix64 + x = x.wrapping_add(0x9E3779B97F4A7C15); + let mut z = x; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); + z ^ (z >> 31) + } + + #[inline] + fn path_mix(h: u64, x: u64) -> u64 { + h.wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7) ^ x } } #[cfg(test)] mod tests { + use crate::{cardinality::Cardinality, regex::RegularExpression}; use regex::Regex; - use crate::regex::RegularExpression; + #[test] + fn test_generate_strings_1() -> Result<(), String> { + let automaton = + RegularExpression::parse(".*ab.*c(de|fg).*dab.*c(de|fg).*ab.*c(de|fg).*dab.*c", true) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap(); + automaton.generate_strings(30, 0).unwrap(); + + Ok(()) + } + + #[test] + fn test_generate_strings_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|de){2}", true) + .unwrap() + .to_automaton() + .unwrap(); + + 
let automaton = automaton.determinize().unwrap(); + let strings = automaton.generate_strings(2, 0).unwrap(); + assert_eq!(2, strings.len()); + + let strings = automaton.generate_strings(2, 2).unwrap(); + assert_eq!(2, strings.len()); + + Ok(()) + } #[test] - fn test_generate_strings() -> Result<(), String> { + fn test_generate_strings_3() -> Result<(), String> { + assert_generate_strings(r"<([A-Za-z][A-Za-z0-9]*)[^>]*?/>", 500); + assert_generate_strings("a{100}[a-z]", 100); + assert_generate_strings("(ab|cd)e", 100); + assert_generate_strings("[a-z]+", 100); + assert_generate_strings("[a-z]+@", 100); assert_generate_strings("ù", 1000); - assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); - assert_generate_strings( - "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", - 500 - ); assert_generate_strings("[0-9]+[A-Z]*", 500); assert_generate_strings("a+(ba+)*", 200); assert_generate_strings("((a|bc)*|d)", 200); assert_generate_strings(".*", 50); assert_generate_strings("(ac|ads|a)*", 200); assert_generate_strings("((aad|ads|a)*|q)", 200); + + assert_generate_strings( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + 1000, + ); + + assert_generate_strings("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", 500); + assert_generate_strings( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + 500, + ); assert_generate_strings("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", 1000); - //((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,5} + Ok(()) } + #[test] + fn test_generate_strings_offset() -> Result<(), String> { + assert_generate_strings_offset(".{900}"); + 
assert_generate_strings_offset("[a-z]+"); + assert_generate_strings_offset("[a-z]+@"); + + assert_generate_strings_offset("[0-9]+[A-Z]*"); + assert_generate_strings_offset("a+(ba+)*"); + assert_generate_strings_offset("((a|bc)*|d)"); + assert_generate_strings_offset(".*"); + assert_generate_strings_offset("(ac|ads|a)*"); + assert_generate_strings_offset("((aad|ads|a)*|q)"); + + assert_generate_strings_offset( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_generate_strings_offset("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@"); + assert_generate_strings_offset( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + assert_generate_strings_offset("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)"); + + Ok(()) + } + + fn assert_generate_strings_offset(regex: &str) { + println!("regex: {regex}"); + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + // Generate 30 strings at once + let all_strings = automaton.generate_strings(30, 0).unwrap(); + + //println!("all_strings {:?}", all_strings); + + // Generate the same 30 strings in chunks of 10 + let chunk1 = automaton.generate_strings(10, 0).unwrap(); + let chunk2 = automaton.generate_strings(10, 10).unwrap(); + let chunk3 = automaton.generate_strings(10, 20).unwrap(); + + /* + println!("chunk1 {:?}", chunk1); + println!("chunk2 {:?}", chunk2); + println!("chunk3 {:?}", chunk3); + */ + + assert_eq!(all_strings.len(), 30, "Should generate exactly 30 strings"); + assert_eq!(chunk1.len(), 10); + assert_eq!(chunk2.len(), 10); + assert_eq!(chunk3.len(), 10); + + // Combine the chunks + let mut combined = chunk1; + combined.extend(chunk2); + combined.extend(chunk3); + + // Prove that generating in chunks perfectly matches the bulk generation + assert_eq!( + 
all_strings, combined, + "Chunked generation did not match bulk generation" + ); + + let cardinality = automaton.cardinality().unwrap(); + + if let Cardinality::Integer(count) = cardinality { + let empty_chunk = automaton.generate_strings(10, count as usize).unwrap(); + assert!(empty_chunk.is_empty(), "Chunk past limits should be empty"); + } + } + fn assert_generate_strings(regex: &str, number: usize) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - println!("{}", automaton.get_number_of_states()); - //automaton.to_dot(); + let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let strings = automaton.generate_strings(number).unwrap(); - let mut strings: Vec<_> = strings.iter().collect(); - strings.sort_unstable(); + // Modified to include an offset of 0 + let strings = automaton.generate_strings(number, 0).unwrap(); println!("nb of strings: {}/{}", strings.len(), number); assert!(number >= strings.len()); for string in strings { - if !re.is_match(string) { + if !re.is_match(&string) { for byte in string.as_bytes() { print!("{:02x} ", byte); } panic!("'{string}'") } - assert!(re.is_match(string), "'{string}'"); + assert!(re.is_match(&string), "'{string}'"); } } } diff --git a/src/fast_automaton/mod.rs b/src/fast_automaton/mod.rs index 6d6fcbc..5599976 100644 --- a/src/fast_automaton/mod.rs +++ b/src/fast_automaton/mod.rs @@ -1,29 +1,30 @@ -use crate::Range; +use crate::error::EngineError; use ahash::{AHashMap, HashSetExt}; use condition::Condition; use regex_charclass::CharacterClass; use spanning_set::SpanningSet; -use std::collections::hash_map::Entry; use std::collections::VecDeque; +use std::collections::hash_map::Entry; use std::fmt::Display; -use crate::{IntMap, IntSet}; +use super::*; -pub(crate) type State = usize; pub(crate) type Transitions = IntMap; +/// The identifier of a state in a [`FastAutomaton`]. 
+pub type State = usize; + mod analyze; mod builder; pub mod condition; mod convert; mod generate; mod operation; -#[cfg(feature = "serde")] -mod serializer; pub mod spanning_set; -/// Represent a finite state automaton. +/// Represents a finite-state automaton. #[derive(Clone, Debug, PartialEq, Eq)] +#[must_use = "non-`_mut` operations return a new automaton"] pub struct FastAutomaton { transitions: Vec, transitions_in: IntMap>, @@ -32,37 +33,49 @@ pub struct FastAutomaton { removed_states: IntSet, spanning_set: SpanningSet, deterministic: bool, - cyclic: bool, + minimal: bool, +} + +/// Returned by [`FastAutomaton::try_add_transition`] when adding the requested +/// condition would turn a DFA into an NFA. The automaton is left unchanged. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DeterminismLost; + +impl std::fmt::Display for DeterminismLost { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "adding the transition would introduce overlapping conditions" + ) + } } +impl std::error::Error for DeterminismLost {} + impl Display for FastAutomaton { fn fmt(&self, sb: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(sb, "digraph Automaton {{")?; writeln!(sb, "\trankdir = LR;")?; - for from_state in self.transitions_iter() { - write!(sb, "\t{}", from_state)?; + for from_state in self.states() { + write!(sb, "\t{from_state}")?; if self.accept_states.contains(&from_state) { - writeln!(sb, "\t[shape=doublecircle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=doublecircle,label=\"{from_state}\"];")?; } else { - writeln!(sb, "\t[shape=circle,label=\"{}\"];", from_state)?; + writeln!(sb, "\t[shape=circle,label=\"{from_state}\"];")?; } if self.start_state == from_state { writeln!(sb, "\tinitial [shape=plaintext,label=\"\"];")?; - writeln!(sb, "\tinitial -> {}", from_state)?; + writeln!(sb, "\tinitial -> {from_state}")?; } - for (to_state, cond) in self.transitions_from_state_enumerate_iter(&from_state) { 
- writeln!( - sb, - "\t{} -> {} [label=\"{}\"]", - from_state, - to_state, - cond.to_range(&self.spanning_set) - .expect("Cannot convert condition to range.") - .to_regex() - .replace('\\', "\\\\") - .replace('"', "\\\"") - )?; + for (cond, to_state) in self.transitions_from(from_state) { + // The automata most worth printing are the broken ones: + // never panic mid-format, label desynced conditions instead. + let label = match cond.to_range(&self.spanning_set) { + Ok(range) => range.to_regex().replace('\\', "\\\\").replace('"', "\\\""), + Err(_) => String::from(""), + }; + writeln!(sb, "\t{from_state} -> {to_state} [label=\"{label}\"]")?; } } write!(sb, "}}") @@ -73,10 +86,11 @@ impl FastAutomaton { #[inline] fn assert_state_exists(&self, state: State) { if !self.has_state(state) { - panic!("The state {} does not exist", state); + panic!("The state {state} does not exist"); } } + /// Returns the number of transitions to the provided state. #[inline] pub fn in_degree(&self, state: State) -> usize { self.transitions_in @@ -85,74 +99,94 @@ impl FastAutomaton { .len() } + /// Returns the number of transitions from the provided state. + /// Returns `0` if the state does not exist. #[inline] pub fn out_degree(&self, state: State) -> usize { - self.transitions[state].len() - } - - pub fn in_transitions(&self, state: State) -> Vec<(usize, Condition)> { - let mut in_transitions = vec![]; - for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { - for (to_state, condition) in self.transitions_from_state_enumerate_vec(from_state) { - if to_state == state { - in_transitions.push((*from_state, condition)); - } - } + if !self.has_state(state) { + return 0; } - in_transitions - } - - pub fn in_states(&self, state: State) -> IntSet { - self.transitions_in - .get(&state) - .unwrap_or(&IntSet::new()) - .clone() + self.transitions[state].len() } + /// Returns an iterator over the automaton’s states. 
#[inline] - pub fn transitions_iter(&self) -> impl Iterator + '_ { + pub fn states(&self) -> impl Iterator + '_ { (0..self.transitions.len()).filter(|s| !self.removed_states.contains(s)) } + /// Returns a vector containing the automaton’s states. #[inline] - pub fn transitions_vec(&self) -> Vec { - self.transitions_iter().collect() + pub fn states_vec(&self) -> Vec { + self.states().collect() } + /// Returns an iterator over states directly reachable from the given state in one transition. + /// Returns an empty iterator if the state does not exist. #[inline] - pub fn transitions_from_state_enumerate_iter( - &self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] - .iter() - .filter(|s| !self.removed_states.contains(s.0)) + pub fn direct_states(&self, state: State) -> impl Iterator + '_ { + self.transitions.get(state).into_iter().flat_map(move |t| { + t.keys() + .copied() + .filter(|s| !self.removed_states.contains(s)) + }) } + /// Returns a vector of states directly reachable from the given state in one transition. #[inline] - pub fn transitions_from_state_enumerate_iter_mut( - &mut self, - from_state: &State, - ) -> impl Iterator { - self.transitions[*from_state] - .iter_mut() - .filter(|s| !self.removed_states.contains(s.0)) + pub fn direct_states_vec(&self, state: State) -> Vec { + self.direct_states(state).collect() } + /// Returns a vector of transitions to the given state. + pub fn transitions_to_vec(&self, state: State) -> Vec<(State, Condition)> { + // Direct `(from, state)` lookups: scanning each predecessor's whole + // out-list made this O(predecessors × out-degree), and `minimize` + // builds its inverse-transition table through here. 
+ if !self.has_state(state) { + return vec![]; + } + let mut in_transitions = vec![]; + for from_state in self.transitions_in.get(&state).unwrap_or(&IntSet::new()) { + if !self.has_state(*from_state) { + continue; + } + if let Some(condition) = self.condition(*from_state, state) { + in_transitions.push((*from_state, condition.clone())); + } + } + in_transitions + } + + /// Returns a vector of transitions from the given state. + /// Returns an empty vector if the state does not exist. #[inline] - pub fn transitions_from_state_enumerate_vec( - &self, - from_state: &State, - ) -> Vec<(State, Condition)> { - self.transitions[*from_state] - .iter() - .map(|(s, c)| (*s, c.clone())) - .filter(|s| !self.removed_states.contains(&s.0)) - .collect() + pub fn transitions_from_vec(&self, state: State) -> Vec<(Condition, State)> { + self.transitions + .get(state) + .map(|t| { + t.iter() + .map(|(s, c)| (c.clone(), *s)) + .filter(|s| !self.removed_states.contains(&s.1)) + .collect() + }) + .unwrap_or_default() + } + + /// Returns an iterator over transitions from the given state. + /// Returns an empty iterator if the state does not exist. + #[inline] + pub fn transitions_from(&self, state: State) -> impl Iterator { + self.transitions.get(state).into_iter().flat_map(move |t| { + t.iter() + .map(|(s, c)| (c, s)) + .filter(|s| !self.removed_states.contains(s.1)) + }) } + /// Returns `true` if there is a directed transition from `from_state` to `to_state`. #[inline] - pub fn does_transition_exists(&self, from_state: State, to_state: State) -> bool { + pub fn has_transition(&self, from_state: State, to_state: State) -> bool { if !self.has_state(from_state) || !self.has_state(to_state) { return false; } @@ -173,126 +207,108 @@ impl FastAutomaton { .collect() } + /// Returns the number of states in the automaton. 
#[inline] - pub fn transitions_from_state_enumerate_into_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions - .get(*from_state) // Assume transitions is a map; adjust accordingly. - .into_iter() // Creates an iterator over Option<&V> - .flat_map(|transitions| transitions.iter()) // Flattens into Iterator - .filter(move |(state, _)| !self.removed_states.contains(state)) // Filters out removed states - .map(|(state, condition)| (*state, condition.clone())) // Creates owned data; adjust if cloning is expensive - } - - #[inline] - pub fn transitions_from_state_iter( - &self, - from_state: &State, - ) -> impl Iterator + '_ { - self.transitions[*from_state] - .keys() - .cloned() - .filter(|s| !self.removed_states.contains(s)) - } - - #[inline] - pub fn transitions_from_state(&self, from_state: &State) -> Vec { - self.transitions_from_state_iter(from_state).collect() - } - - #[inline] - pub fn transitions_from_state_into_iter<'a>( - &'a self, - from_state: &State, - ) -> impl Iterator + 'a { - self.transitions[*from_state] - .clone() - .into_iter() - .filter(|s| !self.removed_states.contains(&s.0)) - } - - #[inline] - pub fn get_number_of_states(&self) -> usize { + pub fn number_of_states(&self) -> usize { self.transitions.len() - self.removed_states.len() } + /// Returns a reference to the condition of the directed transition between the two states, if any. + /// Returns `None` if either state does not exist. #[inline] - pub fn get_condition(&self, from_state: &State, to_state: &State) -> Option<&Condition> { - self.transitions[*from_state].get(to_state) + pub fn condition(&self, from_state: State, to_state: State) -> Option<&Condition> { + self.transitions + .get(from_state) + .and_then(|t| t.get(&to_state)) } + /// Returns the start state. #[inline] - pub fn get_start_state(&self) -> State { + pub fn start_state(&self) -> State { self.start_state } + /// Returns a reference to the set of accept (final) states. 
#[inline] - pub fn get_removed_states(&self) -> &IntSet { - &self.removed_states - } - - #[inline] - pub fn get_accept_states(&self) -> &IntSet { + pub fn accept_states(&self) -> &IntSet { &self.accept_states } + /// Returns a reference to the automaton's spanning set. #[inline] - pub fn get_spanning_set(&self) -> &SpanningSet { + pub fn spanning_set(&self) -> &SpanningSet { &self.spanning_set } + /// Returns `true` if the given state is one of the accept states. #[inline] - pub fn is_accepted(&self, state: &State) -> bool { - self.accept_states.contains(state) + pub fn is_accepted(&self, state: State) -> bool { + self.accept_states.contains(&state) } + /// Returns `true` if the automaton is deterministic. + /// + /// Note: this flag degrades monotonically. Once `add_transition` introduces + /// an overlapping condition, the flag flips to `false` and is not + /// re-checked by `remove_transition` or `remove_state`. The automaton may + /// in fact be deterministic again after such removals; call + /// [`determinize`](Self::determinize) if you need a fresh DFA. #[inline] - pub fn is_determinitic(&self) -> bool { + pub fn is_deterministic(&self) -> bool { self.deterministic } + /// Returns `true` if the automaton is minimal. #[inline] - pub fn is_cyclic(&self) -> bool { - self.cyclic + pub fn is_minimal(&self) -> bool { + self.minimal } + /// Returns `true` if the automaton contains the given state. #[inline] pub fn has_state(&self, state: State) -> bool { !(state >= self.transitions.len() || self.removed_states.contains(&state)) } - pub fn match_string(&self, input: &str) -> bool { - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); - worklist.push_back((0, &self.start_state)); + /// Returns `true` if the automaton matches the given string. 
+ #[tracing::instrument(level = "debug", skip(self, string), fields(states = self.number_of_states(), string_len=string.len()))] + pub fn is_match(&self, string: &str) -> bool { + let mut current: IntSet = IntSet::default(); + current.insert(self.start_state); - while let Some((position, current_state)) = worklist.pop_back() { - if input.len() == position { - if self.accept_states.contains(current_state) { - return true; - } - continue; + let mut next: IntSet = IntSet::default(); + for c in string.chars() { + if current.is_empty() { + return false; } - let curr_char = input.chars().nth(position).unwrap() as u32; - for (to_state, cond) in self.transitions_from_state_enumerate_iter(current_state) { - if cond.has_character(&curr_char, &self.spanning_set).unwrap() { - if position + 1 == input.len() { - if self.accept_states.contains(to_state) { - return true; - } - } else { - worklist.push_back((position + 1, to_state)); + let c_u32 = c as u32; + next.clear(); + for &state in ¤t { + for (cond, to_state) in self.transitions_from(state) { + if cond + .has_character(&c_u32, &self.spanning_set) + .unwrap_or(false) + { + next.insert(*to_state); } } } + std::mem::swap(&mut current, &mut next); } - false + + current.iter().any(|s| self.accept_states.contains(s)) } + /// Returns the automaton's DOT representation. #[inline] - pub fn to_dot(&self) { - println!("{}", self); + pub fn to_dot(&self) -> String { + format!("{self}") + } + + /// Prints the automaton's DOT representation. + #[inline] + pub fn print_dot(&self) { + println!("{self}"); } } @@ -315,4 +331,41 @@ mod tests { assert!(automaton.is_total()); Ok(()) } + + fn assert_send() {} + fn assert_sync() {} + + #[test] + fn test_traits() -> Result<(), String> { + assert_send::(); + assert_sync::(); + + Ok(()) + } + + // Regression: read-only query methods used to directly index + // `self.transitions[state]` without checking `has_state` first, panicking + // on out-of-range inputs. 
They now return gracefully (0 / None / empty + // iterator). + #[test] + fn out_degree_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert_eq!(a.out_degree(999), 0); + } + + #[test] + fn condition_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert!(a.condition(999, 0).is_none()); + assert!(a.condition(0, 999).is_none()); + } + + #[test] + fn direct_states_safe_on_unknown_state() { + let a = FastAutomaton::new_total(); + assert_eq!(a.direct_states(999).count(), 0); + assert_eq!(a.transitions_from(999).count(), 0); + assert!(a.transitions_from_vec(999).is_empty()); + assert!(a.direct_states_vec(999).is_empty()); + } } diff --git a/src/fast_automaton/operation/alternation.rs b/src/fast_automaton/operation/alternation.rs deleted file mode 100644 index 06c386e..0000000 --- a/src/fast_automaton/operation/alternation.rs +++ /dev/null @@ -1,274 +0,0 @@ -use std::hash::BuildHasherDefault; - -use condition::converter::ConditionConverter; - -use crate::error::EngineError; - -use super::*; - -impl FastAutomaton { - pub fn union(&self, that: &FastAutomaton) -> Result { - let mut union = self.clone(); - union.alternate(that)?; - Ok(union) - } - - pub fn alternation(automatons: Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } - let mut new_automaton = FastAutomaton::new_empty(); - if automatons.is_empty() { - return Ok(new_automaton); - } - for automaton in automatons { - new_automaton.alternate(&automaton)?; - } - Ok(new_automaton) - } - - fn prepare_start_states( - &mut self, - other: &FastAutomaton, - new_states: &mut IntMap, - condition_converter: &ConditionConverter, - ) -> Result, EngineError> { - let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); - let self_start_state_in_degree = self.in_degree(self.start_state); - let other_start_state_in_degree = other.in_degree(other.start_state); - if self_start_state_in_degree == 0 && other_start_state_in_degree == 
0 { - // The start states can be the same state without any consequence - new_states.insert(other.start_state, self.start_state); - imcomplete_states.insert(self.start_state); - } else { - if self_start_state_in_degree != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for (to_state, cond) in self.transitions_from_state_enumerate_vec(&self.start_state) - { - self.add_transition_to(new_state, to_state, &cond); - } - self.start_state = new_state; - } - if other_start_state_in_degree != 0 { - let new_state = self.new_state(); - if other.is_accepted(&other.start_state) { - self.accept(new_state); - self.accept(self.start_state); - } - - new_states.insert(other.start_state, new_state); - imcomplete_states.insert(new_state); - - for (other_to_state, cond) in - other.transitions_from_state_enumerate_vec(&other.start_state) - { - let cond = condition_converter.convert(&cond)?; - let to_state = match new_states.entry(other_to_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - imcomplete_states.insert(new_state); - v.insert(new_state); - new_state - } - }; - self.add_transition_to(self.start_state, to_state, &cond); - } - } - } - Ok(imcomplete_states) - } - - fn prepare_accept_states( - &mut self, - other: &FastAutomaton, - new_states: &mut IntMap, - imcomplete_states: &IntSet, - ) { - let mut self_accept_states_without_outgoing_edges = vec![]; - for &state in &self.accept_states { - if self.out_degree(state) == 0 && !imcomplete_states.contains(&state) { - self_accept_states_without_outgoing_edges.push(state); - } - } - let accept_state_without_outgoing_edges = - match self_accept_states_without_outgoing_edges.len() { - 1 => self_accept_states_without_outgoing_edges[0], - n if n > 1 => { - let new_state = self.new_state(); - self.accept(new_state); - - for &accept_state in &self_accept_states_without_outgoing_edges { - for (from_state, condition) in 
self.in_transitions(accept_state) { - self.add_transition_to(from_state, new_state, &condition); - } - self.remove_state(accept_state); - } - new_state - } - _ => { - let new_state = self.new_state(); - self.accept(new_state); - new_state - } - }; - - for &state in &other.accept_states { - if other.out_degree(state) == 0 { - new_states - .entry(state) - .or_insert(accept_state_without_outgoing_edges); - } else if new_states.get(&state).is_none() { - let new_accept_state = self.new_state(); - self.accept(new_accept_state); - new_states.insert(state, new_accept_state); - } - } - } - - /* Important things to remember before modifying this method: - * - the start states can't be merged if they have incoming edges - * - the accept states can't be merged if they have outgoing edges - */ - fn alternate(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { - if other.is_empty() || self.is_total() { - return Ok(()); - } else if other.is_total() { - self.make_total(); - return Ok(()); - } else if self.is_empty() { - self.apply_model(other); - return Ok(()); - } - - let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); - self.apply_new_spanning_set(new_spanning_set)?; - let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; - - let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), - BuildHasherDefault::default(), - ); - - let imcomplete_states = - self.prepare_start_states(other, &mut new_states, &condition_converter)?; - self.prepare_accept_states(other, &mut new_states, &imcomplete_states); - - for from_state in other.transitions_iter() { - let new_from_state = match new_states.entry(from_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - v.insert(new_state); - new_state - } - }; - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { - let new_condition = 
condition_converter.convert(condition)?; - let new_to_state = match new_states.entry(*to_state) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let new_state = self.new_state(); - v.insert(new_state); - new_state - } - }; - self.add_transition_to(new_from_state, new_to_state, &new_condition); - } - } - self.cyclic = self.cyclic || other.cyclic; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - #[test] - fn test_simple_alternation_regex_1() -> Result<(), String> { - let automaton = RegularExpression::new("(abc|ac|aaa)") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("abc")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_2() -> Result<(), String> { - let automaton = RegularExpression::new("(b?|b{2})") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("b")); - assert!(automaton.match_string("bb")); - assert!(!automaton.match_string("bbb")); - assert!(!automaton.match_string("bbbb")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_3() -> Result<(), String> { - let automaton = RegularExpression::new("((a|bc)*|d)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("abcaaabcbc")); - assert!(automaton.match_string("d")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("abcd")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_4() -> Result<(), 
String> { - let automaton = RegularExpression::new("(a+(ba+)*|ca*c)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("cc")); - assert!(automaton.match_string("caaac")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aababa")); - Ok(()) - } - - #[test] - fn test_simple_alternation_regex_5() -> Result<(), String> { - let automaton = RegularExpression::new("((aad|ads|a)*|q)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("q")); - assert!(automaton.match_string("aad")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aadadsaaa")); - assert!(!automaton.match_string("aaaas")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("adsq")); - assert!(!automaton.match_string("qq")); - Ok(()) - } -} diff --git a/src/fast_automaton/operation/concat.rs b/src/fast_automaton/operation/concat.rs new file mode 100644 index 0000000..34ee4ae --- /dev/null +++ b/src/fast_automaton/operation/concat.rs @@ -0,0 +1,610 @@ +use std::hash::BuildHasherDefault; + +use condition::converter::ConditionConverter; + +use crate::error::EngineError; + +use super::*; + +impl FastAutomaton { + /// Computes the concatenation between `self` and `other`. + pub fn concat(&self, other: &FastAutomaton) -> Result { + Self::concat_all([self, other]) + } + + /// Computes the concatenation of all automata in the given iterator. 
+ #[tracing::instrument(level = "debug", skip_all)] + pub fn concat_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { + let mut new_automaton = FastAutomaton::new_empty_string(); + for automaton in automata { + new_automaton.concat_mut(automaton)?; + } + + Ok(new_automaton) + } + + pub(crate) fn concat_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + self.concat_mut_with(other, false) + } + + /// Concatenation where `force_no_merge` prevents merging `other`'s start + /// state into `self`'s accept states, always introducing a fresh start + /// state for `other` reached by epsilon transitions. Used by `repeat` to + /// keep accept states "clean" when they must remain accepting (so they do + /// not inherit the next copy's transitions). + pub(crate) fn concat_mut_with( + &mut self, + other: &FastAutomaton, + force_no_merge: bool, + ) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.concat_state_count_heuristic(other))?; + + if other.is_empty() { + self.make_empty(); + return Ok(()); + } else if other.is_empty_string() { + return Ok(()); + } + + if self.is_empty() { + self.make_empty(); + return Ok(()); + } else if self.is_empty_string() { + self.apply_model(other); + return Ok(()); + } + + let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); + self.apply_new_spanning_set(new_spanning_set)?; + let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; + + let mut new_states: IntMap = IntMap::with_capacity_and_hasher( + other.number_of_states(), + BuildHasherDefault::default(), + ); + + let start_state_and_accept_states_not_mergeable = force_no_merge + || (other.in_degree(other.start_state) > 0 + && self + .accept_states + .iter() + .cloned() + .any(|s| self.out_degree(s) > 0)); + + let accept_states = self.accept_states.iter().cloned().collect::>(); + + self.accept_states.clear(); + + if other.accept_states.contains(&other.start_state) { + for 
&accept_state in accept_states.iter() { + self.accept(accept_state); + } + } + + if start_state_and_accept_states_not_mergeable { + let new_start_state = new_states + .entry(other.start_state) + .or_insert(self.new_state()); + if other.accept_states.contains(&other.start_state) { + self.accept(*new_start_state); + } + } + + for from_state in other.states() { + let new_from_states = match new_states.entry(from_state) { + Entry::Occupied(o) => { + vec![*o.get()] + } + Entry::Vacant(v) => { + if from_state == other.start_state { + accept_states.clone() + } else { + let new_state = self.new_state(); + if other.accept_states.contains(&from_state) { + self.accept(new_state); + } + v.insert(new_state); + vec![new_state] + } + } + }; + + for (condition, to_state) in other.transitions_from(from_state) { + let new_to_states = match new_states.entry(*to_state) { + Entry::Occupied(o) => { + vec![*o.get()] + } + Entry::Vacant(v) => { + if *to_state == other.start_state { + accept_states.clone() + } else { + let new_state = self.new_state(); + if other.accept_states.contains(to_state) { + self.accept(new_state); + } + v.insert(new_state); + vec![new_state] + } + } + }; + let projected_condition = condition_converter.convert(condition)?; + for new_from_state in new_from_states.iter() { + for new_to_state in new_to_states.iter() { + self.add_transition(*new_from_state, *new_to_state, &projected_condition); + } + } + } + } + + if start_state_and_accept_states_not_mergeable + && let Some(&other_start_state) = new_states.get(&other.start_state) + { + for accept_state in &accept_states { + self.add_epsilon_transition(*accept_state, other_start_state); + } + } + + self.minimal = false; + Ok(()) + } + + pub(crate) fn concat_state_count_heuristic(&self, other: &FastAutomaton) -> usize { + if other.is_empty() { + return 1; + } else if other.is_empty_string() { + return self.number_of_states(); + } + + if self.is_empty() { + return 1; + } else if self.is_empty_string() { + return 
other.number_of_states(); + } + + // Determine if we are forced to create a new state to avoid unintended loops + let start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 + && self + .accept_states + .iter() + .cloned() + .any(|s| self.out_degree(s) > 0); + + let v1 = self.number_of_states(); + let v2 = other.number_of_states(); + + // Apply the heuristic + if start_state_and_accept_states_not_mergeable { + v1 + v2 + } else { + v1 + v2 - 1 + } + } +} + +#[cfg(test)] +mod tests { + use crate::{fast_automaton::FastAutomaton, regex::RegularExpression}; + + #[test] + fn bug_concat_empty_left() { + let e = FastAutomaton::new_empty(); + let t = FastAutomaton::new_total(); + let r = e.concat(&t).unwrap(); + assert!(r.is_empty(), "∅ · Σ* must be ∅, got something non-empty"); + } + + #[test] + fn bug_concat_empty_right() { + let e = FastAutomaton::new_empty(); + let t = FastAutomaton::new_total(); + let r = t.concat(&e).unwrap(); + assert!(r.is_empty(), "Σ* · ∅ must be ∅, got something non-empty"); + } + + #[test] + fn bug_term_concat_with_empty() { + use crate::Term; + let a = Term::from_automaton( + RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(), + ); + let e = Term::from_automaton(FastAutomaton::new_empty()); + let r = a.concat(&[e]).unwrap(); + assert!(r.is_empty().unwrap(), "'abc' · ∅ must be ∅"); + } + + #[test] + fn test_simple_concatenation_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + + automaton.print_dot(); + assert!(automaton.is_match("abc")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_concat_alternation_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("0101(abc|ac|aaa)", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("0101abc")); + 
assert!(automaton.is_match("0101ac")); + assert!(automaton.is_match("0101aaa")); + assert!(!automaton.is_match("abc")); + assert!(!automaton.is_match("0101abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_concat_repeat_regex() -> Result<(), String> { + let automaton = RegularExpression::parse("A+B*", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("AAABBB")); + assert!(automaton.is_match("AA")); + assert!(automaton.is_match("AB")); + assert!(!automaton.is_match("B")); + assert!(!automaton.is_match("ABA")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_01() -> Result<(), String> { + let automaton = RegularExpression::parse("a+", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaaaaaa")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("")); + + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_02() -> Result<(), String> { + let automaton = RegularExpression::parse("a*c", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("c")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aac")); + assert!(automaton.is_match("aaaaaaac")); + assert!(!automaton.is_match("abc")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_03() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab){3,4}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("ababab")); + assert!(automaton.is_match("abababab")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("abab")); + assert!(!automaton.is_match("ababababab")); + Ok(()) + } + + 
#[test] + fn test_simple_repeat_regex_04() -> Result<(), String> { + let automaton = RegularExpression::parse("a{3,}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aaaaa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_05() -> Result<(), String> { + let automaton = RegularExpression::parse("a?", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_06() -> Result<(), String> { + let automaton = RegularExpression::parse("a{0,2}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(!automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_07() -> Result<(), String> { + let automaton = RegularExpression::parse("a{1,3}", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("aaaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_08() -> Result<(), String> { + let automaton = RegularExpression::parse("a+(ba+)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(!automaton.is_match("aab")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aaa")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("aaba")); + assert!(automaton.is_match("aabaaa")); + 
assert!(automaton.is_match("aaabaaabaaba")); + assert!(!automaton.is_match("aaabbaa")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_09() -> Result<(), String> { + let automaton = RegularExpression::parse("(ac|ads|a)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("acaadsac")); + assert!(automaton.is_match("adsaaaaaaaacaa")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("c")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_10() -> Result<(), String> { + let automaton = RegularExpression::parse("(ef|ads|a)+", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(!automaton.is_match("")); + assert!(automaton.is_match("ef")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("efadsa")); + assert!(automaton.is_match("aaadsefef")); + assert!(!automaton.is_match("as")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("e")); + assert!(!automaton.is_match("ds")); + assert!(!automaton.is_match("d")); + assert!(!automaton.is_match("s")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_11() -> Result<(), String> { + let automaton = RegularExpression::parse("(a|bc)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("bc")); + assert!(automaton.is_match("abcbca")); + assert!(automaton.is_match("bcabcbcaaaa")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("c")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_12() -> Result<(), 
String> { + let automaton = RegularExpression::parse("([ab]*a)?", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); + Ok(()) + } + + #[test] + fn test_simple_repeat_regex_13() -> Result<(), String> { + let automaton = RegularExpression::parse("([ab]*a)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aa")); + assert!(automaton.is_match("ba")); + assert!(automaton.is_match("aba")); + assert!(automaton.is_match("abbaabbaba")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("abab")); + Ok(()) + } + + #[test] + fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { + let automaton = RegularExpression::parse("a*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert_eq!(1, automaton.number_of_states()); + Ok(()) + } + + #[test] + fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(a*bc)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert_eq!(3, automaton.number_of_states()); + Ok(()) + } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic(".{900}", "[a-z]+"); + + assert_heuristic("[a-z]+@", "[0-9]+[A-Z]*"); + + assert_heuristic("a+(ba+)*", "((a|bc)*|d)"); + + assert_heuristic(".*", "(ac|ads|a)*"); + + assert_heuristic( + "((aad|ads|a)*|q)", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic( + 
"(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", ".*"); + + assert_heuristic( + ".{900}", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + Ok(()) + } + + fn assert_heuristic(regex1: &str, regex2: &str) { + println!( + "Testing concat heuristic for: '{}' and '{}'", + regex1, regex2 + ); + + let automaton1 = RegularExpression::parse(regex1, false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton2 = RegularExpression::parse(regex2, false) + .unwrap() + .to_automaton() + .unwrap(); + + // Helper closure to run the test and assert + let test_pair = |a1: &FastAutomaton, a2: &FastAutomaton, desc: &str| { + let mut actual_concat = a1.clone(); + + // Execute the actual mutation + actual_concat.concat_mut(a2).unwrap(); + + let actual_states = actual_concat.number_of_states(); + let heuristic_states = a1.concat_state_count_heuristic(a2); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for {}.\nExpected (heuristic): {}\nActual (computed): {}", + desc, heuristic_states, actual_states + ); + }; + + // Test 1: regex1 + regex2 + test_pair( + &automaton1, + &automaton2, + &format!("'{}' + '{}'", regex1, regex2), + ); + + // Test 2: regex2 + regex1 (Reverse order) + test_pair( + &automaton2, + &automaton1, + &format!("'{}' + '{}'", regex2, regex1), + ); + + // Test 3: regex1 + regex1 (Self-concatenation, crucial for your repeat logic) + test_pair( + &automaton1, + &automaton1, + &format!("'{}' + '{}' (Self)", regex1, regex1), + ); + + // Test 4 & 5: Empty automaton edge cases + let empty_automaton = FastAutomaton::new_empty(); + + test_pair( + &empty_automaton, + &automaton2, + &format!("Empty + '{}'", regex2), + ); + test_pair( + 
&automaton1, + &empty_automaton, + &format!("'{}' + Empty", regex1), + ); + } +} +//(a|bc)* diff --git a/src/fast_automaton/operation/concatenate.rs b/src/fast_automaton/operation/concatenate.rs deleted file mode 100644 index 3741e01..0000000 --- a/src/fast_automaton/operation/concatenate.rs +++ /dev/null @@ -1,520 +0,0 @@ -use std::hash::BuildHasherDefault; - -use condition::converter::ConditionConverter; - -use crate::error::EngineError; - -use super::*; - -impl FastAutomaton { - pub fn concatenate(automatons: Vec) -> Result { - if automatons.len() == 1 { - return Ok(automatons[0].clone()); - } - let mut new_automaton = FastAutomaton::new_empty_string(); - if automatons.is_empty() { - return Ok(new_automaton); - } - for automaton in automatons { - new_automaton.concat(&automaton)?; - } - - Ok(new_automaton) - } - - pub fn repeat(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { - if let Some(max) = max_opt { - if min > max { - self.make_empty(); - return Ok(()); - } - } - - let automaton_to_repeat = self.clone(); - - if min == 0 && self.in_degree(self.start_state) != 0 { - let new_state = self.new_state(); - if self.is_accepted(&self.start_state) { - self.accept(new_state); - } - - for to_state in self.transitions_from_state(&self.start_state) { - self.add_epsilon(new_state, to_state); - } - self.start_state = new_state; - - if max_opt.is_none() { - for accept_state in self.accept_states.clone() { - self.add_epsilon(accept_state, self.start_state); - } - self.accept(self.start_state); - return Ok(()); - } - } - - if let Some(max) = max_opt { - if min <= 1 && max == 1 { - if min == 0 { - self.accept_states.insert(self.start_state); - } - return Ok(()); - } - } - - let iter = if min == 0 { 0..0 } else { 0..min - 1 }; - for _ in iter { - self.concat(&automaton_to_repeat)?; - } - - if max_opt.is_none() { - let mut automaton_to_repeat = automaton_to_repeat.clone(); - - let accept_state = *automaton_to_repeat.accept_states.iter().next().unwrap(); - if 
automaton_to_repeat.accept_states.len() == 1 - && automaton_to_repeat.out_degree(accept_state) == 0 - && automaton_to_repeat.in_degree(automaton_to_repeat.start_state) == 0 - { - automaton_to_repeat.add_epsilon(accept_state, automaton_to_repeat.start_state); - let old_start_state = automaton_to_repeat.start_state; - automaton_to_repeat.start_state = accept_state; - automaton_to_repeat.remove_state(old_start_state); - } else { - let t = Self::transitions_from_state_set( - &automaton_to_repeat.transitions, - automaton_to_repeat.start_state, - ); - let transitions = - Self::transitions_from_state_enumerate(&t, &automaton_to_repeat.removed_states); - - for state in automaton_to_repeat.accept_states.clone() { - for &(to_state, condition) in &transitions { - automaton_to_repeat.add_transition_to(state, *to_state, condition); - } - } - - automaton_to_repeat.accept(automaton_to_repeat.get_start_state()); - } - automaton_to_repeat.cyclic = true; - - if min == 0 { - self.apply_model(&automaton_to_repeat); - } else { - self.concat(&automaton_to_repeat)?; - } - - return Ok(()); - } - - let mut end_states = self.accept_states.iter().cloned().collect::>(); - for _ in cmp::max(min, 1)..max_opt.unwrap() { - self.concat(&automaton_to_repeat)?; - end_states.extend(self.accept_states.iter()); - } - self.accept_states.extend(end_states); - if min == 0 { - self.accept(self.start_state); - } - Ok(()) - } - - fn concat(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { - if other.is_empty() { - return Ok(()); - } - if self.is_empty() { - self.apply_model(other); - return Ok(()); - } - - let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); - self.apply_new_spanning_set(new_spanning_set)?; - let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; - - let mut new_states: IntMap = IntMap::with_capacity_and_hasher( - other.get_number_of_states(), - BuildHasherDefault::default(), - ); - - let 
start_state_and_accept_states_not_mergeable = other.in_degree(other.start_state) > 0 - && self - .accept_states - .iter() - .cloned() - .any(|s| self.out_degree(s) > 0); - - let accept_states = self.accept_states.iter().cloned().collect::>(); - - self.accept_states.clear(); - - if other.accept_states.contains(&other.start_state) { - for &accept_state in accept_states.iter() { - self.accept(accept_state); - } - } - - if start_state_and_accept_states_not_mergeable { - let new_start_state = new_states - .entry(other.start_state) - .or_insert(self.new_state()); - if other.accept_states.contains(&other.start_state) { - self.accept(*new_start_state); - } - } - - for from_state in other.transitions_iter() { - let new_from_states = match new_states.entry(from_state) { - Entry::Occupied(o) => { - vec![*o.get()] - } - Entry::Vacant(v) => { - if from_state == other.start_state { - accept_states.clone() - } else { - let new_state = self.new_state(); - if other.accept_states.contains(&from_state) { - self.accept(new_state); - } - v.insert(new_state); - vec![new_state] - } - } - }; - - for (to_state, condition) in other.transitions_from_state_enumerate_iter(&from_state) { - let new_to_states = match new_states.entry(*to_state) { - Entry::Occupied(o) => { - vec![*o.get()] - } - Entry::Vacant(v) => { - if *to_state == other.start_state { - accept_states.clone() - } else { - let new_state = self.new_state(); - if other.accept_states.contains(to_state) { - self.accept(new_state); - } - v.insert(new_state); - vec![new_state] - } - } - }; - let projected_condition = condition_converter.convert(condition)?; - for new_from_state in new_from_states.iter() { - for new_to_state in new_to_states.iter() { - self.add_transition_to( - *new_from_state, - *new_to_state, - &projected_condition, - ); - } - } - } - } - - if start_state_and_accept_states_not_mergeable { - if let Some(&other_start_state) = new_states.get(&other.start_state) { - for accept_state in &accept_states { - 
self.add_epsilon(*accept_state, other_start_state); - } - } - } - self.cyclic = self.cyclic || other.cyclic; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - #[test] - fn test_simple_concatenation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("abc") - .unwrap() - .to_automaton() - .unwrap(); - - automaton.to_dot(); - assert!(automaton.match_string("abc")); - assert!(!automaton.match_string("abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_concat_alternation_regex() -> Result<(), String> { - let automaton = RegularExpression::new("0101(abc|ac|aaa)") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("0101abc")); - assert!(automaton.match_string("0101ac")); - assert!(automaton.match_string("0101aaa")); - assert!(!automaton.match_string("abc")); - assert!(!automaton.match_string("0101abcd")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("acc")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aaaa")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_concat_repeat_regex() -> Result<(), String> { - let automaton = RegularExpression::new("A+B*") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("AAABBB")); - assert!(automaton.match_string("AA")); - assert!(automaton.match_string("AB")); - assert!(!automaton.match_string("B")); - assert!(!automaton.match_string("ABA")); - assert!(!automaton.match_string("")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_01() -> Result<(), String> { - let automaton = RegularExpression::new("a+") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaaaaaa")); - 
assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("")); - - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_02() -> Result<(), String> { - let automaton = RegularExpression::new("a*c") - .unwrap() - .to_automaton() - .unwrap(); - assert!(automaton.match_string("c")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("aac")); - assert!(automaton.match_string("aaaaaaac")); - assert!(!automaton.match_string("abc")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_03() -> Result<(), String> { - let automaton = RegularExpression::new("(ab){3,4}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("ababab")); - assert!(automaton.match_string("abababab")); - assert!(!automaton.match_string("ab")); - assert!(!automaton.match_string("abab")); - assert!(!automaton.match_string("ababababab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_04() -> Result<(), String> { - let automaton = RegularExpression::new("a{3,}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aaaaa")); - assert!(!automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_05() -> Result<(), String> { - let automaton = RegularExpression::new("a?") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_06() -> Result<(), String> { - let automaton = RegularExpression::new("a{0,2}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(!automaton.match_string("aaa")); - 
assert!(!automaton.match_string("aaaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_07() -> Result<(), String> { - let automaton = RegularExpression::new("a{1,3}") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(!automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("aaa")); - assert!(!automaton.match_string("aaaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_08() -> Result<(), String> { - let automaton = RegularExpression::new("a+(ba+)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(!automaton.match_string("")); - assert!(!automaton.match_string("aab")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aaa")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("aaba")); - assert!(automaton.match_string("aabaaa")); - assert!(automaton.match_string("aaabaaabaaba")); - assert!(!automaton.match_string("aaabbaa")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_09() -> Result<(), String> { - let automaton = RegularExpression::new("(ac|ads|a)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("ac")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("acaadsac")); - assert!(automaton.match_string("adsaaaaaaaacaa")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("c")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_10() -> Result<(), String> { - let automaton = RegularExpression::new("(ef|ads|a)+") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - 
assert!(!automaton.match_string("")); - assert!(automaton.match_string("ef")); - assert!(automaton.match_string("ads")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("efadsa")); - assert!(automaton.match_string("aaadsefef")); - assert!(!automaton.match_string("as")); - assert!(!automaton.match_string("ad")); - assert!(!automaton.match_string("e")); - assert!(!automaton.match_string("ds")); - assert!(!automaton.match_string("d")); - assert!(!automaton.match_string("s")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_11() -> Result<(), String> { - let automaton = RegularExpression::new("(a|bc)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("bc")); - assert!(automaton.match_string("abcbca")); - assert!(automaton.match_string("bcabcbcaaaa")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("c")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_12() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)?") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_regex_13() -> Result<(), String> { - let automaton = RegularExpression::new("([ab]*a)*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert!(automaton.match_string("")); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("aa")); - assert!(automaton.match_string("ba")); - assert!(automaton.match_string("aba")); - assert!(automaton.match_string("abbaabbaba")); - 
assert!(!automaton.match_string("b")); - assert!(!automaton.match_string("abab")); - Ok(()) - } - - #[test] - fn test_simple_repeat_right_number_of_states_1() -> Result<(), String> { - let automaton = RegularExpression::new("a*") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert_eq!(1, automaton.get_number_of_states()); - Ok(()) - } - - #[test] - fn test_simple_concat_right_number_of_states_2() -> Result<(), String> { - let automaton = RegularExpression::new("(a*bc)") - .unwrap() - .to_automaton() - .unwrap(); - automaton.to_dot(); - assert_eq!(3, automaton.get_number_of_states()); - Ok(()) - } -} -//(a|bc)* diff --git a/src/fast_automaton/operation/determinize.rs b/src/fast_automaton/operation/determinize.rs index 3d4057b..89f7fa7 100644 --- a/src/fast_automaton/operation/determinize.rs +++ b/src/fast_automaton/operation/determinize.rs @@ -1,98 +1,136 @@ -use ahash::HashMapExt; +use bit_set::BitSet; -use crate::{execution_profile::ThreadLocalParams, EngineError}; +use crate::{EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - pub fn determinize(&self) -> Result { + /// [`determinize`](Self::determinize) on behalf of an operation that + /// requires a deterministic automaton: when the execution profile + /// disables implicit determinization, a non-deterministic input is + /// rejected with [`EngineError::DeterministicAutomatonRequired`] instead + /// of being converted. Already-deterministic automata always pass. + pub(crate) fn determinize_implicit(&self) -> Result, EngineError> { + if !self.deterministic { + ExecutionProfile::get().assert_implicit_determinization_allowed()?; + } + self.determinize() + } + + /// Determinizes the automaton and returns the result. 
+ #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] + pub fn determinize(&self) -> Result, EngineError> { if self.deterministic { - return Ok(self.clone()); + return Ok(Cow::Borrowed(self)); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); - let ranges = self.get_ranges()?; + let bases = self.spanning_bases()?; - let initial_vec = VecDeque::from(vec![self.start_state]); + let mut worklist = VecDeque::with_capacity(self.number_of_states()); - let mut worklist = VecDeque::with_capacity(self.get_number_of_states()); + let map_capacity = (self.number_of_states() as f64 / 0.75).ceil() as usize; + let mut new_states = AHashMap::with_capacity(map_capacity); - let map_capacity = (self.get_number_of_states() as f64 / 0.75).ceil() as usize; - let mut new_states = IntMap::with_capacity(map_capacity); + let mut accept_states = BitSet::new(); + for &state in &self.accept_states { + accept_states.insert(state); + } let mut new_automaton = FastAutomaton::new_empty(); new_automaton.spanning_set = self.spanning_set.clone(); - worklist.push_back((vec![self.start_state], new_automaton.start_state)); - new_states.insert(Self::simple_hash(&initial_vec), new_automaton.start_state); + let mut initial_state = BitSet::new(); + initial_state.insert(self.start_state); - let mut new_states_to_add = VecDeque::with_capacity(self.get_number_of_states()); + worklist.push_back((initial_state.clone(), new_automaton.start_state)); + new_states.insert(initial_state, new_automaton.start_state); + + let mut new_states_to_add = BitSet::new(); while let Some((states, r)) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; - for state in &states { - if self.accept_states.contains(state) { - new_automaton.accept_states.insert(r); - break; - } + if 
!states.is_disjoint(&accept_states) { + new_automaton.accept(r); } - for base in &ranges { + for base in &bases { for from_state in &states { - for (to_state, cond) in self.transitions_from_state_enumerate_iter(from_state) { + for (cond, to_state) in self.transitions_from(from_state) { if cond.has_intersection(base) { - match new_states_to_add.binary_search(to_state) { - Ok(_) => {} // element already in vector @ `pos` - Err(pos) => new_states_to_add.insert(pos, *to_state), - }; + new_states_to_add.insert(*to_state); } } } if !new_states_to_add.is_empty() { - let q = match new_states.entry(Self::simple_hash(&new_states_to_add)) { - Entry::Occupied(o) => *o.get(), + match new_states.entry(new_states_to_add.clone()) { + Entry::Occupied(o) => { + let q = *o.get(); + + new_states_to_add.clear(); + + new_automaton.add_transition(r, q, base); + } Entry::Vacant(v) => { let new_q = new_automaton.new_state(); - worklist - .push_back((new_states_to_add.iter().cloned().collect(), new_q)); v.insert(new_q); - new_q + + let new_states = std::mem::take(&mut new_states_to_add); + worklist.push_back((new_states, new_q)); + + new_automaton.add_transition(r, new_q, base); } }; - - new_automaton.add_transition_to(r, q, base); } - new_states_to_add.clear(); } } - Ok(new_automaton) - } - fn simple_hash(list: &VecDeque) -> u64 { - let mut hasher = AHasher::default(); - for &item in list { - hasher.write_usize(item); - } - hasher.finish() + Ok(Cow::Owned(new_automaton)) } } #[cfg(test)] mod tests { + use crate::CharRange; + use crate::fast_automaton::FastAutomaton; + use crate::fast_automaton::condition::Condition; + use crate::fast_automaton::spanning_set::SpanningSet; use crate::regex::RegularExpression; + use regex_charclass::char::Char; + // Regression: subset construction iterates `spanning_bases`, which used + // to omit the spanning set's "rest" range. 
A transition whose condition + // lies in the rest range was therefore silently dropped, so determinizing a + // non-deterministic automaton that uses the rest range produced a DFA with + // the wrong (smaller) language. #[test] - fn test_determinize_1() -> Result<(), String> { - let automaton = RegularExpression::new(".*ab") - .unwrap() - .to_automaton() - .unwrap(); - - let deterministic_automaton = automaton.determinize().unwrap(); - - assert!(deterministic_automaton.is_determinitic()); - - Ok(()) + fn determinize_keeps_rest_range_transitions() { + let rng = |c: char| { + let c = Char::new(c); + CharRange::new_from_range(c..=c) + }; + let ss = SpanningSet::compute_spanning_set(&[rng('a'), rng('b')]); + let rest = ss.rest().clone(); + + let mut a = FastAutomaton::new_empty(); + a.apply_new_spanning_set(&ss).unwrap(); + a.new_state(); + a.add_transition(0, 1, &Condition::from_range(&rest, &ss).unwrap()); // 0 -[^ab]-> 1 + a.add_transition(1, 0, &Condition::from_range(&rng('a'), &ss).unwrap()); + a.add_transition(1, 1, &Condition::from_range(&rng('a'), &ss).unwrap()); // nondeterministic + a.accept(1); + + assert!(!a.is_deterministic()); + assert!(a.is_match("\u{0}"), "a should accept a [^ab] character"); + + let d = a.determinize().unwrap(); + assert!(d.is_deterministic()); + assert!( + d.is_match("\u{0}"), + "determinize dropped the [^ab] transition" + ); + assert!(a.equivalent(&d).unwrap()); } #[test] @@ -112,22 +150,23 @@ mod tests { fn assert_determinization(regex: &str) { println!(":{}", regex); - let automaton = RegularExpression::new(regex) + let automaton = RegularExpression::parse(regex, false) .unwrap() .to_automaton() .unwrap(); - //automaton.compute_determinization_cost(); - //println!("Determinization Cost: {:?}", automaton.determinisation_cost); - println!("States Before: {}", automaton.get_number_of_states()); + println!("States Before: {}", automaton.number_of_states()); let deterministic_automaton = automaton.determinize().unwrap(); println!( 
"States After: {}", - deterministic_automaton.get_number_of_states() + deterministic_automaton.number_of_states() + ); + assert!(deterministic_automaton.is_deterministic()); + //deterministic_automaton.print_dot(); + assert!( + automaton + .difference(&deterministic_automaton) + .unwrap() + .is_empty() ); - assert!(deterministic_automaton.is_determinitic()); - assert!(automaton - .subtraction(&deterministic_automaton) - .unwrap() - .is_empty()); } } diff --git a/src/fast_automaton/operation/difference.rs b/src/fast_automaton/operation/difference.rs new file mode 100644 index 0000000..b5410c6 --- /dev/null +++ b/src/fast_automaton/operation/difference.rs @@ -0,0 +1,116 @@ +use std::hash::BuildHasherDefault; + +use crate::EngineError; + +use super::*; + +impl FastAutomaton { + /// Totalize the automaton. Precondition: `self.deterministic` is true + /// (the only caller, `complement`, determinizes first). + fn totalize(&mut self) -> Result<(), EngineError> { + debug_assert!(self.deterministic, "totalize requires a DFA"); + + let crash_state = self.new_state(); + let mut transitions_to_crash_state: IntMap = + IntMap::with_capacity_and_hasher( + self.number_of_states(), + BuildHasherDefault::default(), + ); + + let mut ranges = Vec::with_capacity(self.number_of_states()); + for from_state in self.states() { + let mut new_condition = Condition::empty(&self.spanning_set); + for (condition, _) in self.transitions_from(from_state) { + new_condition = new_condition.union(condition); + ranges.push(condition.to_range(self.spanning_set())?); + } + + new_condition = new_condition.complement(); + + transitions_to_crash_state.insert(from_state, new_condition); + } + + for (from_state, condition) in &transitions_to_crash_state { + self.add_transition(*from_state, crash_state, condition); + ranges.push(condition.to_range(self.spanning_set())?); + } + + let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); + self.apply_new_spanning_set(&new_spanning_set)?; + + if 
self.in_degree(crash_state) == 1 { + // Only the self-loop points to crash; nothing else needs it. + self.remove_state(crash_state); + } + Ok(()) + } + + /// Complements the automaton. + /// + /// If `self` is non-deterministic, it is determinized in place first, + /// unless the execution profile disables implicit determinization, in + /// which case [`EngineError::DeterministicAutomatonRequired`] is + /// returned. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic()))] + pub fn complement(&mut self) -> Result<(), EngineError> { + if !self.deterministic { + *self = self.determinize_implicit()?.into_owned(); + } + self.totalize()?; + + let mut new_accept_states = IntSet::default(); + for state in self.states() { + if self.accept_states.contains(&state) { + continue; + } + new_accept_states.insert(state); + } + + self.accept_states = new_accept_states; + Ok(()) + } + + /// Computes the difference between `self` and `other`. + /// + /// If `other` is non-deterministic, it is determinized first, unless + /// the execution profile disables implicit determinization, in which + /// case [`EngineError::DeterministicAutomatonRequired`] is returned. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), self_deterministic = self.is_deterministic(), other_states = other.number_of_states(), other_deterministic = other.is_deterministic()))] + pub fn difference(&self, other: &FastAutomaton) -> Result { + let mut complement = other.determinize_implicit()?.into_owned(); + complement.complement()?; + self.intersection(&complement) + } +} + +#[cfg(test)] +mod tests { + use crate::fast_automaton::FastAutomaton; + use crate::regex::RegularExpression; + + // `totalize` adds a `crash_state` with a total self-loop, so the complement + // of a finite language is infinite (it matches arbitrarily long strings via + // the crash-state loop). 
+ #[test] + fn complement_of_finite_is_infinite() { + let mut a = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + + a.complement().unwrap(); + + assert!(!a.is_match("abc"), "complement must not match 'abc'"); + assert!(a.is_match("x")); + assert!(a.is_match("xx")); + assert!(a.is_match("xxxxxxxxxx")); + } + + // empty.complement() = Σ*. + #[test] + fn complement_of_empty_is_total() { + let mut a = FastAutomaton::new_empty(); + a.complement().unwrap(); + assert!(a.is_total(), "complement of ∅ must be Σ*"); + } +} diff --git a/src/fast_automaton/operation/intersection.rs b/src/fast_automaton/operation/intersection.rs index 96007e6..c89e623 100644 --- a/src/fast_automaton/operation/intersection.rs +++ b/src/fast_automaton/operation/intersection.rs @@ -1,19 +1,78 @@ +use std::borrow::Cow; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + use condition::converter::ConditionConverter; -use crate::{error::EngineError, execution_profile::ThreadLocalParams}; +use crate::{error::EngineError, execution_profile::ExecutionProfile}; use super::*; impl FastAutomaton { - pub fn intersection(&self, other: &FastAutomaton) -> Result { + /// Computes the intersection between `self` and `other`. + pub fn intersection(&self, other: &FastAutomaton) -> Result { + FastAutomaton::intersection_all([self, other]) + } + + /// Computes the intersection of all automata in the given iterator. + #[tracing::instrument(level = "debug", skip_all)] + pub fn intersection_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { + let mut result: Cow<'a, FastAutomaton> = Cow::Owned(FastAutomaton::new_total()); + + for automaton in automata { + result = result.intersection_internal(automaton)?; + + if result.is_empty() { + break; + } + } + + Ok(result.into_owned()) + } + + /// Computes in parallel the intersection of all automata in the given iterator. + /// + /// Only available with the `parallel` feature (enabled by default). 
+ #[cfg(feature = "parallel")] + #[tracing::instrument(level = "debug", skip_all)] + pub fn intersection_all_par<'a, I: IntoParallelIterator>( + automata: I, + ) -> Result { + let execution_profile = ExecutionProfile::get(); + + let total = FastAutomaton::new_total(); + + automata + .into_par_iter() + .try_fold( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(next)?.into_owned())) + }, + ) + .try_reduce( + || total.clone(), + |acc, next| { + execution_profile.apply(|| Ok(acc.intersection_internal(&next)?.into_owned())) + }, + ) + } + + fn intersection_internal<'a>( + &self, + other: &'a FastAutomaton, + ) -> Result, EngineError> { if self.is_empty() || other.is_empty() { - return Ok(Self::new_empty()); + return Ok(Cow::Owned(Self::new_empty())); } else if self.is_total() { - return Ok(other.clone()); + return Ok(Cow::Borrowed(other)); } else if other.is_total() { - return Ok(self.clone()); + return Ok(Cow::Owned(self.clone())); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -24,9 +83,9 @@ impl FastAutomaton { let mut new_automaton = FastAutomaton::new_empty(); let mut worklist = - VecDeque::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + VecDeque::with_capacity(self.number_of_states() + other.number_of_states()); let mut new_states: AHashMap<(usize, usize), (usize, usize, usize), _> = - AHashMap::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + AHashMap::with_capacity(self.number_of_states() + other.number_of_states()); let initial_pair = ( new_automaton.start_state, @@ -39,6 +98,7 @@ impl FastAutomaton { while let Some(p) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; if self.accept_states.contains(&p.1) && 
other.accept_states.contains(&p.2) { new_automaton.accept(p.0); } @@ -48,8 +108,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -64,22 +124,24 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } new_automaton.spanning_set = new_spanning_set; - new_automaton.remove_dead_transitions(); - Ok(new_automaton) + new_automaton.remove_dead_states(); + Ok(Cow::Owned(new_automaton)) } + /// Returns `true` if the two automata have a non-empty intersection. + #[tracing::instrument(level = "debug", skip_all, fields(self_states = self.number_of_states(), other_states = other.number_of_states()))] pub fn has_intersection(&self, other: &FastAutomaton) -> Result { if self.is_empty() || other.is_empty() { return Ok(false); } else if self.is_total() || other.is_total() { return Ok(true); } - let execution_profile = ThreadLocalParams::get_execution_profile(); + let execution_profile = ExecutionProfile::get(); let new_spanning_set = self.spanning_set.merge(&other.spanning_set); @@ -90,9 +152,9 @@ impl FastAutomaton { let mut new_automaton = FastAutomaton::new_empty(); let mut worklist = - VecDeque::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + VecDeque::with_capacity(self.number_of_states() + other.number_of_states()); let mut new_states: AHashMap<(usize, usize), (usize, usize, usize), _> = - AHashMap::with_capacity(self.get_number_of_states() + other.get_number_of_states()); + AHashMap::with_capacity(self.number_of_states() + other.number_of_states()); let initial_pair = ( new_automaton.start_state, @@ -105,6 
+167,7 @@ impl FastAutomaton { while let Some(p) = worklist.pop_front() { execution_profile.assert_not_timed_out()?; + execution_profile.assert_max_number_of_states(new_states.len())?; if self.accept_states.contains(&p.1) && other.accept_states.contains(&p.2) { return Ok(true); } @@ -114,8 +177,8 @@ impl FastAutomaton { let transitions_2 = other.get_projected_transitions(p.2, &condition_converter_other_to_new)?; - for (n1, condition_1) in transitions_1 { - for (n2, condition_2) in &transitions_2 { + for (condition_1, n1) in transitions_1 { + for (condition_2, n2) in &transitions_2 { let intersection = condition_1.intersection(condition_2); if intersection.is_empty() { continue; @@ -130,7 +193,7 @@ impl FastAutomaton { new_r } }; - new_automaton.add_transition_to(p.0, r.0, &intersection); + new_automaton.add_transition(p.0, r.0, &intersection); } } } @@ -141,11 +204,11 @@ impl FastAutomaton { &self, state: State, condition_converter: &ConditionConverter, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let transitions_1: Result, EngineError> = self - .transitions_from_state_enumerate_iter(&state) - .map(|(&s, c)| match condition_converter.convert(c) { - Ok(condition) => Ok((s, condition)), + .transitions_from(state) + .map(|(c, &s)| match condition_converter.convert(c) { + Ok(condition) => Ok((condition, s)), Err(err) => Err(err), }) .collect(); @@ -158,102 +221,144 @@ impl FastAutomaton { mod tests { use crate::regex::RegularExpression; + // Regression: `has_intersection` enforced the timeout but not the state + // budget, unlike `intersection`: the product pair map could grow + // unchecked. 
+ #[test] + fn has_intersection_respects_state_budget() { + use crate::error::EngineError; + use crate::execution_profile::ExecutionProfileBuilder; + + let a = RegularExpression::parse("abcd", false) + .unwrap() + .to_automaton() + .unwrap(); + let b = RegularExpression::parse("abcd", false) + .unwrap() + .to_automaton() + .unwrap(); + + let result = ExecutionProfileBuilder::new() + .max_number_of_states(2) + .build() + .run(|| a.has_intersection(&b)); + assert!(matches!( + result, + Err(EngineError::AutomatonHasTooManyStates) + )); + } + + // a* ∩ a* = a*: the intersection keeps the (infinite) looping language. + #[test] + fn intersection_keeps_infinite_language() { + let a_star = RegularExpression::parse("a*", false) + .unwrap() + .to_automaton() + .unwrap(); + + let inter = a_star.intersection(&a_star).unwrap(); + assert!(inter.is_match("")); + assert!(inter.is_match("aaaaaaaa")); + assert!(!inter.is_match("b")); + assert!(inter.equivalent(&a_star).unwrap()); + } + #[test] fn test_simple_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } #[test] fn test_simple_intersection_regex_2() -> Result<(), String> { - let automaton1 = 
RegularExpression::new("a*") + let automaton1 = RegularExpression::parse("a*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("b*") + let automaton2 = RegularExpression::parse("b*", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(!intersection.match_string("a")); - assert!(!intersection.match_string("b")); + assert!(intersection.is_match("")); + assert!(!intersection.is_match("a")); + assert!(!intersection.is_match("b")); Ok(()) } #[test] fn test_simple_intersection_regex_3() -> Result<(), String> { - let automaton1 = RegularExpression::new("x*") + let automaton1 = RegularExpression::parse("x*", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(xxx)*") + let automaton2 = RegularExpression::parse("(xxx)*", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("")); - assert!(intersection.match_string("xxx")); - assert!(intersection.match_string("xxxxxx")); - assert!(!intersection.match_string("xx")); - assert!(!intersection.match_string("xxxx")); + assert!(intersection.is_match("")); + assert!(intersection.is_match("xxx")); + assert!(intersection.is_match("xxxxxx")); + assert!(!intersection.is_match("xx")); + assert!(!intersection.is_match("xxxx")); Ok(()) } #[test] fn test_complex_intersection_regex_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*(abc|ac|aaa)") + let automaton1 = RegularExpression::parse(".*(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert!(intersection.match_string("ac")); - 
assert!(!intersection.match_string("aaac")); - assert!(!intersection.match_string("abc")); - assert!(!intersection.match_string("aaa")); - assert!(!intersection.match_string("abcd")); - assert!(!intersection.match_string("aba")); + assert!(intersection.is_match("ac")); + assert!(!intersection.is_match("aaac")); + assert!(!intersection.is_match("abc")); + assert!(!intersection.is_match("aaa")); + assert!(!intersection.is_match("abcd")); + assert!(!intersection.is_match("aba")); Ok(()) } #[test] fn test_complex_intersection_regex_2() -> Result<(), String> { - let automaton1 = RegularExpression::new("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])") + let automaton1 = RegularExpression::parse("(?:[a-z0-9]+(?:\\.[a-z0-9]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", false) .unwrap() .to_automaton().unwrap(); - let automaton2 = RegularExpression::new("avb@.*") + let automaton2 = RegularExpression::parse("avb@.*", false) .unwrap() .to_automaton() .unwrap(); - automaton1.to_dot(); - automaton2.to_dot(); + automaton1.print_dot(); + automaton2.print_dot(); let intersection = automaton1.intersection(&automaton2).unwrap(); assert!(!intersection.is_empty()); - 
assert!(intersection.match_string("avb@gmail.com")); + assert!(intersection.is_match("avb@gmail.com")); Ok(()) } } diff --git a/src/fast_automaton/operation/minimize.rs b/src/fast_automaton/operation/minimize.rs new file mode 100644 index 0000000..13cbd63 --- /dev/null +++ b/src/fast_automaton/operation/minimize.rs @@ -0,0 +1,274 @@ +use crate::execution_profile::ExecutionProfile; + +use super::*; + +impl FastAutomaton { + /// Minimizes the automaton using Hopcroft's Algorithm. + /// + /// If `self` is non-deterministic, it is determinized in place first, + /// unless the [`ExecutionProfile`] disables implicit determinization, in + /// which case [`EngineError::DeterministicAutomatonRequired`] is + /// returned. + #[tracing::instrument(level = "debug", skip_all, fields(states = self.number_of_states(), deterministic = self.is_deterministic(), minimal = self.is_minimal()))] + pub fn minimize(&mut self) -> Result<(), EngineError> { + // The `minimal` flag is conservatively cleared on every mutation, so + // it can be trusted here; this also keeps the + // `minimize_after_determinization` profile from paying a second + // Hopcroft pass when callers minimize an already-minimized result. + if self.minimal { + return Ok(()); + } + if !self.deterministic { + *self = self.determinize_implicit()?.into_owned(); + } + let execution_profile = ExecutionProfile::get(); + + // Drop states unreachable from the start. A minimal automaton has none, + // and downstream invariants rely on it; in particular `is_empty`'s + // fast path treats any minimal automaton with an accept state as + // non-empty, which only holds if every accept state is reachable. 
+ let reachable = self.forward_reachable_states(); + let unreachable: IntSet = self.states().filter(|s| !reachable.contains(s)).collect(); + if !unreachable.is_empty() { + self.remove_states(&unreachable); + } + + let max_states = self.transitions.len(); + + let all_states: IntSet = self.states().collect(); + let accept_states: IntSet = self.accept_states().iter().cloned().collect(); + let non_accept_states: IntSet = + all_states.difference(&accept_states).cloned().collect(); + + let mut partitions: Vec> = vec![accept_states, non_accept_states]; + partitions.retain(|p| !p.is_empty()); + + let mut state_to_partition = vec![0; max_states]; + for (i, partition) in partitions.iter().enumerate() { + for &state in partition { + state_to_partition[state] = i; + } + } + + let mut worklist: Vec = (0..partitions.len()).collect(); + let mut in_worklist: Vec = vec![true; max_states]; + + let bases = self.spanning_bases()?; + + let mut inverse_transitions: Vec> = vec![Vec::new(); max_states]; + for to_state in self.states() { + for (from_state, condition) in self.transitions_to_vec(to_state) { + inverse_transitions[to_state].push((from_state, condition)); + } + } + + let mut x = IntSet::with_capacity(self.number_of_states()); + + let mut intersection_states: Vec> = vec![Vec::new(); max_states]; + let mut touched_partitions: Vec = Vec::with_capacity(max_states); + + while let Some(a_idx) = worklist.pop() { + execution_profile.assert_not_timed_out()?; + in_worklist[a_idx] = false; + + let a = partitions[a_idx].clone(); + + for base in &bases { + x.clear(); + + // Find states that transition into partition 'A' on 'base' + for &to_state in &a { + for (from_state, condition) in &inverse_transitions[to_state] { + if base.has_intersection(condition) { + x.insert(*from_state); + } + } + } + + if x.is_empty() { + continue; + } + + // TARGETED SPLITTING: Only evaluate partitions we know overlap with 'x' + for &state in &x { + let p_idx = state_to_partition[state]; + if 
intersection_states[p_idx].is_empty() { + touched_partitions.push(p_idx); + } + intersection_states[p_idx].push(state); + } + + // Process only the affected partitions + for &p_idx in &touched_partitions { + let int_states = &mut intersection_states[p_idx]; + let y_len = partitions[p_idx].len(); + + // If the partition is fully contained in 'x', no split happens. + if int_states.len() == y_len { + int_states.clear(); + continue; + } + + // A split happens! 'int_states' becomes the new partition. + let new_idx = partitions.len(); + let mut new_part = IntSet::with_capacity(int_states.len()); + + for &state in int_states.iter() { + partitions[p_idx].remove(&state); // Remove from original (forming the difference) + new_part.insert(state); // Add to new partition (forming the intersection) + state_to_partition[state] = new_idx; // Update the lookup array + } + + let diff_len = partitions[p_idx].len(); + let int_len = new_part.len(); + + partitions.push(new_part); + in_worklist.push(false); + + // Worklist update + if in_worklist[p_idx] || int_len <= diff_len { + worklist.push(new_idx); + in_worklist[new_idx] = true; + } else { + worklist.push(p_idx); + in_worklist[p_idx] = true; + } + + int_states.clear(); + } + touched_partitions.clear(); + } + } + + if partitions.len() == all_states.len() { + self.minimal = true; + return Ok(()); + } + + self.rebuild_automaton_from_partition(&partitions)?; + + self.minimal = true; + Ok(()) + } + + fn rebuild_automaton_from_partition( + &mut self, + partitions: &[IntSet], + ) -> Result<(), EngineError> { + let mut state_to_rep = vec![0; self.transitions.len()]; + let mut representatives = Vec::with_capacity(partitions.len()); + + for partition in partitions { + let representative = if partition.contains(&self.start_state()) { + self.start_state() + } else { + *partition + .iter() + .next() + .expect("A partition cannot be empty") + }; + + representatives.push(representative); + + for &state in partition { + state_to_rep[state] = 
representative; + } + } + + let mut transitions_to_update = Vec::new(); + for &rep in &representatives { + for (condition, old_target) in self.transitions_from_vec(rep) { + let new_target = state_to_rep[old_target]; + transitions_to_update.push((rep, condition, new_target)); + } + } + + for partition in partitions { + for &state in partition { + let rep = state_to_rep[state]; + if state != rep { + self.remove_state(state); + } + } + } + + for (from, condition, to) in transitions_to_update { + self.add_transition(from, to, &condition); + } + + self.recompute_minimal_spanning_set() + } +} + +#[cfg(test)] +mod tests { + use crate::regex::RegularExpression; + + #[test] + fn test_minimize_various_regexes() -> Result<(), String> { + let test_cases = [ + "a", + "a|b", + "ab", + "a|a", + "a(b|c)d|a(b|c)d", + "(ab|ab|ab)", + "a*|b*", + "(a|b)*a(a|b)*", + "(abc|de)", + "a(b|c)*d", + "((a|b)c|(a|b)d)", + "a+b?", + "(a+b)*", + ]; + + for regex in test_cases { + assert_minimize(regex)?; + } + + Ok(()) + } + + fn assert_minimize(regex: &str) -> Result<(), String> { + println!("{regex}"); + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap().into_owned(); + let mut minimized_automaton = automaton.clone(); + minimized_automaton.minimize().unwrap(); + + assert!(automaton.equivalent(&minimized_automaton).unwrap()); + + assert!(minimized_automaton.is_deterministic()); + assert!(minimized_automaton.is_minimal()); + Ok(()) + } + + #[test] + fn test_minimize_union_complement_total() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|de)", false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton = automaton.determinize().unwrap(); + let mut complement = automaton.clone().into_owned(); + complement.complement().unwrap(); + + let union = automaton.union(&complement).unwrap(); + let mut union = union.determinize().unwrap().into_owned(); + + 
assert!(union.is_deterministic()); + assert!(!union.is_minimal()); + + union.minimize().unwrap(); + assert!(union.is_total()); + + assert!(union.is_deterministic()); + assert!(union.is_minimal()); + Ok(()) + } +} diff --git a/src/fast_automaton/operation/mod.rs b/src/fast_automaton/operation/mod.rs index 7c7c0f1..9b558c6 100644 --- a/src/fast_automaton/operation/mod.rs +++ b/src/fast_automaton/operation/mod.rs @@ -1,23 +1,26 @@ -use std::{cmp, hash::Hasher}; - -use ahash::AHasher; +use std::cmp; use super::*; -mod alternation; -mod concatenate; +mod concat; mod determinize; +mod difference; mod intersection; -mod subtraction; +mod minimize; +mod repeat; +mod union; impl FastAutomaton { - pub fn remove_dead_transitions(&mut self) { + /// Removes "dead" states (those that cannot reach any accept state), since + /// they never contribute to the language. If the language is empty the whole + /// automaton collapses to the canonical empty automaton. + pub fn remove_dead_states(&mut self) { if !self.is_empty() { - let reacheable_states = self.get_reacheable_states(); + let live_states = self.live_states(); let mut dead_states = IntSet::default(); - for from_state in self.transitions_iter() { - if !reacheable_states.contains(&from_state) { + for from_state in self.states() { + if !live_states.contains(&from_state) { dead_states.insert(from_state); } } @@ -34,17 +37,17 @@ mod tests { #[test] fn test_remove_dead_states() -> Result<(), String> { - let automaton1 = RegularExpression::new("(abc|ac|aaa)") + let automaton1 = RegularExpression::parse("(abc|ac|aaa)", false) .unwrap() .to_automaton() .unwrap(); - let automaton2 = RegularExpression::new("(abcd|ac|aba)") + let automaton2 = RegularExpression::parse("(abcd|ac|aba)", false) .unwrap() .to_automaton() .unwrap(); let intersection = automaton1.intersection(&automaton2).unwrap(); - assert_eq!(3, intersection.get_number_of_states()); - assert_eq!(3, intersection.get_reacheable_states().len()); + assert_eq!(3, 
intersection.number_of_states()); + assert_eq!(3, intersection.live_states().len()); Ok(()) } } diff --git a/src/fast_automaton/operation/repeat.rs b/src/fast_automaton/operation/repeat.rs new file mode 100644 index 0000000..7c2198d --- /dev/null +++ b/src/fast_automaton/operation/repeat.rs @@ -0,0 +1,498 @@ +use super::*; + +impl FastAutomaton { + /// Computes the repetition of the automaton between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + #[tracing::instrument(level = "debug", skip(self), fields(states = self.number_of_states(), deterministic = self.is_deterministic(), min = min, max_opt = tracing::field::debug(max_opt)))] + pub fn repeat(&self, min: u32, max_opt: Option) -> Result { + let mut automaton = self.clone(); + if let Err(error) = automaton.repeat_mut(min, max_opt) { + Err(error) + } else { + Ok(automaton) + } + } + + pub(crate) fn repeat_mut(&mut self, min: u32, max_opt: Option) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.repeat_state_count_heuristic(min, max_opt))?; + + if let Some(max) = max_opt + && min > max + { + self.make_empty(); + return Ok(()); + } + + // r⁰ = {""} for any language (max == 0 implies min == 0 here, since + // min > max already returned above). Without this, the general path + // below would leave the original language reachable and return + // L ∪ {""} instead of just {""}. + if max_opt == Some(0) { + self.make_empty_string(); + return Ok(()); + } + + // The empty-string language is a fixpoint of repetition: {""}{m,n} = {""} + // for any valid m ≤ n. Returning early also avoids the unbounded + // construction below, whose single-state "tight loop" branch would + // otherwise try to remove the start state and panic. + if self.is_empty_string() { + return Ok(()); + } + + // Empty language: ∅⁰ = {""}, ∅ⁿ = ∅ for n ≥ 1. The general algorithm + // below assumes a non-empty language; bail out before it can panic. 
+ // This must be the semantic `is_empty()` check, not just + // `accept_states.is_empty()`: an automaton whose accept states are + // all unreachable is the empty language too, and the construction + // below breaks on it (concatenation prunes the dead accepts, leaving + // stale state ids in the accept frontier). + if self.is_empty() { + if min == 0 { + // ∅⁰ is exactly {""}: replace the whole automaton instead + // of marking the start accepting: a dead automaton can still + // have reachable transitions (e.g. a self-loop on a + // non-accepting start), and an accepting start would wrongly + // revive them into (label)*. + self.make_empty_string(); + } + return Ok(()); + } + + let automaton_to_repeat = self.clone(); + + if min == 0 && self.in_degree(self.start_state) != 0 { + let new_state = self.new_state(); + if self.is_accepted(self.start_state) { + self.accept(new_state); + } + + self.add_epsilon_transition(new_state, self.start_state); + self.start_state = new_state; + + if max_opt.is_none() { + for accept_state in self.accept_states.clone() { + self.add_epsilon_transition(accept_state, self.start_state); + } + self.accept(self.start_state); + return Ok(()); + } + } + + if let Some(max) = max_opt + && min <= 1 + && max == 1 + { + if min == 0 { + // Through `accept()`, not a direct insert: the language + // changes (it gains ""), so the `minimal` flag must clear. + self.accept(self.start_state); + } + return Ok(()); + } + + let iter = if min == 0 { 0..0 } else { 0..min - 1 }; + for _ in iter { + self.concat_mut(&automaton_to_repeat)?; + } + + if max_opt.is_none() { + if min == 0 { + // r* with a start state that has no incoming edges (the + // in_degree > 0 case already returned above): loop the single + // copy in place by letting each accept state re-enter the + // start, and make the start accepting. 
+ let mut star = automaton_to_repeat.clone(); + + let accept_state = *star.accept_states.iter().next().unwrap(); + if star.accept_states.len() == 1 + && star.out_degree(accept_state) == 0 + && star.in_degree(star.start_state) == 0 + { + star.add_epsilon_transition(accept_state, star.start_state); + let old_start_state = star.start_state; + star.start_state = accept_state; + star.remove_state(old_start_state); + } else { + let t = Self::transitions_from_state_set(&star.transitions, star.start_state); + let transitions = + Self::transitions_from_state_enumerate(&t, &star.removed_states); + + for state in star.accept_states.clone() { + for &(to_state, condition) in &transitions { + star.add_transition(state, *to_state, condition); + } + } + + star.accept(star.start_state()); + } + + self.apply_model(&star); + } else { + // r{min,} = rᵐⁱⁿ · r*. Build the star part via recursion rather + // than looping `automaton_to_repeat` in place: when the start + // state has incoming edges, `repeat(0, None)` introduces a + // clean accepting start instead of marking the looping start + // accepting, which would otherwise accept partial copies + // (e.g. `(a*b)+` matching "aaba"). + let star = automaton_to_repeat.repeat(0, None)?; + self.concat_mut(&star)?; + } + + return Ok(()); + } + + // Finite maximum: append the optional copies one at a time, keeping + // `self` with a single accept frontier so the chain stays linear, and + // collect each copy boundary in `end_states` to mark accepting at the + // end (stopping after any copy in `min..=max` is valid). + // + // When the copy's start state has incoming edges, merging it into the + // previous copy's accept state would let that (re-marked accepting) + // junction inherit the copy's own transitions and accept partial + // copies (e.g. `(a*b){1,3}` matching "ba"). In that case we force a + // non-merging concatenation so each boundary is a clean accept state + // reached by an epsilon transition. 
+ let force_no_merge = automaton_to_repeat.in_degree(automaton_to_repeat.start_state) > 0; + let mut end_states = self.accept_states.iter().cloned().collect::>(); + for _ in cmp::max(min, 1)..max_opt.unwrap() { + self.concat_mut_with(&automaton_to_repeat, force_no_merge)?; + end_states.extend(self.accept_states.iter()); + } + for end_state in end_states { + self.accept(end_state); + } + if min == 0 { + self.accept(self.start_state); + } + Ok(()) + } + + /// Computes the expected number of states after calling `repeat_mut`, + /// reusing the concatenation heuristic to determine loop costs. + fn repeat_state_count_heuristic(&self, min: u32, max_opt: Option) -> usize { + // 1. Invalid range clears the automaton + if let Some(max) = max_opt + && min > max + { + return 0; + } + + // 1b. r⁰ = {""} (a single state); see `repeat_mut`. + if max_opt == Some(0) { + return 1; + } + + let v_original = self.number_of_states(); + if v_original == 0 { + return 0; + } + + let mut current_states = v_original; + let in_deg_start = self.in_degree(self.start_state) > 0; + + // --- REUSE CONCAT HEURISTIC HERE --- + // Calculate the state delta for a single concatenation. The concat + // heuristic short-circuits to a *smaller* value than `v_original` + // for degenerate languages (∅ → 1, {""} → the operand size), so the + // delta must saturate: `repeat_mut` early-returns for those inputs + // right after this estimate anyway. + let concat_cost = self + .concat_state_count_heuristic(self) + .saturating_sub(v_original); + + // 2. Early state allocation for 0-minimum repeats with incoming start edges + if min == 0 && in_deg_start { + current_states += 1; + if max_opt.is_none() { + return current_states; + } + } + + // 3. Simple cases: 0..=1 or 1..=1 repetitions + if let Some(max) = max_opt + && min <= 1 + && max == 1 + { + return current_states; + } + + // 4. 
Minimum repetitions loop + let min_iters = if min == 0 { 0 } else { min - 1 }; + current_states += min_iters as usize * concat_cost; + + // 5. Infinite repetition (max_opt is None) + if max_opt.is_none() { + if min == 0 { + // In-place looped r*: a single accept state with no outgoing + // edges and an incoming-edge-free start drops the old start + // state (`v_original - 1`); otherwise the state count is + // unchanged (the `min == 0 && in_deg_start` case already + // returned in step 2). + let mut v_modified = v_original; + if self.accept_states.len() == 1 { + let accept_state = *self.accept_states.iter().next().unwrap(); + if self.out_degree(accept_state) == 0 && !in_deg_start { + v_modified -= 1; + } + } + return v_modified; + } else { + // r{min,} = rᵐⁱⁿ · r*. `current_states` already accounts for the + // rᵐⁱⁿ part. The star r* = repeat(0, None) is independent of + // `min` and small, so build it to obtain its exact contribution + // under the merging concatenation onto rᵐⁱⁿ (whose accept states + // carry outgoing edges iff `acc_out_gt_0`). + let acc_out_gt_0 = self.accept_states.iter().any(|&s| self.out_degree(s) > 0); + match self.repeat(0, None) { + Ok(star) => { + let star_states = star.number_of_states(); + let not_mergeable = star.in_degree(star.start_state) > 0 && acc_out_gt_0; + let final_concat_cost = if not_mergeable { + star_states + } else { + star_states.saturating_sub(1) + }; + return current_states + final_concat_cost; + } + Err(_) => return current_states, + } + } + } + + // 6. Finite maximum repetition loop + // + // The mandatory copies (handled above) merge as plain `r`. Each + // optional tail copy merges as well (`v - 1` new states), except when + // the start state has an incoming edge: the non-merging concatenation + // then introduces a fresh start state, costing `v` per copy. 
+ let max = max_opt.unwrap(); + let loop_start = if min > 1 { min } else { 1 }; + let max_iters = max.saturating_sub(loop_start); + + let optional_states = v_original + if in_deg_start { 1 } else { 0 }; + current_states += max_iters as usize * (optional_states - 1); + + current_states + } +} + +#[cfg(test)] +mod tests { + // Regression: the r{0,1} fast path used to insert into `accept_states` + // directly, leaving a stale `minimal = true` on a mutated automaton; + // `minimize()` (which trusts the flag) then silently refused to + // minimize it. + // Regression (found by the repeat decomposition-oracle proptest): the + // empty-language guard checked `accept_states.is_empty()` only, so an + // automaton whose accepts are all unreachable (language ∅ too) fell + // through to the general construction, which panicked on the stale + // accept ids after concatenation pruned them. + #[test] + fn repeat_of_unreachable_accept_empty_language() { + let mut a = crate::fast_automaton::FastAutomaton::new_empty(); + let s1 = a.new_state(); + a.accept(s1); // unreachable accept: the language is ∅ + assert!(a.is_empty()); + + let star = a.repeat(0, None).unwrap(); // ∅* = {""} + assert!(star.is_match("")); + assert!(!star.is_match("a")); + + assert!(a.repeat(1, Some(2)).unwrap().is_empty()); // ∅{1,2} = ∅ + assert!(a.repeat(2, None).unwrap().is_empty()); // ∅{2,} = ∅ + + // A dead automaton with REACHABLE transitions: ∅* must still be + // exactly {""}; marking the start accepting used to revive the + // dead self-loop into b*. 
+ let range_b = crate::CharRange::new_from_range( + regex_charclass::char::Char::new('b')..=regex_charclass::char::Char::new('b'), + ); + let mut dead_loop = crate::fast_automaton::FastAutomaton::new_empty(); + dead_loop.add_transition_from_range(0, 0, &range_b).unwrap(); + assert!(dead_loop.is_empty()); + + let star = dead_loop.repeat(0, None).unwrap(); + assert!(star.is_match("")); + assert!(!star.is_match("b"), "∅* must not contain \"b\""); + assert!(dead_loop.repeat(1, None).unwrap().is_empty()); + } + + // state-count heuristic underflowed on empty-language automata with + // more than one state, because the concat heuristic short-circuits ∅ + // to 1, panicking in the public `repeat` before the empty-language + // early-return could run. + #[test] + fn repeat_of_multi_state_empty_language_does_not_underflow() { + let mut a = crate::fast_automaton::FastAutomaton::new_empty(); + a.new_state(); // ≥ 2 states, no accept states: the empty language + + let star = a.repeat(0, None).unwrap(); // ∅* = {""} + assert!(star.is_match("")); + assert!(!star.is_match("a")); + + let plus = a.repeat(1, None).unwrap(); // ∅⁺ = ∅ + assert!(plus.is_empty()); + + let bounded = a.repeat(2, Some(3)).unwrap(); // ∅{2,3} = ∅ + assert!(bounded.is_empty()); + } + + #[test] + fn repeat_zero_or_one_clears_the_minimal_flag() { + let mut a = crate::regex::RegularExpression::new("ab") + .unwrap() + .to_automaton() + .unwrap(); + a.minimize().unwrap(); + assert!(a.is_minimal()); + assert!(!a.is_match("")); + + a.repeat_mut(0, Some(1)).unwrap(); + assert!(a.is_match("")); + assert!(a.is_match("ab")); + assert!(!a.is_minimal(), "the language changed: the flag must clear"); + } + + use crate::fast_automaton::FastAutomaton; + use crate::regex::RegularExpression; + + // Regression: `repeat(0, Some(0))` on a non-empty language used to return + // L ∪ {""} instead of just {""}; the general path left the original + // language reachable and only made the start accepting. r⁰ must be {""}. 
+ #[test] + fn bug_repeat_zero_zero_on_non_empty() { + let a = RegularExpression::parse("abc", false) + .unwrap() + .to_automaton() + .unwrap(); + let r = a.repeat(0, Some(0)).unwrap(); + assert!(r.is_match(""), "L^0 must contain \"\""); + assert!( + !r.is_match("abc"), + "L^0 must NOT contain L (got 'abc' match)" + ); + } + + // Regression: repeating the empty-string automaton ({""}) used to reach the + // unbounded "tight loop" branch, which removed the single state while it was + // still the start state and panicked. {""} is a fixpoint of repetition, so + // every bound must return {""} without panicking. + #[test] + fn repeat_of_empty_string_is_fixpoint() { + let empty_string = FastAutomaton::new_empty_string(); + for (min, max) in [ + (0, None), + (1, None), + (3, None), + (0, Some(1)), + (2, Some(5)), + (0, Some(0)), + ] { + let r = empty_string.repeat(min, max).unwrap(); + assert!(r.is_match(""), "{{\"\"}}{{{min},{max:?}}} must match \"\""); + assert!( + !r.is_match("a"), + "{{\"\"}}{{{min},{max:?}}} must match only \"\"" + ); + } + } + + // Regression: empty.repeat(_, None) used to panic on + // `accept_states.iter().next().unwrap()` because the unbounded-repeat + // branch assumed at least one accept state. Language theory: ∅* = {""} + // and ∅⁺ = ∅; both must be returnable without panic. + #[test] + fn empty_repeat_unbounded_does_not_panic() { + let empty = FastAutomaton::new_empty(); + // Expected: ∅* = {""}. + let r = empty.repeat(0, None).expect("should not error"); + assert!(r.is_match("")); + assert!(!r.is_match("a")); + + // Expected: ∅⁺ = ∅. 
+ let r = empty.repeat(1, None).expect("should not error"); + assert!(!r.is_match("")); + assert!(!r.is_match("a")); + } + + #[test] + fn test_repeat_1() -> Result<(), String> { + let automaton = RegularExpression::parse("(a*,a*)?", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("")); + assert!(automaton.is_match(",")); + assert!(automaton.is_match("aaa,")); + assert!(automaton.is_match("aaaa,aa")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aa")); + Ok(()) + } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic("b*a"); + assert_heuristic("a*b"); + assert_heuristic("ba*"); + assert_heuristic(".{900}"); + assert_heuristic("[a-z]+"); + assert_heuristic("[a-z]+@"); + + assert_heuristic("[0-9]+[A-Z]*"); + assert_heuristic("a+(ba+)*"); + assert_heuristic("((a|bc)*|d)"); + assert_heuristic(".*"); + assert_heuristic("(ac|ads|a)*"); + assert_heuristic("((aad|ads|a)*|q)"); + + assert_heuristic( + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic("(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@"); + assert_heuristic( + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)"); + Ok(()) + } + + fn assert_heuristic(regex: &str) { + println!("Testing regex: {regex}"); + + let automaton = RegularExpression::parse(regex, false) + .unwrap() + .to_automaton() + .unwrap(); + + // A matrix of test cases covering all edge cases in the repeat logic + let test_cases = vec![ + (0, Some(0)), // Zero-repeat + (0, Some(1)), // Optional once + (1, Some(1)), // Exactly once + (5, Some(10)), // Standard finite range + (0, None), // Zero or more (Kleene star) + (1, None), // One or more (Kleene plus) + (3, None), // Finite minimum, 
infinite maximum + ]; + + for (min, max_opt) in test_cases { + // Clone the original automaton to avoid mutating it across iterations + let mut actual_automaton = automaton.clone(); + + // Execute the actual mutation (assuming repeat_mut is the core method) + actual_automaton.repeat_mut(min, max_opt).unwrap(); + + let actual_states = actual_automaton.number_of_states(); + let heuristic_states = automaton.repeat_state_count_heuristic(min, max_opt); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for regex '{}' with min={}, max={:?}.\nExpected (heuristic): {}\nActual (computed): {}", + regex, min, max_opt, heuristic_states, actual_states + ); + } + } +} diff --git a/src/fast_automaton/operation/subtraction.rs b/src/fast_automaton/operation/subtraction.rs deleted file mode 100644 index d513fbb..0000000 --- a/src/fast_automaton/operation/subtraction.rs +++ /dev/null @@ -1,71 +0,0 @@ -use std::hash::BuildHasherDefault; - -use crate::EngineError; - -use super::*; - -impl FastAutomaton { - fn totalize(&mut self) -> Result<(), EngineError> { - if !self.is_determinitic() { - return Err(EngineError::AutomatonShouldBeDeterministic); - } - let crash_state = self.new_state(); - let mut transitions_to_crash_state: IntMap = - IntMap::with_capacity_and_hasher( - self.get_number_of_states(), - BuildHasherDefault::default(), - ); - - let mut ranges = Vec::with_capacity(self.get_number_of_states()); - for from_state in self.transitions_iter() { - let mut new_condition = Condition::empty(&self.spanning_set); - for (_, condition) in self.transitions_from_state_enumerate_iter(&from_state) { - new_condition = new_condition.union(condition); - ranges.push(condition.to_range(self.get_spanning_set())?); - } - - new_condition = new_condition.complement(); - - transitions_to_crash_state.insert(from_state, new_condition); - } - - for (from_state, condition) in &transitions_to_crash_state { - self.add_transition_to(*from_state, crash_state, condition); - 
ranges.push(condition.to_range(self.get_spanning_set())?); - } - - let new_spanning_set = SpanningSet::compute_spanning_set(&ranges); - self.apply_new_spanning_set(&new_spanning_set)?; - - if self.in_degree(crash_state) == 1 { - self.remove_state(crash_state); - } - Ok(()) - } - - pub fn complement(&mut self) -> Result<(), EngineError> { - self.totalize()?; - - let mut new_accept_states = IntSet::default(); - for state in self.transitions_iter() { - if self.accept_states.contains(&state) { - continue; - } - new_accept_states.insert(state); - } - - self.accept_states = new_accept_states; - Ok(()) - } - - pub fn subtraction(&self, other: &FastAutomaton) -> Result { - let mut complement = other.clone(); - match complement.complement() { - Ok(()) => self.intersection(&complement), - Err(err) => Err(err), - } - } -} - -#[cfg(test)] -mod tests {} diff --git a/src/fast_automaton/operation/union.rs b/src/fast_automaton/operation/union.rs new file mode 100644 index 0000000..9eb515b --- /dev/null +++ b/src/fast_automaton/operation/union.rs @@ -0,0 +1,672 @@ +use std::hash::BuildHasherDefault; + +use condition::converter::ConditionConverter; +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +use crate::{error::EngineError, execution_profile::ExecutionProfile}; + +use super::*; + +impl FastAutomaton { + /// Computes the union between `self` and `other`. + pub fn union(&self, other: &FastAutomaton) -> Result { + Self::union_all([self, other]) + } + + /// Computes the union of all automata in the given iterator. + #[tracing::instrument(level = "debug", skip_all)] + pub fn union_all<'a, I: IntoIterator>( + automata: I, + ) -> Result { + let mut new_automaton = FastAutomaton::new_empty(); + for automaton in automata { + new_automaton.union_mut(automaton)?; + } + Ok(new_automaton) + } + + /// Computes in parallel the union of all automata in the given iterator. + /// + /// Only available with the `parallel` feature (enabled by default). 
+ #[cfg(feature = "parallel")] + #[tracing::instrument(level = "debug", skip_all)] + pub fn union_all_par<'a, I: IntoParallelIterator>( + automata: I, + ) -> Result { + let execution_profile = ExecutionProfile::get(); + + let empty = FastAutomaton::new_empty(); + + automata + .into_par_iter() + .try_fold( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(next)?; + Ok(acc) + }) + }, + ) + .try_reduce( + || empty.clone(), + |mut acc, next| { + execution_profile.apply(|| { + acc.union_mut(&next)?; + Ok(acc) + }) + }, + ) + } + + fn prepare_start_states( + &mut self, + other: &FastAutomaton, + new_states: &mut IntMap, + condition_converter: &ConditionConverter, + ) -> Result, EngineError> { + let mut imcomplete_states = IntSet::with_capacity(other.out_degree(other.start_state) + 1); + // If `other` accepts the empty string we must make the union's *entry* + // state accepting, but only after the start state is finalized below. + // Marking the current start eagerly is wrong when it has incoming edges + // (e.g. a self-loop) and is about to be demoted behind a fresh start: + // the demoted state would then wrongly accept the strings on its loop. 
+ let self_start_state_in_degree = self.in_degree(self.start_state); + let other_start_state_in_degree = other.in_degree(other.start_state); + if self_start_state_in_degree == 0 && other_start_state_in_degree == 0 { + // The start states can be the same state without any consequence + new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); + } else { + if self_start_state_in_degree != 0 { + let new_state = self.new_state(); + + self.add_epsilon_transition(new_state, self.start_state); + self.start_state = new_state; + new_states.insert(other.start_state, self.start_state); + imcomplete_states.insert(self.start_state); + } + if other_start_state_in_degree != 0 { + let new_state = self.new_state(); + if other.is_accepted(other.start_state) { + self.accept(new_state); + } + + new_states.insert(other.start_state, new_state); + imcomplete_states.insert(new_state); + + for (cond, other_to_state) in other.transitions_from_vec(other.start_state) { + let cond = condition_converter.convert(&cond)?; + let to_state = match new_states.entry(other_to_state) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let new_state = self.new_state(); + imcomplete_states.insert(new_state); + v.insert(new_state); + new_state + } + }; + self.add_transition(self.start_state, to_state, &cond); + } + } + } + // Now that `self.start_state` is the final entry state, record `other`'s + // empty-string acceptance there. `self`'s own empty-string acceptance is + // preserved by the start handling above (a freshly created start + // inherits it through the epsilon transition). 
+ if other.is_accepted(other.start_state) { + self.accept(self.start_state); + } + Ok(imcomplete_states) + } + + fn prepare_accept_states( + &mut self, + other: &FastAutomaton, + new_states: &mut IntMap, + imcomplete_states: &IntSet, + ) { + let mut self_accept_states_without_outgoing_edges = vec![]; + for &state in &self.accept_states { + // The start state must never be a merge candidate: the n > 1 + // branch below removes the merged states, and removing the start + // state panics (e.g. an accepting start with no outgoing edges, + // unioned with an operand whose start has incoming edges). + if self.out_degree(state) == 0 + && !imcomplete_states.contains(&state) + && state != self.start_state + { + self_accept_states_without_outgoing_edges.push(state); + } + } + let accept_state_without_outgoing_edges = + match self_accept_states_without_outgoing_edges.len() { + 1 => Some(self_accept_states_without_outgoing_edges[0]), + n if n > 1 => { + let new_state = self.new_state(); + self.accept(new_state); + + for &accept_state in &self_accept_states_without_outgoing_edges { + for (from_state, condition) in self.transitions_to_vec(accept_state) { + self.add_transition(from_state, new_state, &condition); + } + self.remove_state(accept_state); + } + Some(new_state) + } + _ => None, + }; + + for &state in &other.accept_states { + // Resolve the self-state that represents `state`, allocating one if + // it is not mapped yet, then mark it accepting. The accept flag must + // be applied even when `state` was already mapped during + // `prepare_start_states` (e.g. a start state with incoming edges + // whose outgoing edges reach this accept state); otherwise the + // union would silently drop `other`'s acceptance. 
+ let mapped = match accept_state_without_outgoing_edges { + Some(accept_state) if other.out_degree(state) == 0 => { + *new_states.entry(state).or_insert(accept_state) + } + _ => match new_states.get(&state) { + Some(&mapped) => mapped, + None => { + let new_accept_state = self.new_state(); + new_states.insert(state, new_accept_state); + new_accept_state + } + }, + }; + self.accept(mapped); + } + } + + /* Important things to remember before modifying this method: + * - the start states can't be merged if they have incoming edges + * - the accept states can't be merged if they have outgoing edges + */ + pub(crate) fn union_mut(&mut self, other: &FastAutomaton) -> Result<(), EngineError> { + ExecutionProfile::get() + .assert_max_number_of_states(self.union_state_count_heuristic(other))?; + + if other.is_empty() || self.is_total() { + return Ok(()); + } else if other.is_total() { + self.make_total(); + return Ok(()); + } else if self.is_empty() { + self.apply_model(other); + return Ok(()); + } + + let new_spanning_set = &self.spanning_set.merge(&other.spanning_set); + self.apply_new_spanning_set(new_spanning_set)?; + let condition_converter = ConditionConverter::new(&other.spanning_set, new_spanning_set)?; + + let mut new_states: IntMap = IntMap::with_capacity_and_hasher( + other.number_of_states(), + BuildHasherDefault::default(), + ); + + let imcomplete_states = + self.prepare_start_states(other, &mut new_states, &condition_converter)?; + self.prepare_accept_states(other, &mut new_states, &imcomplete_states); + + for from_state in other.states() { + let new_from_state = match new_states.entry(from_state) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let new_state = self.new_state(); + v.insert(new_state); + new_state + } + }; + for (condition, to_state) in other.transitions_from(from_state) { + let new_condition = condition_converter.convert(condition)?; + let new_to_state = match new_states.entry(*to_state) { + Entry::Occupied(o) => *o.get(), + 
Entry::Vacant(v) => { + let new_state = self.new_state(); + v.insert(new_state); + new_state + } + }; + self.add_transition(new_from_state, new_to_state, &new_condition); + } + } + self.minimal = false; + Ok(()) + } + + /// Computes the expected number of states after calling `union_mut`. + fn union_state_count_heuristic(&self, other: &FastAutomaton) -> usize { + // Edge cases + if other.is_empty() || self.is_total() { + return self.number_of_states(); + } else if other.is_total() || self.is_empty() { + return other.number_of_states(); + } + + let v1 = self.number_of_states(); + let v2 = other.number_of_states(); + + let self_in = self.in_degree(self.start_state); + let other_in = other.in_degree(other.start_state); + + let mut total_delta: i32 = 0; + + // --- 1. Start States Math --- + if self_in == 0 && other_in == 0 { + total_delta -= 1; + } else if self_in != 0 && other_in != 0 { + total_delta += 1; + } + + // Track which 'other' states are already mapped in the start phase + // so we don't double-count them when calculating accept state savings. + let mut mapped_other_states = std::collections::HashSet::new(); + mapped_other_states.insert(other.start_state); + + if other_in != 0 { + for (_, to_state) in other.transitions_from(other.start_state) { + mapped_other_states.insert(*to_state); + } + } + + // --- 2. Accept States Math --- + // Gather self's accept states. If other.start_state is accepted, + // it virtually triggers self.accept(self.start_state) early. + let mut self_accepts: std::collections::HashSet = + self.accept_states.iter().cloned().collect(); + + if other.is_accepted(other.start_state) { + self_accepts.insert(self.start_state); + } + + let mut n = 0; + + for &state in &self_accepts { + // Mirror `prepare_accept_states`: a state that is (still) the + // start after the start-state phase is never a merge candidate. + // When `self_in != 0` the original start gets demoted behind a + // fresh start, so it *does* participate. 
+ let is_excluded = self_in == 0 && state == self.start_state; + if self.out_degree(state) == 0 && !is_excluded { + n += 1; + } + } + + let has_acc_target = n >= 1; + + // If n > 1, we replace `n` states with exactly 1 unified state. + if n > 1 { + total_delta += 1 - n; + } + + // Calculate mappings for other's accept states + if has_acc_target { + for &state in &other.accept_states { + if other.out_degree(state) == 0 && !mapped_other_states.contains(&state) { + total_delta -= 1; + } + } + } + + (v1 as i32 + v2 as i32 + total_delta) as usize + } +} + +#[cfg(test)] +mod tests { + use crate::{Term, fast_automaton::FastAutomaton, regex::RegularExpression}; + + // Regression: unioning with the empty-string language used to drop the + // other operand's acceptance. When `other`'s start state has incoming edges + // its outgoing edges (and the accept states they reach) are mapped during + // `prepare_start_states`; `prepare_accept_states` then failed to mark those + // already-mapped images accepting, so `union({""}, "a+")` matched only "" + // instead of "" and "a", "aa", ... + #[test] + fn union_with_empty_string_keeps_other_accepts() { + let empty_string = RegularExpression::parse("", false) + .unwrap() + .to_automaton() + .unwrap(); + let a_plus = RegularExpression::parse("a+", false) + .unwrap() + .to_automaton() + .unwrap(); + + let u = empty_string.union(&a_plus).unwrap(); + assert!(u.is_match(""), "union must keep \"\""); + assert!( + u.is_match("a"), + "union dropped the other operand's language" + ); + assert!(u.is_match("aaa")); + + // It must be equivalent regardless of operand order. + let u2 = a_plus.union(&empty_string).unwrap(); + assert!( + Term::from_automaton(u) + .equivalent(&Term::from_automaton(u2)) + .unwrap() + ); + } + + // Regression: `prepare_accept_states` merges accept states without + // outgoing edges and removes the originals. 
When `self`'s accepting start + // (no outgoing edges) met an operand whose start has incoming edges, the + // start landed in the merge list and `remove_state(start)` panicked. + #[test] + fn union_does_not_remove_accepting_start() { + use crate::CharRange; + use crate::fast_automaton::condition::Condition; + use crate::fast_automaton::spanning_set::SpanningSet; + use regex_charclass::char::Char; + + let rng = |c: char| { + let c = Char::new(c); + CharRange::new_from_range(c..=c) + }; + let ss = SpanningSet::compute_spanning_set(&[rng('a'), rng('b')]); + + // a: two accepting states without outgoing edges, one being the start. + let mut a = FastAutomaton::new_empty(); + a.apply_new_spanning_set(&ss).unwrap(); + a.new_state(); + a.accept(0); + a.accept(1); + + // b: start has an incoming edge (1 -a-> 0) but no outgoing edges. + let mut b = FastAutomaton::new_empty(); + b.apply_new_spanning_set(&ss).unwrap(); + b.new_state(); + b.add_transition(1, 0, &Condition::from_range(&rng('a'), &ss).unwrap()); + b.accept(0); + + let u = a.union(&b).unwrap(); // used to panic + assert!(u.is_match(""), "union must keep the empty string"); + } + + // Regression: unioning a language whose start state has a self-loop with the + // empty string used to mark that looping start accepting, so `a*b | ""` + // wrongly matched "a", "aa", ... The empty-string acceptance must land on + // the union's entry state, not on a demoted looping state. + #[test] + fn union_with_empty_string_does_not_over_accept() { + let a_star_b = RegularExpression::parse("a*b", false) + .unwrap() + .to_automaton() + .unwrap(); + let empty_string = RegularExpression::parse("", false) + .unwrap() + .to_automaton() + .unwrap(); + + let u = a_star_b.union(&empty_string).unwrap(); + assert!(u.is_match(""), "(a*b)? 
must match \"\""); + assert!(u.is_match("b")); + assert!(u.is_match("ab")); + assert!(u.is_match("aab")); + assert!( + !u.is_match("a"), + "union wrongly accepted 'a' (looping start marked accepting)" + ); + assert!(!u.is_match("aa")); + } + + #[test] + fn test_simple_alternation_regex_1() -> Result<(), String> { + let automaton = RegularExpression::parse("(abc|ac|aaa)", false) + .unwrap() + .to_automaton() + .unwrap(); + assert!(automaton.is_match("abc")); + assert!(automaton.is_match("ac")); + assert!(automaton.is_match("aaa")); + assert!(!automaton.is_match("abcd")); + assert!(!automaton.is_match("ab")); + assert!(!automaton.is_match("acc")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("aaaa")); + assert!(!automaton.is_match("aa")); + assert!(!automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_2() -> Result<(), String> { + let automaton = RegularExpression::parse("(b?|b{2})", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("b")); + assert!(automaton.is_match("bb")); + assert!(!automaton.is_match("bbb")); + assert!(!automaton.is_match("bbbb")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3() -> Result<(), String> { + let automaton = RegularExpression::parse("((a|bc)*|d)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3b() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); 
+ assert!(automaton.is_match("d")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_3t() -> Result<(), String> { + let automaton = RegularExpression::parse("(d*|(a|bc)*)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("abcaaabcbc")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ddd")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("abcd")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_4() -> Result<(), String> { + let automaton = RegularExpression::parse("(a+(ba+)*|ca*c)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("cc")); + assert!(automaton.is_match("caaac")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aababa")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_5() -> Result<(), String> { + let automaton = RegularExpression::parse("((aad|ads|a)*|q)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("q")); + assert!(automaton.is_match("aad")); + assert!(automaton.is_match("ads")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("aadadsaaa")); + assert!(!automaton.is_match("aaaas")); + assert!(!automaton.is_match("ad")); + assert!(!automaton.is_match("adsq")); + assert!(!automaton.is_match("qq")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_6() -> Result<(), String> { + let automaton = RegularExpression::parse("(ab|)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + assert!(!automaton.is_match("a")); + assert!(!automaton.is_match("b")); + assert!(!automaton.is_match("aab")); + Ok(()) + } + + #[test] + fn 
test_simple_alternation_regex_7() -> Result<(), String> { + let automaton = RegularExpression::parse("(d|a?|ab)", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("d")); + assert!(automaton.is_match("ab")); + assert!(automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_simple_alternation_regex_8() -> Result<(), String> { + let automaton = RegularExpression::parse("((d|a?|ab)u)*", false) + .unwrap() + .to_automaton() + .unwrap(); + automaton.print_dot(); + assert!(automaton.is_match("au")); + assert!(automaton.is_match("du")); + assert!(automaton.is_match("abu")); + assert!(automaton.is_match("u")); + assert!(automaton.is_match("")); + Ok(()) + } + + #[test] + fn test_heuristic() -> Result<(), String> { + assert_heuristic(".{900}", "[a-z]+"); + + assert_heuristic("[a-z]+@", "[0-9]+[A-Z]*"); + + assert_heuristic("a+(ba+)*", "((a|bc)*|d)"); + + assert_heuristic(".*", "(ac|ads|a)*"); + + assert_heuristic( + "((aad|ads|a)*|q)", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + assert_heuristic( + "(?:A+(?:\\.[AB]+)*|\"(?:C|\\\\D)*\")@", + "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@", + ); + + assert_heuristic("((aad|ads|a)*abc.*uif(aad|ads|x)*|q)", ".*"); + + assert_heuristic( + ".{900}", + r"john[!#-'\*\+\-/-9=\?\^-\u{007e}]*(\.[!#-'\*\+\-/-9=\?\^-\u{007e}](\.?[!#-'\*\+\-/-9=\?\^-\u{007e}])*)?\.?doe@example\.com", + ); + + Ok(()) + } + + fn assert_heuristic(regex1: &str, regex2: &str) { + println!("Testing union heuristic for: '{}' | '{}'", regex1, regex2); + + let automaton1 = RegularExpression::parse(regex1, false) + .unwrap() + .to_automaton() + .unwrap(); + + let automaton2 = RegularExpression::parse(regex2, false) + .unwrap() + .to_automaton() + 
.unwrap(); + + let test_pair = |a1: &FastAutomaton, a2: &FastAutomaton, desc: &str| { + let mut actual_union = a1.clone(); + actual_union.union_mut(a2).unwrap(); + + let actual_states = actual_union.number_of_states(); + let heuristic_states = a1.union_state_count_heuristic(a2); + + assert_eq!( + actual_states, heuristic_states, + "Mismatch for {}.\nExpected (heuristic): {}\nActual (computed): {}", + desc, heuristic_states, actual_states + ); + }; + + // Test standard union: A | B + test_pair( + &automaton1, + &automaton2, + &format!("'{}' | '{}'", regex1, regex2), + ); + + // Test reverse union: B | A + test_pair( + &automaton2, + &automaton1, + &format!("'{}' | '{}'", regex2, regex1), + ); + + // Test self-union: A | A + test_pair( + &automaton1, + &automaton1, + &format!("'{}' | '{}' (Self)", regex1, regex1), + ); + + // Test Empty states + let empty_automaton = FastAutomaton::new_empty(); + + test_pair( + &empty_automaton, + &automaton2, + &format!("Empty | '{}'", regex2), + ); + test_pair( + &automaton1, + &empty_automaton, + &format!("'{}' | Empty", regex1), + ); + } +} diff --git a/src/fast_automaton/serializer.rs b/src/fast_automaton/serializer.rs deleted file mode 100644 index 017341b..0000000 --- a/src/fast_automaton/serializer.rs +++ /dev/null @@ -1,225 +0,0 @@ -use super::*; -use lazy_static::lazy_static; -use rand::Rng; -use serde::{de, ser, Deserializer, Serializer}; -use serde::{Deserialize, Serialize}; -use std::env; -use z85::{decode, encode}; -use crate::tokenizer::Tokenizer; - -use sha2::{Digest, Sha256}; - -use aes_gcm_siv::{ - aead::{Aead, KeyInit}, - Aes256GcmSiv, Nonce, -}; -use flate2::read::ZlibDecoder; -use flate2::write::ZlibEncoder; -use flate2::Compression; -use std::io::prelude::*; - -use crate::tokenizer::token::{automaton_token::AutomatonToken, Token}; - -pub struct FastAutomatonReader { - cipher: Aes256GcmSiv, -} - -impl FastAutomatonReader { - pub fn new() -> Self { - let env_var = env::var("RS_FAIR_SECRET_KEY").unwrap_or("DEFAULT 
PASSKEY".to_string()); - let key = Sha256::digest(env_var.as_bytes()); - FastAutomatonReader { - cipher: Aes256GcmSiv::new(&key), - } - } - - pub fn random_nonce() -> [u8; 12] { - let mut nonce = [0u8; 12]; - rand::thread_rng().fill(&mut nonce); - nonce - } -} - -lazy_static! { - static ref SINGLETON_INSTANCE: FastAutomatonReader = FastAutomatonReader::new(); -} - -fn get_fast_automaton_reader() -> &'static FastAutomatonReader { - &SINGLETON_INSTANCE -} - -#[derive(Serialize, Deserialize, Debug)] -struct SerializedAutomaton(Vec, SpanningSet); - -impl serde::Serialize for FastAutomaton { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let tokenizer = Tokenizer::new(self); - match AutomatonToken::to_fair_tokens(&tokenizer.to_embedding()) { - Ok(tokens) => { - let serialized_automaton = - SerializedAutomaton(tokens, self.get_spanning_set().clone()); - - let mut serialized = Vec::with_capacity(self.get_number_of_states() * 8); - if let Err(err) = ciborium::into_writer(&serialized_automaton, &mut serialized) { - return Err(ser::Error::custom(err.to_string())); - } - - serialized = compress_data(&serialized); - - let nonce = FastAutomatonReader::random_nonce(); - - match get_fast_automaton_reader() - .cipher - .encrypt(Nonce::from_slice(&nonce), serialized.as_ref()) - { - Ok(ciphertext) => { - let mut encrypted = Vec::from_iter(nonce); - encrypted.extend(ciphertext); - - serializer.serialize_str(&encode(&encrypted)) - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } - } - Err(err) => Err(ser::Error::custom(err.to_string())), - } - } -} - -impl<'de> serde::Deserialize<'de> for FastAutomaton { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - match String::deserialize(deserializer) { - Ok(decoded) => match decode(decoded) { - Ok(encrypted) => { - let nonce = &encrypted[0..12]; - let payload = encrypted[12..].to_vec(); - let cipher_result = get_fast_automaton_reader() - .cipher - 
.decrypt(Nonce::from_slice(nonce), payload.as_ref()); - - match cipher_result { - Ok(cipher_result) => { - let decrypted = decompress_data(&cipher_result); - - let automaton: Result< - SerializedAutomaton, - ciborium::de::Error, - > = ciborium::from_reader(&decrypted[..]); - match automaton { - Ok(automaton) => { - let mut temp_automaton = FastAutomaton::new_empty(); - temp_automaton.spanning_set = automaton.1; - let tokenizer = Tokenizer::new(&temp_automaton); - - match tokenizer.from_embedding( - &automaton - .0 - .into_iter() - .map(AutomatonToken::from_fair_token) - .collect::>(), - ) { - Ok(res) => Ok(res), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - } - } - Err(err) => Err(de::Error::custom(err.to_string())), - }, - Err(err) => Err(err), - } - } -} - -fn compress_data(data: &[u8]) -> Vec { - let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default()); - encoder.write_all(data).expect("Failed to write data"); - encoder.finish().expect("Failed to finish compression") -} - -fn decompress_data(data: &[u8]) -> Vec { - let mut decoder = ZlibDecoder::new(data); - let mut decompressed_data = Vec::new(); - decoder - .read_to_end(&mut decompressed_data) - .expect("Failed to read data"); - decompressed_data -} - -#[cfg(test)] -mod tests { - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_serialization() -> Result<(), String> { - assert_serialization("..."); - assert_serialization(".*abc"); - assert_serialization(".*"); - assert_serialization(".*abcdef.*dsqd"); - assert_serialization( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,2}", - ); - 
assert_serialization("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); - - Ok(()) - } - - fn assert_serialization(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{regex}"); - - let automaton = regex.to_automaton().unwrap(); - - let serialized = serde_json::to_string(&automaton).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = automaton.determinize().unwrap(); - - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - } - - #[test] - fn test_serialization_case_1() -> Result<(), String> { - let automaton1 = RegularExpression::new(".*") - .unwrap() - .to_automaton() - .unwrap(); - let automaton2 = RegularExpression::new("\\d+") - .unwrap() - .to_automaton() - .unwrap() - .determinize() - .unwrap(); - - let subtraction = automaton1.subtraction(&automaton2).unwrap(); - - let serialized = serde_json::to_string(&subtraction).unwrap(); - println!("{serialized}"); - - let unserialized: FastAutomaton = serde_json::from_str(&serialized).unwrap(); - - let unserialized = unserialized.determinize().unwrap(); - let automaton = subtraction.determinize().unwrap(); - - assert!(automaton.subtraction(&unserialized).unwrap().is_empty()); - assert!(unserialized.subtraction(&automaton).unwrap().is_empty()); - - Ok(()) - } -} diff --git a/src/fast_automaton/spanning_set/mod.rs 
b/src/fast_automaton/spanning_set/mod.rs index 2aa2780..dda6178 100644 --- a/src/fast_automaton/spanning_set/mod.rs +++ b/src/fast_automaton/spanning_set/mod.rs @@ -1,28 +1,42 @@ use std::slice::Iter; use ahash::AHashSet; -use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; -/// Contains a set of [`RangeSet`] that span all the transition of a [`crate::FastAutomaton`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +use crate::CharRange; + +/// A set of [`CharRange`] that spans all transitions of a [`crate::FastAutomaton`]. #[derive(Clone, Debug, PartialEq, Eq)] -pub struct SpanningSet(Vec>, RangeSet); +pub struct SpanningSet(Vec, CharRange); impl SpanningSet { + /// Creates a spanning set from explicit disjoint `ranges` plus the `rest` + /// range covering every character they don't. The caller is responsible for + /// these invariants; prefer [`compute_spanning_set`](Self::compute_spanning_set), + /// which derives a minimal, well-formed set from arbitrary ranges. + pub fn new(ranges: Vec, rest: CharRange) -> Self { + SpanningSet(ranges, rest) + } + + /// Creates the spanning set of an automaton with no transitions: no + /// explicit ranges, with the rest covering all characters. pub fn new_empty() -> Self { - SpanningSet(vec![], RangeSet::total()) + SpanningSet(vec![], CharRange::total()) } + /// Creates the spanning set with a single range covering all characters and + /// an empty rest. pub fn new_total() -> Self { - SpanningSet(vec![RangeSet::total()], RangeSet::empty()) + SpanningSet(vec![CharRange::total()], CharRange::empty()) } + /// Returns `true` if this is the empty spanning set (no explicit ranges; + /// see [`new_empty`](Self::new_empty)). pub fn is_empty(&self) -> bool { self.0.is_empty() && self.1.is_total() } + /// Returns `true` if this is the total spanning set (one all-covering + /// range; see [`new_total`](Self::new_total)). 
pub fn is_total(&self) -> bool { self.0.len() == 1 && self.0[0].is_total() && self.1.is_empty() } @@ -35,7 +49,7 @@ impl SpanningSet { } } - pub(crate) fn get_spanning_ranges_with_rest(&self) -> Vec> { + pub(crate) fn spanning_ranges_with_rest(&self) -> Vec { if self.1.is_empty() { self.0.clone() } else { @@ -45,22 +59,27 @@ impl SpanningSet { } } - pub fn get_spanning_ranges(&self) -> Iter> { + /// Returns an iterator over the explicit (non-rest) ranges in the spanning set. + pub fn spanning_ranges(&self) -> Iter<'_, CharRange> { self.0.iter() } - pub fn get_number_of_spanning_ranges(&self) -> usize { + /// Returns the number of explicit (non-rest) ranges in the spanning set. + pub fn number_of_spanning_ranges(&self) -> usize { self.0.len() } - pub fn get_spanning_range(&self, i: usize) -> Option<&RangeSet> { + /// Returns the explicit range at index `i`, or `None` if out of bounds. + pub fn spanning_range(&self, i: usize) -> Option<&CharRange> { self.0.get(i) } - pub fn get_rest(&self) -> &RangeSet { + /// Returns the "rest" range covering all characters not in any explicit range. + pub fn rest(&self) -> &CharRange { &self.1 } + /// Compute a new minimal spanning set by merging the provided spanning set. pub fn merge(&self, other: &Self) -> Self { let mut ranges = Vec::with_capacity(self.0.len() + other.0.len()); ranges.extend_from_slice(&self.0); @@ -69,8 +88,9 @@ impl SpanningSet { Self::compute_spanning_set(&ranges) } - pub fn compute_spanning_set(ranges: &[RangeSet]) -> Self { - let mut spanning_ranges: Vec> = ranges.to_vec(); + /// Compute a new minimal spanning set for the provided ranges. 
+ pub fn compute_spanning_set(ranges: &[CharRange]) -> Self { + let mut spanning_ranges: Vec = ranges.to_vec(); spanning_ranges.sort_unstable(); spanning_ranges.dedup(); @@ -87,13 +107,13 @@ impl SpanningSet { let other_set = spanning_ranges.swap_remove(index); let intersection_set = set.intersection(&other_set); new_spanning_ranges.insert(intersection_set); - let subtraction_set = set.difference(&other_set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = set.difference(&other_set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } - let subtraction_set = other_set.difference(&set); - if !subtraction_set.is_empty() { - new_spanning_ranges.insert(subtraction_set); + let difference_set = other_set.difference(&set); + if !difference_set.is_empty() { + new_spanning_ranges.insert(difference_set); } changed = true; } else if !set.is_empty() { @@ -105,7 +125,7 @@ impl SpanningSet { spanning_ranges.sort_unstable(); - let mut total = RangeSet::empty(); + let mut total = CharRange::empty(); for base in &spanning_ranges { total = total.union(base); } diff --git a/src/lib.rs b/src/lib.rs index 91493c7..b75408e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,355 +1,911 @@ use std::{ - borrow::Cow, - collections::{HashMap, HashSet}, + borrow::{Borrow, Cow}, + collections::{HashMap, HashSet, VecDeque}, + fmt::Display, hash::BuildHasherDefault, + ops::{Bound, RangeBounds}, + str::FromStr, }; use cardinality::Cardinality; use error::EngineError; -use execution_profile::ThreadLocalParams; use fast_automaton::FastAutomaton; use nohash_hasher::NoHashHasher; +#[cfg(feature = "parallel")] +use rayon::prelude::*; use regex::RegularExpression; use regex_charclass::{char::Char, irange::RangeSet}; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; + +use crate::execution_profile::ExecutionProfile; pub mod cardinality; pub mod error; pub mod execution_profile; pub mod fast_automaton; pub 
mod regex; -pub mod tokenizer; -type IntMap = HashMap>>; -type IntSet = HashSet>>; -type Range = RangeSet; +pub type IntMap = HashMap>>; +pub type IntSet = HashSet>>; +pub type CharRange = RangeSet; /// Represents a term that can be either a regular expression or a finite automaton. This term can be manipulated with a wide range of operations. /// -/// To put constraint and limitation on the execution of operations please refer to [`execution_profile::ExecutionProfile`]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +/// # Examples +/// ```rust +/// use regexsolver::Term; +/// use regexsolver::error::EngineError; +/// +/// fn main() -> Result<(), EngineError> { +/// // Create terms from regex +/// let t1 = Term::from_pattern("abc.*")?; +/// let t2 = Term::from_pattern(".*xyz")?; +/// +/// // Concatenate +/// let concat = t1.concat(&[t2])?; +/// assert_eq!(concat.to_pattern(), "abc.*xyz"); +/// +/// // Union +/// let union = t1.union(&[Term::from_pattern("fgh")?])?; +/// assert_eq!(union.to_pattern(), "(abc.*|fgh)"); +/// +/// // Intersection +/// let inter = Term::from_pattern("(ab|xy){2}")? +/// .intersection(&[Term::from_pattern(".*xy")?])?; +/// assert_eq!(inter.to_pattern(), "(ab|xy)xy"); +/// +/// // Difference +/// let diff = Term::from_pattern("a*")? +/// .difference(&Term::from_pattern("")?)?; +/// assert_eq!(diff.to_pattern(), "a+"); +/// +/// // Repetition +/// let rep = Term::from_pattern("abc")? +/// .repeat(2..=4)?; +/// assert_eq!(rep.to_pattern(), "(abc){2,4}"); +/// +/// // Analyze +/// assert_eq!(rep.length(), (Some(6), Some(12))); +/// assert!(!rep.is_empty()?); +/// +/// // Generate examples +/// let samples = Term::from_pattern("(x|y){1,3}")? 
+/// .generate_strings(5, 0)?; +/// println!("Some matches: {:?}", samples); +/// +/// // Equivalence & subset +/// let a = Term::from_pattern("a+")?; +/// let b = Term::from_pattern("a*")?; +/// assert!(!a.equivalent(&b)?); +/// assert!(a.subset(&b)?); +/// +/// Ok(()) +/// } +/// # main(); +/// ``` +/// +/// To put constraint and limitation on the execution of operations please refer to [`ExecutionProfile`]. +/// +/// # Tracing +/// +/// The core operations on [`Term`], [`FastAutomaton`], and [`RegularExpression`] +/// are instrumented with [`tracing`](https://docs.rs/tracing) spans (mostly at +/// `debug` level). Install a [`tracing-subscriber`](https://docs.rs/tracing-subscriber) +/// (or any other `tracing` subscriber) in your application to observe them; if +/// no subscriber is installed, instrumentation has negligible overhead and +/// produces no output. +/// +/// # Equality +/// +/// `PartialEq`/`Eq` (`==`) compare the **underlying representation**, not the +/// language. Two terms that match exactly the same strings can compare +/// unequal (for example, an automaton and an equivalent regular expression, or +/// two differently-written regexes for the same language). To compare +/// *languages*, use [`equivalent`](Self::equivalent); for `self ⊆ other`, use +/// [`subset`](Self::subset). #[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] +#[must_use = "terms are immutable; operations return a new term"] pub enum Term { - #[cfg_attr(feature = "serde", serde(rename = "regex"))] RegularExpression(RegularExpression), - #[cfg_attr(feature = "serde", serde(rename = "fair"))] Automaton(FastAutomaton), } +/// The default term is the empty language (matches nothing), the identity for +/// [`union`](Term::union). See [`new_empty`](Term::new_empty). 
+impl Default for Term { + fn default() -> Self { + Term::new_empty() + } +} + +impl Display for Term { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Term::RegularExpression(regular_expression) => write!(f, "{regular_expression}"), + Term::Automaton(fast_automaton) => write!(f, "{fast_automaton}"), + } + } +} + +/// Parses a pattern into a [`Term`], so patterns can be built with +/// [`str::parse`]. +/// +/// # Examples +/// +/// ``` +/// use regexsolver::Term; +/// +/// let term: Term = ".*abc.*".parse().unwrap(); +/// ``` +impl FromStr for Term { + type Err = EngineError; + + fn from_str(pattern: &str) -> Result { + Term::from_pattern(pattern) + } +} + +impl From for Term { + fn from(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } +} + +impl From for Term { + fn from(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) + } +} + impl Term { - /// Create a term based on the given pattern. + /// `Term` operations manage the underlying representation themselves, so + /// the determinizations they perform are by definition explicit: + /// they run with the profile's `implicit_determinization` setting + /// re-enabled (that knob targets direct [`FastAutomaton`] usage). The + /// rest of the profile is preserved. + fn run_with_implicit_determinization(f: impl FnOnce() -> R) -> R { + ExecutionProfile::get() + .with_implicit_determinization(true) + .apply(f) + } + + /// Creates a term that matches the empty language. + pub fn new_empty() -> Self { + Term::RegularExpression(RegularExpression::new_empty()) + } + + /// Creates a term that matches all possible strings. + pub fn new_total() -> Self { + Term::RegularExpression(RegularExpression::new_total()) + } + + /// Creates a term that only matches the empty string `""`. 
+ pub fn new_empty_string() -> Self { + Term::RegularExpression(RegularExpression::new_empty_string()) + } + + /// Parses and simplifies the provided pattern and returns a new [`Term`] holding the resulting [`RegularExpression`]. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex(".*abc.*").unwrap(); + /// let term = Term::from_pattern(".*abc.*").unwrap(); /// ``` - pub fn from_regex(regex: &str) -> Result { - Ok(Term::RegularExpression(RegularExpression::new(regex)?)) + pub fn from_pattern(pattern: &str) -> Result { + Ok(Term::RegularExpression(RegularExpression::new(pattern)?)) } - /// Compute the union of the given collection of terms. - /// Returns the resulting term. + /// Creates a new `Term` holding the provided [`RegularExpression`]. + pub fn from_regex(regex: RegularExpression) -> Self { + Term::RegularExpression(regex) + } + + /// Creates a new `Term` holding the provided [`FastAutomaton`]. + pub fn from_automaton(automaton: FastAutomaton) -> Self { + Term::Automaton(automaton) + } + + /// Computes the concatenation of the given terms. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("abc").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); - /// let term3 = Term::from_regex("fghi").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("d.").unwrap(); + /// let term3 = Term::from_pattern(".*").unwrap(); /// - /// let union = term1.union(&[term2, term3]).unwrap(); + /// let concat = term1.concat([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = union { - /// assert_eq!("(abc|de|fghi)", regex.to_string()); - /// } + /// assert_eq!("abcd.+", concat.to_pattern()); /// ``` - pub fn union(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - + #[tracing::instrument(level = "debug", skip_all)] + pub fn concat( + &self, + terms: impl IntoIterator>, + ) -> Result { let mut return_regex = RegularExpression::new_empty(); let mut return_automaton = FastAutomaton::new_empty(); + let mut has_automaton = false; match self { Term::RegularExpression(regular_expression) => { - return_regex = regular_expression.clone(); + return_regex = regular_expression.clone() } Term::Automaton(fast_automaton) => { + has_automaton = true; return_automaton = fast_automaton.clone(); } } - for operand in terms { - match operand { - Term::RegularExpression(regex) => { - return_regex = return_regex.union(regex); - if return_regex.is_total() { - return Ok(Term::RegularExpression(RegularExpression::new_total())); + for term in terms { + let term = term.borrow(); + if has_automaton { + return_automaton = return_automaton.concat(term.to_automaton()?.as_ref())?; + } else { + match term { + Term::RegularExpression(regular_expression) => { + return_regex = return_regex.concat(regular_expression, true); } - } - Term::Automaton(automaton) => { - return_automaton = return_automaton.union(automaton)?; - if return_automaton.is_total() { - return 
Ok(Term::RegularExpression(RegularExpression::new_total())); + Term::Automaton(fast_automaton) => { + has_automaton = true; + return_automaton = return_regex.to_automaton()?.concat(fast_automaton)?; } } } } - if return_automaton.is_empty() { + if !has_automaton { Ok(Term::RegularExpression(return_regex)) } else { - if !return_regex.is_empty() { - return_automaton = return_automaton.union(&return_regex.to_automaton()?)?; - } - - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) - } else { - Ok(Term::Automaton(return_automaton)) - } + Ok(Term::Automaton(return_automaton)) } } - /// Compute the intersection of the given collection of terms. - /// Returns the resulting term. + /// Computes the union of the given terms. /// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de){2}").unwrap(); - /// let term2 = Term::from_regex("de.*").unwrap(); - /// let term3 = Term::from_regex(".*abc").unwrap(); + /// let term1 = Term::from_pattern("abc").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); + /// let term3 = Term::from_pattern("fghi").unwrap(); /// - /// let intersection = term1.intersection(&[term2, term3]).unwrap(); + /// let union = term1.union([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = intersection { - /// assert_eq!("deabc", regex.to_string()); - /// } + /// assert_eq!("(abc|de|fghi)", union.to_pattern()); /// ``` - pub fn intersection(&self, terms: &[Term]) -> Result { - Self::check_number_of_terms(terms)?; - let mut return_automaton = self.get_automaton()?; - for term in terms { - let automaton = term.get_automaton()?; - return_automaton = Cow::Owned(return_automaton.intersection(&automaton)?); - if return_automaton.is_empty() { - return Ok(Term::RegularExpression(RegularExpression::new_empty())); + #[tracing::instrument(level = "debug", skip_all)] + pub fn union( + &self, + terms: impl IntoIterator>, + ) -> 
Result { + let terms: Vec<_> = terms.into_iter().collect(); + let terms: Vec<&Term> = terms.iter().map(Borrow::borrow).collect(); + + let mut has_automaton = matches!(self, Term::Automaton(_)); + if !has_automaton { + for term in &terms { + if matches!(term, Term::Automaton(_)) { + has_automaton = true; + break; + } } } - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + if has_automaton { + let parallel = cfg!(feature = "parallel") && terms.len() > 3; + + let automaton_list = self.get_automata(&terms, parallel)?; + + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + #[cfg(feature = "parallel")] + let return_automaton = if parallel { + FastAutomaton::union_all_par(automaton_list) + } else { + FastAutomaton::union_all(automaton_list) + }?; + #[cfg(not(feature = "parallel"))] + let return_automaton = FastAutomaton::union_all(automaton_list)?; + + Ok(Term::Automaton(return_automaton)) } else { - Ok(Term::Automaton(return_automaton.into_owned())) + let regexes_list = self.get_regexes(&terms); + + let regexes_list = regexes_list.iter().map(AsRef::as_ref).collect::>(); + + Ok(Term::RegularExpression(RegularExpression::union_all( + regexes_list, + ))) } } - /// Compute the subtraction/difference of the two given terms. - /// Returns the resulting term. + /// Computes the intersection of the given terms. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("de").unwrap(); + /// let term1 = Term::from_pattern("(abc|de){2}").unwrap(); + /// let term2 = Term::from_pattern("de.*").unwrap(); + /// let term3 = Term::from_pattern(".*abc").unwrap(); /// - /// let subtraction = term1.subtraction(&term2).unwrap(); + /// let intersection = term1.intersection([&term2, &term3]).unwrap(); /// - /// if let Term::RegularExpression(regex) = subtraction { - /// assert_eq!("abc", regex.to_string()); - /// } + /// assert_eq!("deabc", intersection.to_pattern()); /// ``` - pub fn subtraction(&self, subtrahend: &Term) -> Result { - let minuend_automaton = self.get_automaton()?; - let subtrahend_automaton = subtrahend.get_automaton()?; - let subtrahend_automaton = - Self::determinize_subtrahend(&minuend_automaton, &subtrahend_automaton)?; - let return_automaton = minuend_automaton.subtraction(&subtrahend_automaton)?; + #[tracing::instrument(level = "debug", skip_all)] + pub fn intersection( + &self, + terms: impl IntoIterator>, + ) -> Result { + let terms: Vec<_> = terms.into_iter().collect(); + let terms: Vec<&Term> = terms.iter().map(Borrow::borrow).collect(); + + let parallel = cfg!(feature = "parallel") && terms.len() > 3; + + let automaton_list = self.get_automata(&terms, parallel)?; - if let Some(regex) = return_automaton.to_regex() { - Ok(Term::RegularExpression(regex)) + let automaton_list = automaton_list.iter().map(AsRef::as_ref).collect::>(); + + #[cfg(feature = "parallel")] + let return_automaton = if terms.len() > 3 { + FastAutomaton::intersection_all_par(automaton_list) } else { - Ok(Term::Automaton(return_automaton)) - } + FastAutomaton::intersection_all(automaton_list) + }?; + #[cfg(not(feature = "parallel"))] + let return_automaton = FastAutomaton::intersection_all(automaton_list)?; + + Ok(Term::Automaton(return_automaton)) } - /// See 
[`Self::subtraction`]. - #[inline] - pub fn difference(&self, subtrahend: &Term) -> Result { - self.subtraction(subtrahend) + /// Computes the difference between `self` and `other`. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("de").unwrap(); + /// + /// let difference = term1.difference(&term2).unwrap(); + /// + /// assert_eq!("abc", difference.to_pattern()); + /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] + pub fn difference(&self, other: &Term) -> Result { + Self::run_with_implicit_determinization(|| { + let minuend_automaton = self.to_automaton()?; + let subtrahend_automaton = other.to_automaton()?; + // `FastAutomaton::difference` determinizes the subtrahend itself. + let return_automaton = minuend_automaton.difference(&subtrahend_automaton)?; + + Ok(Term::Automaton(return_automaton)) + }) } - /// Returns the Details of the given term. + /// Computes the complement of `self`. 
/// - /// # Example: + /// # Examples /// /// ``` - /// use regexsolver::{Term, cardinality::Cardinality}; + /// use regexsolver::Term; /// - /// let term = Term::from_regex("(abc|de)").unwrap(); + /// let term = Term::from_pattern("(abc|de)").unwrap(); /// - /// let details = term.get_details().unwrap(); + /// let complement = term.complement().unwrap(); /// - /// assert_eq!(Some(Cardinality::Integer(2)), *details.get_cardinality()); - /// assert_eq!((Some(2), Some(3)), *details.get_length()); - /// assert!(!details.is_empty()); - /// assert!(!details.is_total()); + /// assert!(term.intersection(&[complement.clone()]).unwrap().is_empty().unwrap()); + /// assert!(term.union(&[complement]).unwrap().is_total().unwrap()); /// ``` - pub fn get_details(&self) -> Result { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn complement(&self) -> Result { + Self::run_with_implicit_determinization(|| { + // `FastAutomaton::complement` determinizes `self` itself. + let mut automaton = self.to_automaton()?.into_owned(); + automaton.complement()?; + + Ok(Term::Automaton(automaton)) + }) + } + + /// Computes the repetition of the current term over the given range of + /// counts. + /// + /// An unbounded end (`n..`) means unlimited repetition; an unset start + /// (`..n` or `..=n`) means zero. Exclusive bounds are normalized to inclusive. 
+ /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("abc").unwrap(); + /// + /// assert_eq!("(abc)+", term.repeat(1..).unwrap().to_pattern()); + /// assert_eq!("(abc){3,5}", term.repeat(3..=5).unwrap().to_pattern()); + /// assert_eq!("(abc){3,5}", term.repeat(3..6).unwrap().to_pattern()); + /// assert_eq!("(abc){0,2}", term.repeat(..=2).unwrap().to_pattern()); + /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), min = tracing::field::Empty, max = tracing::field::Empty))] + pub fn repeat(&self, range: impl RangeBounds) -> Result { + let min = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.saturating_add(1), + Bound::Unbounded => 0, + }; + let max_opt = match range.end_bound() { + Bound::Included(&n) => Some(n), + Bound::Excluded(&n) => Some(n.saturating_sub(1)), + Bound::Unbounded => None, + }; + let span = tracing::Span::current(); + span.record("min", min); + span.record("max", tracing::field::debug(max_opt)); match self { - Term::RegularExpression(regex) => Ok(Details { - cardinality: Some(regex.get_cardinality()), - length: regex.get_length(), - empty: regex.is_empty(), - total: regex.is_total(), - }), - Term::Automaton(automaton) => Ok(Details { - cardinality: automaton.get_cardinality(), - length: automaton.get_length(), - empty: automaton.is_empty(), - total: automaton.is_total(), - }), + Term::RegularExpression(regular_expression) => Ok(Term::RegularExpression( + regular_expression.repeat(min, max_opt), + )), + Term::Automaton(fast_automaton) => { + let repeat_automaton = fast_automaton.repeat(min, max_opt)?; + Ok(Term::Automaton(repeat_automaton)) + } + } + } + + /// Generates up to `limit` distinct strings matched by the term, skipping the first `offset` strings. 
+ /// + /// Strings are only guaranteed to be distinct **within a single call**: + /// the offset fast-skips by counting paths, and in a non-deterministic + /// automaton the same string can be reached through several paths, so + /// calls with different offsets may repeat strings (or skip some). The + /// enumeration order also depends on the automaton's structure, so + /// offsets are only consistent across calls made on the same term. + /// + /// For pagination without repetition or skipped strings, make the term deterministic once and generate + /// from it. To check if a term is deterministic use [`is_deterministic`](Self::is_deterministic). + /// To determinize run [`determinize`](Self::determinize). + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// // Minimize once, then paginate with consistent offsets. + /// let term = Term::from_pattern("(abc|de){2}").unwrap().minimize().unwrap(); + /// + /// let batch = term.generate_strings(2, 0).unwrap(); + /// assert_eq!(2, batch.len()); // ["dede", "deabc"] + /// + /// let batch = term.generate_strings(2, 2).unwrap(); + /// assert_eq!(2, batch.len()); // ["abcde", "abcabc"] + /// ``` + #[tracing::instrument(level = "debug", skip(self), fields(self_deterministic = self.is_deterministic(), limit = limit, offset = offset))] + pub fn generate_strings( + &self, + limit: usize, + offset: usize, + ) -> Result, EngineError> { + self.to_automaton()?.generate_strings(limit, offset) + } + + /// Returns a lazy iterator over the strings matched by the term, fetched in + /// batches behind the scenes so you can stop early without choosing a limit + /// up front. + /// + /// The underlying automaton is computed once at construction time, not on + /// every batch. Each item is a `Result`: a construction or generation error + /// (e.g. a timeout from the active [`ExecutionProfile`]) surfaces as an + /// `Err`, after which the iterator ends. 
The same determinism caveat as + /// [`generate_strings`](Self::generate_strings) applies: call + /// [`determinize`](Self::determinize) (or [`minimize`](Self::minimize)) + /// first for distinct, stable enumeration. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("(abc|de){2}").unwrap().minimize().unwrap(); + /// + /// // Take the first three matches lazily. + /// let first_three = term + /// .iter_strings() + /// .take(3) + /// .collect::, _>>() + /// .unwrap(); + /// assert_eq!(3, first_three.len()); + /// ``` + pub fn iter_strings(&self) -> StringGenerator<'_> { + match self.to_automaton() { + Ok(automaton) => StringGenerator { + automaton: Some(automaton), + pending_error: None, + offset: 0, + buffer: VecDeque::new(), + }, + Err(e) => StringGenerator { + automaton: None, + pending_error: Some(e), + offset: 0, + buffer: VecDeque::new(), + }, } } - /// Generate strings matched by the given term. + /// Returns an equivalent term backed by a deterministic automaton. + /// + /// Already-deterministic terms are returned as-is. /// - /// # Example: + /// Determinization is always explicit, so it runs regardless of the + /// profile's [`implicit_determinization`](crate::execution_profile::ExecutionProfileBuilder::implicit_determinization) + /// setting. 
+ /// + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term = Term::from_regex("(abc|de){2}").unwrap(); + /// let term = Term::from_pattern(".*abc").unwrap(); + /// assert!(!term.is_deterministic()); /// - /// let strings = term.generate_strings(3).unwrap(); + /// let dfa = term.determinize().unwrap(); + /// assert!(dfa.is_deterministic()); + /// assert!(term.equivalent(&dfa).unwrap()); + /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn determinize(&self) -> Result { + let automaton = self.to_automaton()?; + let determinized = automaton.determinize()?.into_owned(); + Ok(Term::Automaton(determinized)) + } + + /// Returns an equivalent term backed by the minimal deterministic + /// automaton. + /// + /// # Examples /// - /// assert_eq!(3, strings.len()); // ex: ["deabc", "dede", "abcde"] /// ``` - pub fn generate_strings(&self, count: usize) -> Result, EngineError> { - Ok(self - .get_automaton()? - .generate_strings(count)? - .into_iter() - .collect()) + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern(".*abc").unwrap(); + /// let minimal = term.minimize().unwrap(); + /// assert!(minimal.is_minimal()); + /// assert!(term.equivalent(&minimal).unwrap()); + /// ``` + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), self_minimal = self.is_minimal()))] + pub fn minimize(&self) -> Result { + Self::run_with_implicit_determinization(|| { + let mut automaton = self.to_automaton()?.into_owned(); + automaton.minimize()?; + Ok(Term::Automaton(automaton)) + }) } - /// Compute if the two given terms are equivalent. + /// Returns `true` if both terms accept the same language. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("(abc|de)").unwrap(); - /// let term2 = Term::from_regex("(abc|de)*").unwrap(); + /// let term1 = Term::from_pattern("(abc|de)").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)*").unwrap(); /// - /// assert!(!term1.are_equivalent(&term2).unwrap()); + /// assert!(!term1.equivalent(&term2).unwrap()); /// ``` - pub fn are_equivalent(&self, that: &Term) -> Result { - if self == that { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] + pub fn equivalent(&self, other: &Term) -> Result { + if self == other { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; - automaton_1.is_equivalent_of(&automaton_2) + Self::run_with_implicit_determinization(|| { + let automaton_1 = self.to_automaton()?; + let automaton_2 = other.to_automaton()?; + automaton_1.equivalent(&automaton_2) + }) } - /// Compute if the first term is a subset of the second one. + /// Returns `true` if all strings matched by the current term are also matched by the given term. 
/// - /// # Example: + /// # Examples /// /// ``` /// use regexsolver::Term; /// - /// let term1 = Term::from_regex("de").unwrap(); - /// let term2 = Term::from_regex("(abc|de)").unwrap(); + /// let term1 = Term::from_pattern("de").unwrap(); + /// let term2 = Term::from_pattern("(abc|de)").unwrap(); /// - /// assert!(term1.is_subset_of(&term2).unwrap()); + /// assert!(term1.subset(&term2).unwrap()); /// ``` - pub fn is_subset_of(&self, that: &Term) -> Result { - if self == that { + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic(), other_deterministic = other.is_deterministic()))] + pub fn subset(&self, other: &Term) -> Result { + if self == other { return Ok(true); } - let automaton_1 = self.get_automaton()?; - let automaton_2 = that.get_automaton()?; - automaton_1.is_subset_of(&automaton_2) + Self::run_with_implicit_determinization(|| { + let automaton_1 = self.to_automaton()?; + let automaton_2 = other.to_automaton()?; + automaton_1.subset(&automaton_2) + }) } - fn check_number_of_terms(terms: &[Term]) -> Result<(), EngineError> { - let number_of_terms = terms.len() + 1; - let max_number_of_terms = ThreadLocalParams::get_max_number_of_terms(); - if number_of_terms > max_number_of_terms { - Err(EngineError::TooMuchTerms( - max_number_of_terms, - number_of_terms, - )) - } else { - Ok(()) + /// Returns `true` if the term matches the given string. + /// + /// Matching is **anchored** (full-string), consistent with the rest of the + /// crate: the whole input must be accepted, not just a substring. 
+ /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// let term = Term::from_pattern("abc.*").unwrap(); + /// + /// assert!(term.matches("abcdef").unwrap()); + /// assert!(!term.matches("xyzabc").unwrap()); + /// ``` + #[tracing::instrument(level = "debug", skip(self, input), fields(self_deterministic = self.is_deterministic(), input_len = input.len()))] + pub fn matches(&self, input: &str) -> Result { + Ok(self.to_automaton()?.is_match(input)) + } + + /// Returns `true` if the term matches the empty language (no strings at all). + /// + /// Note: the empty language is distinct from the language containing only + /// the empty string `""`. Use [`is_empty_string`](Self::is_empty_string) to + /// test for the latter. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::new_empty().is_empty().unwrap()); + /// assert!(!Term::new_empty_string().is_empty().unwrap()); // matches "" + /// assert!(!Term::from_pattern("abc").unwrap().is_empty().unwrap()); + /// ``` + pub fn is_empty(&self) -> Result { + Ok(match self { + Term::RegularExpression(regex) => regex.is_empty(), + Term::Automaton(automaton) => automaton.is_empty(), + }) + } + + /// Returns `true` if the term matches all possible strings. 
+ pub fn is_total(&self) -> Result { + match self { + Term::RegularExpression(regex) => Ok(regex.is_total()), + Term::Automaton(automaton) => { + if automaton.is_total() { + Ok(true) + } else if automaton.is_deterministic() { + Ok(false) + } else { + Ok(automaton.determinize()?.is_total()) + } + } } } - fn determinize_subtrahend<'a>( - minuend: &FastAutomaton, - subtrahend: &'a FastAutomaton, - ) -> Result, EngineError> { - if subtrahend.is_determinitic() { - Ok(Cow::Borrowed(subtrahend)) - } else if !minuend.is_cyclic() && subtrahend.is_cyclic() { - Ok(Cow::Owned(minuend.intersection(subtrahend)?.determinize()?)) - } else { - Ok(Cow::Owned(subtrahend.determinize()?)) + /// Returns `true` if the term matches only the empty string `""`. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::new_empty_string().is_empty_string().unwrap()); + /// assert!(!Term::new_empty().is_empty_string().unwrap()); + /// assert!(!Term::from_pattern("a*").unwrap().is_empty_string().unwrap()); + /// ``` + pub fn is_empty_string(&self) -> Result { + Ok(match self { + Term::RegularExpression(regex) => regex.is_empty_string(), + Term::Automaton(automaton) => automaton.is_empty_string(), + }) + } + + /// Returns `true` if the term is *already backed by* a deterministic + /// automaton. + /// + /// A deterministic automaton has one path per accepted string. + /// + /// To determinize a term call [`determinize`](Self::determinize). + #[must_use] + pub fn is_deterministic(&self) -> bool { + match self { + Term::RegularExpression(_) => false, + Term::Automaton(automaton) => automaton.is_deterministic(), + } + } + + /// Returns `true` if the term is *already backed by* the minimal + /// deterministic automaton. + /// + /// The minimal deterministic automaton of a given language is unique. + /// + /// To minimize a term call [`minimize`](Self::minimize). 
+ #[must_use] + pub fn is_minimal(&self) -> bool { + match self { + Term::RegularExpression(_) => false, + Term::Automaton(automaton) => automaton.is_minimal(), + } + } + + /// Returns the minimum and maximum length of matched strings. + /// + /// `None` for the minimum means the language is empty (no strings are + /// matched). `None` for the maximum means the language is infinite + /// (unbounded match length). + #[must_use] + pub fn length(&self) -> (Option, Option) { + match self { + Term::RegularExpression(regex) => regex.length(), + Term::Automaton(automaton) => automaton.length(), } } - fn get_automaton(&self) -> Result, EngineError> { + /// Returns the cardinality of the term (the number of possible matched strings). + /// + /// The exact count is represented as `u32`. If the exact count exceeds + /// `u32::MAX`, the result is `Cardinality::BigInteger` rather than a + /// truncated value. Infinite languages return `Cardinality::Infinite`. + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn cardinality(&self) -> Result, EngineError> { + match self { + Term::RegularExpression(regex) => Ok(regex.cardinality()), + Term::Automaton(automaton) => { + Self::run_with_implicit_determinization(|| automaton.cardinality()) + } + } + } + + /// Returns `true` if the term matches a finite number of strings. + /// + /// A finite language is one with no unbounded repetition (`*`, `+`, ...). + /// Convenience over [`cardinality`](Self::cardinality) when only the + /// finite/infinite distinction matters. + /// + /// # Examples + /// + /// ``` + /// use regexsolver::Term; + /// + /// assert!(Term::from_pattern("(ab|c){2}").unwrap().is_finite().unwrap()); + /// assert!(!Term::from_pattern("a+").unwrap().is_finite().unwrap()); + /// ``` + pub fn is_finite(&self) -> Result { + Ok(!matches!(self.cardinality()?, Cardinality::Infinite)) + } + + /// Converts the term to a [`FastAutomaton`]. 
+ /// + /// Returns a [`Cow`]: borrows the automaton when the term is already + /// automaton-backed, and allocates a new one when converting from a + /// [`RegularExpression`]. + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn to_automaton(&self) -> Result, EngineError> { Ok(match self { Term::RegularExpression(regex) => Cow::Owned(regex.to_automaton()?), Term::Automaton(automaton) => Cow::Borrowed(automaton), }) } -} -/// Represents details about a [Term]. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone, PartialEq, Eq, Debug)] -#[cfg_attr(feature = "serde", serde(tag = "type", rename = "details"))] -pub struct Details { - cardinality: Option>, - length: (Option, Option), - empty: bool, - total: bool, -} + /// Converts the term to a [`RegularExpression`]. + /// + /// Returns a [`Cow`]: borrows the expression when the term is already + /// regex-backed, and allocates a new one when converting from a + /// [`FastAutomaton`] via state elimination. + #[must_use] + #[tracing::instrument(level = "debug", skip_all, fields(self_deterministic = self.is_deterministic()))] + pub fn to_regex(&self) -> Cow<'_, RegularExpression> { + match self { + Term::RegularExpression(regex) => Cow::Borrowed(regex), + Term::Automaton(automaton) => Cow::Owned(automaton.to_regex()), + } + } -impl Details { - /// Return the number of unique strings matched. - pub fn get_cardinality(&self) -> &Option> { - &self.cardinality + /// Converts the term to a regular expression pattern. + #[must_use] + pub fn to_pattern(&self) -> String { + self.to_regex().to_string() } - /// Return the minimum and the maximum length of matched strings. 
- pub fn get_length(&self) -> &(Option, Option) { - &self.length + fn get_automata<'a>( + &'a self, + terms: &[&'a Term], + parallel: bool, + ) -> Result>, EngineError> { + let mut automaton_list = Vec::with_capacity(terms.len() + 1); + automaton_list.push(self.to_automaton()?); + + #[cfg(feature = "parallel")] + let mut terms_automata = if parallel { + let execution_profile = ExecutionProfile::get(); + terms + .par_iter() + .map(|a| execution_profile.apply(|| a.to_automaton())) + .collect::, _>>() + } else { + terms + .iter() + .map(|a| a.to_automaton()) + .collect::, _>>() + }?; + #[cfg(not(feature = "parallel"))] + let mut terms_automata = { + let _ = parallel; + terms + .iter() + .map(|a| a.to_automaton()) + .collect::, EngineError>>()? + }; + automaton_list.append(&mut terms_automata); + + Ok(automaton_list) } - /// Return `true` if it does not match any string. - pub fn is_empty(&self) -> bool { - self.empty + fn get_regexes<'a>(&'a self, terms: &[&'a Term]) -> Vec> { + let mut regex_list = Vec::with_capacity(terms.len() + 1); + regex_list.push(self.to_regex()); + + let mut terms_regexes = terms.iter().map(|a| a.to_regex()).collect::>(); + regex_list.append(&mut terms_regexes); + + regex_list } +} + +/// Lazy iterator over the strings matched by a [`Term`], created by +/// [`Term::iter_strings`]. +/// +/// The underlying automaton is computed once at construction. Yields +/// `Result`: errors (from construction or generation) +/// are surfaced as `Err` items, after which the iterator ends. +pub struct StringGenerator<'a> { + automaton: Option>, + pending_error: Option, + offset: usize, + buffer: VecDeque, +} + +impl Iterator for StringGenerator<'_> { + type Item = Result; - /// Return `true` if it match all possible strings. 
- pub fn is_total(&self) -> bool { - self.total + fn next(&mut self) -> Option { + const BATCH: usize = 32; + + if let Some(s) = self.buffer.pop_front() { + return Some(Ok(s)); + } + if let Some(e) = self.pending_error.take() { + return Some(Err(e)); + } + let automaton = self.automaton.as_ref()?; + match automaton.generate_strings(BATCH, self.offset) { + Ok(batch) => { + if batch.len() < BATCH { + self.automaton = None; + } + self.offset += batch.len(); + self.buffer.extend(batch); + self.buffer.pop_front().map(Ok) + } + Err(e) => { + self.automaton = None; + Some(Err(e)) + } + } } } @@ -360,43 +916,72 @@ mod tests { use super::*; #[test] - fn test_details() -> Result<(), String> { - let regex1 = Term::from_regex("a").unwrap(); - let regex2 = Term::from_regex("b").unwrap(); + fn test_complement() -> Result<(), String> { + let term = Term::from_pattern("(abc|de)").unwrap(); + + let complement = term.complement().unwrap(); + + assert!( + term.intersection([&complement]) + .unwrap() + .is_empty() + .unwrap() + ); + + println!("term: {}", term.to_automaton().unwrap().to_dot()); - let details = regex1.intersection(&vec![regex2]); - assert!(details.is_ok()); + if let Term::Automaton(complement) = &complement { + println!("complement: {}", complement.to_dot()); + } + + let union = term.union(&[complement]).unwrap(); + if let Term::Automaton(union) = &union { + println!("{}", union.to_dot()); + let union = union.determinize().unwrap(); + println!("{}", union.to_dot()); + } + + assert!(union.is_total().unwrap()); Ok(()) } #[test] - fn test_subtraction_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("").unwrap(); + fn test_intersection() -> Result<(), String> { + let regex1 = Term::from_pattern("a").unwrap(); + let regex2 = Term::from_pattern("b").unwrap(); + + let intersection = regex1.intersection(&[regex2]).unwrap(); + assert!(intersection.is_empty().unwrap()); + assert_eq!("[]", intersection.to_pattern()); + + 
Ok(()) + } - let result = regex1.subtraction(®ex2); + #[test] + fn test_difference_1() -> Result<(), String> { + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("").unwrap(); + + let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!( - Term::RegularExpression(RegularExpression::new("a+").unwrap()), - result - ); + let result = result.unwrap().to_pattern(); + assert_eq!("a+", result); Ok(()) } #[test] - fn test_subtraction_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + fn test_difference_2() -> Result<(), String> { + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 = Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.subtraction(®ex2); + let result = regex1.difference(®ex2); assert!(result.is_ok()); - let result = result.unwrap(); + let result = result.unwrap().to_regex().into_owned(); assert_eq!( - Term::RegularExpression(RegularExpression::new("(xxx)*(x|xx)").unwrap()), - result + Term::RegularExpression(RegularExpression::new("x(x{3})*x?").unwrap()), + Term::RegularExpression(result) ); Ok(()) @@ -404,41 +989,227 @@ mod tests { #[test] fn test_intersection_1() -> Result<(), String> { - let regex1 = Term::from_regex("a*").unwrap(); - let regex2 = Term::from_regex("b*").unwrap(); + let regex1 = Term::from_pattern("a*").unwrap(); + let regex2 = Term::from_pattern("b*").unwrap(); - let result = regex1.intersection(&vec![regex2]); + let result = regex1.intersection(&[regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!(Term::from_regex("").unwrap(), result); + let result = result.unwrap().to_pattern(); + assert_eq!("", result); Ok(()) } #[test] fn test_intersection_2() -> Result<(), String> { - let regex1 = Term::from_regex("x*").unwrap(); - let regex2 = Term::from_regex("(xxx)*").unwrap(); + let regex1 = Term::from_pattern("x*").unwrap(); + let regex2 
= Term::from_pattern("(xxx)*").unwrap(); - let result = regex1.intersection(&vec![regex2]); + let result = regex1.intersection(&[regex2]); assert!(result.is_ok()); - let result = result.unwrap(); - assert_eq!( - Term::RegularExpression(RegularExpression::new("(x{3})*").unwrap()), - result - ); + let result = result.unwrap().to_pattern(); + assert_eq!("(x{3})*", result); Ok(()) } #[test] - fn test__() -> Result<(), String> { - let term = Term::from_regex("(abc|de){2}").unwrap(); + fn test_default_is_empty_language() { + assert!(Term::default().is_empty().unwrap()); + assert_eq!(Term::default(), Term::new_empty()); + } - let strings = term.generate_strings(3).unwrap(); + #[test] + fn test_iter_strings_exhaustive_matches_generate_strings() { + // A finite, deterministic term: lazy iteration must yield exactly the + // same multiset as a single large `generate_strings` call, with no + // duplicates or omissions across batch boundaries. + let term = Term::from_pattern("[A-Za-z0-9]") + .unwrap() + .minimize() + .unwrap(); - println!("strings={:?}", strings); + let eager = term.generate_strings(1000, 0).unwrap(); + let lazy = term.iter_strings().collect::, _>>().unwrap(); - Ok(()) + assert_eq!(eager.len(), lazy.len()); + assert_eq!(eager, lazy); + assert_eq!(62, lazy.len()); + } + + #[test] + fn test_is_finite() { + assert!( + Term::from_pattern("(ab|c){2}") + .unwrap() + .is_finite() + .unwrap() + ); + assert!(!Term::from_pattern("a+").unwrap().is_finite().unwrap()); + } + + #[test] + fn test_matches_is_anchored() { + let term = Term::from_pattern("abc.*").unwrap(); + assert!(term.matches("abc").unwrap()); + assert!(term.matches("abcdef").unwrap()); + // Anchored: a prefix/suffix match is not enough. + assert!(!term.matches("xyzabc").unwrap()); + + let exact = Term::from_pattern("abc").unwrap(); + assert!(exact.matches("abc").unwrap()); + assert!(!exact.matches("abcd").unwrap()); + + // Works on an automaton-backed term too. 
+ let automaton_backed = exact.intersection([&term]).unwrap(); + assert!(matches!(automaton_backed, Term::Automaton(_))); + assert!(automaton_backed.matches("abc").unwrap()); + assert!(!automaton_backed.matches("abcd").unwrap()); + + // The empty language matches nothing; the empty string matches only "". + assert!(!Term::new_empty().matches("").unwrap()); + assert!(Term::new_empty_string().matches("").unwrap()); + assert!(!Term::new_empty_string().matches("a").unwrap()); + } + + #[test] + fn test_from_str_and_from_conversions() { + // `FromStr` agrees with `from_pattern`. + let parsed: Term = "abc".parse().unwrap(); + assert_eq!(parsed, Term::from_pattern("abc").unwrap()); + + // Invalid patterns surface as parse errors (backreferences are not regular). + assert!(r"(a)\1".parse::().is_err()); + + // `From` / `From` match the explicit constructors. + let regex = RegularExpression::new("abc").unwrap(); + let from_into: Term = regex.clone().into(); + assert_eq!(from_into, Term::from_regex(regex)); + + let automaton = Term::from_pattern("abc") + .unwrap() + .to_automaton() + .unwrap() + .into_owned(); + let from_into: Term = automaton.clone().into(); + assert_eq!(from_into, Term::from_automaton(automaton)); + } + + #[test] + fn test_is_deterministic_and_determinize() { + // A pattern-backed term is never reported deterministic (NFA form). + let regex_term = Term::from_pattern("(abc|de){2}").unwrap(); + assert!(!regex_term.is_deterministic()); + + // `determinize` produces a deterministic, language-equivalent term. + let dfa = regex_term.determinize().unwrap(); + assert!(dfa.is_deterministic()); + assert!(regex_term.equivalent(&dfa).unwrap()); + + // Determinizing an already-deterministic term keeps it deterministic + // and equivalent. + let dfa2 = dfa.determinize().unwrap(); + assert!(dfa2.is_deterministic()); + assert!(dfa.equivalent(&dfa2).unwrap()); + } + + #[test] + fn test_is_minimal_and_minimize() { + // A pattern-backed term is never reported minimal. 
+ let regex_term = Term::from_pattern("(abc|de){2}").unwrap(); + assert!(!regex_term.is_minimal()); + + // `minimize` produces a minimal, language-equivalent term. + let minimal = regex_term.minimize().unwrap(); + assert!(minimal.is_minimal()); + assert!(minimal.is_deterministic()); // minimal implies deterministic + assert!(regex_term.equivalent(&minimal).unwrap()); + } + + #[test] + fn test_eq_is_structural_not_language() { + // Same language, different representation: structurally unequal, but + // language-equivalent. `==` must not be mistaken for `equivalent`. + let regex_term = Term::from_pattern("(a|b)*").unwrap(); + let automaton_term = Term::from_automaton(regex_term.to_automaton().unwrap().into_owned()); + + assert_ne!(regex_term, automaton_term); + assert!(regex_term.equivalent(&automaton_term).unwrap()); + } + + #[test] + fn test_repeat_range_edges() { + let term = Term::from_pattern("abc").unwrap(); + + // Unbounded / unset bounds. + assert_eq!("(abc)*", term.repeat(..).unwrap().to_pattern()); + assert_eq!("(abc){2,}", term.repeat(2..).unwrap().to_pattern()); + assert_eq!("(abc){0,2}", term.repeat(..3).unwrap().to_pattern()); + + // Zero repetitions is the empty string. + assert!(term.repeat(0..=0).unwrap().is_empty_string().unwrap()); + + // A range whose normalized max < min denotes no valid repetition count, + // so the simplifier reduces it to the empty language (matches nothing). + // (Bounds from variables: a literal reversed range trips a lint.) + let (min, max) = (5u32, 3u32); + assert!(term.repeat(min..max).unwrap().is_empty().unwrap()); + } + + #[test] + fn test_iter_strings_is_lazy_on_infinite_language() { + // Must not hang on an infinite language: take a finite prefix. 
+ let term = Term::from_pattern("a+").unwrap(); + let first = term + .iter_strings() + .take(5) + .collect::, _>>() + .unwrap(); + assert_eq!(5, first.len()); + } + + #[test] + fn test_iter_strings_propagates_error_then_ends() { + use crate::execution_profile::ExecutionProfileBuilder; + + // A tight state budget makes the underlying `to_automaton` fail; the + // iterator must surface that error once and then terminate. + let term = Term::from_pattern("abcdef").unwrap(); + let profile = ExecutionProfileBuilder::new() + .max_number_of_states(1) + .build(); + + profile.run(|| { + let mut it = term.iter_strings(); + assert!(matches!( + it.next(), + Some(Err(EngineError::AutomatonHasTooManyStates)) + )); + assert!(it.next().is_none()); + }); + } + + #[test] + fn test_variadic_ops_with_no_operands_equal_self() { + let term = Term::from_pattern("abc").unwrap(); + + assert!( + term.concat(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); + assert!( + term.union(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); + assert!( + term.intersection(std::iter::empty::<&Term>()) + .unwrap() + .equivalent(&term) + .unwrap() + ); } } diff --git a/src/regex/analyze/affixes.rs b/src/regex/analyze/affixes.rs index 4213e3f..bb8a5ee 100644 --- a/src/regex/analyze/affixes.rs +++ b/src/regex/analyze/affixes.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { - pub fn get_common_affixes( + pub(crate) fn get_common_affixes( &self, other: &RegularExpression, ) -> ( @@ -21,7 +21,7 @@ impl RegularExpression { (common_prefix, (self_regex, other_regex), common_suffix) } - pub fn get_common_affix( + pub(crate) fn get_common_affix( &self, other: &RegularExpression, is_prefix: bool, @@ -46,27 +46,27 @@ impl RegularExpression { let other_regex; match (self, other) { - (RegularExpression::Concat(_), _) => { + (RegularExpression::Concat(..), _) => { (common_affix, (self_regex, other_regex)) = 
Self::opaffix_concat_and_other(self, other, is_prefix); } - (_, RegularExpression::Concat(_)) => { + (_, RegularExpression::Concat(..)) => { (common_affix, (other_regex, self_regex)) = Self::opaffix_concat_and_other(other, self, is_prefix); } - (RegularExpression::Character(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Character(..), RegularExpression::Repetition(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_character_and_repetition(self, other); } - (RegularExpression::Repetition(_, _, _), RegularExpression::Character(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Character(..)) => { (common_affix, (other_regex, self_regex)) = Self::opaffix_character_and_repetition(other, self); } - (RegularExpression::Repetition(_, _, _), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Repetition(..), RegularExpression::Repetition(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_repetition_and_repetition(self, other); } - (RegularExpression::Alternation(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Alternation(..)) => { (common_affix, (self_regex, other_regex)) = Self::opaffix_alternation_and_alternation(self, other); } @@ -86,11 +86,17 @@ impl RegularExpression { (RegularExpression, RegularExpression), ) { if let ( - RegularExpression::Character(_), + RegularExpression::Character(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_character, that_repetition) { - if this_character == &**that_regex && *that_min == 1 { + // The `max != 0` guard keeps a directly-constructed invalid + // repetition (`r{1,0}`) from underflowing; such trees are + // rejected by `to_automaton`, the simplifier just must not panic. 
+ if this_character == &**that_regex + && *that_min == 1 + && that_max_opt.is_none_or(|that_max| that_max >= 1) + { let new_max = that_max_opt.as_ref().map(|that_max| that_max - 1); ( Some(this_character.clone()), @@ -285,6 +291,7 @@ mod tests { assert_regex_affix(true, "(ab|cd)x", "(ab|cd)y", "(ab|cd)", "x", "y"); assert_regex_affix(true, "a+", "a+b", "a+", "", "b"); + assert_regex_affix(true, "(ab|cd)", "(ab|cd)", "(ab|cd)", "", ""); Ok(()) } diff --git a/src/regex/analyze/mod.rs b/src/regex/analyze/mod.rs index ae08148..5e2ab8b 100644 --- a/src/regex/analyze/mod.rs +++ b/src/regex/analyze/mod.rs @@ -6,7 +6,9 @@ mod affixes; mod number_of_states; impl RegularExpression { - pub fn get_length(&self) -> (Option, Option) { + /// Returns the minimum and maximum length of possible matched strings. + #[must_use] + pub fn length(&self) -> (Option, Option) { match self { RegularExpression::Character(range) => { if range.is_empty() { @@ -15,7 +17,7 @@ impl RegularExpression { (Some(1), Some(1)) } RegularExpression::Repetition(regex, min, max_opt) => { - let (min_length, max_length_opt) = regex.get_length(); + let (min_length, max_length_opt) = regex.length(); if let Some(min_length) = min_length { let new_min_length = min * min_length; let new_max_length = if let Some(max_length) = max_length_opt { @@ -35,7 +37,7 @@ impl RegularExpression { let mut new_max_length = Some(0); for concat_element in concat_vec { - let (min_length, max_length_opt) = concat_element.get_length(); + let (min_length, max_length_opt) = concat_element.length(); if let Some(min_length) = min_length { new_min_length += min_length; @@ -62,7 +64,7 @@ impl RegularExpression { let mut new_max_length = Some(0); for alternation_element in alternation_vec { - let (min_length, max_length_opt) = alternation_element.get_length(); + let (min_length, max_length_opt) = alternation_element.length(); if let Some(min_length) = min_length { new_min_length = cmp::min(new_min_length, min_length); @@ -84,7 +86,8 @@ impl 
RegularExpression { } } - pub fn get_cardinality(&self) -> Cardinality { + /// Returns the cardinality of the regular expression (i.e., the number of possible matched strings). + pub fn cardinality(&self) -> Cardinality { if self.is_empty() { return Cardinality::Integer(0); } else if self.is_total() { @@ -94,7 +97,7 @@ impl RegularExpression { RegularExpression::Character(range) => Cardinality::Integer(range.get_cardinality()), RegularExpression::Repetition(regular_expression, min, max_opt) => { if let Some(max) = max_opt { - let regex_cardinality = regular_expression.get_cardinality(); + let regex_cardinality = regular_expression.cardinality(); if let Cardinality::Integer(cardinality) = regex_cardinality { let mut cardinality_temp: u32 = 0; for i in *min..*max + 1 { @@ -119,7 +122,7 @@ impl RegularExpression { RegularExpression::Concat(concat) => { let mut cardinality: u32 = 1; for concat_element in concat { - let element_cardinality = concat_element.get_cardinality(); + let element_cardinality = concat_element.cardinality(); if let Cardinality::Integer(element_cardinality) = element_cardinality { if let Some(mult) = cardinality.checked_mul(element_cardinality) { cardinality = mult; @@ -135,7 +138,7 @@ impl RegularExpression { RegularExpression::Alternation(alternation) => { let mut cardinality: u32 = 0; for alternation_element in alternation { - let element_cardinality = alternation_element.get_cardinality(); + let element_cardinality = alternation_element.cardinality(); if let Cardinality::Integer(element_cardinality) = element_cardinality { if let Some(add) = cardinality.checked_add(element_cardinality) { cardinality = add; @@ -169,16 +172,18 @@ mod tests { assert_length("(at?)"); assert_length("(ot){3,4}"); assert_length("(ot?d){1,4}"); - assert_length("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_length( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); 
assert_eq!( - FastAutomaton::new_empty().get_length(), - RegularExpression::new_empty().get_length() + FastAutomaton::new_empty().length(), + RegularExpression::new_empty().length() ); assert_eq!( - FastAutomaton::new_total().get_length(), - RegularExpression::new_total().get_length() + FastAutomaton::new_total().length(), + RegularExpression::new_total().length() ); Ok(()) } @@ -187,12 +192,12 @@ mod tests { println!("{}", regex); let regex = RegularExpression::new(regex).unwrap(); - let (min, max_opt) = regex.get_length(); + let (min, max_opt) = regex.length(); let automaton = regex.to_automaton().unwrap(); //automaton.to_dot(); - let (min_automaton_opt, max_automaton_opt) = automaton.get_length(); + let (min_automaton_opt, max_automaton_opt) = automaton.length(); assert_eq!((min_automaton_opt, max_automaton_opt), (min, max_opt)); } @@ -212,7 +217,9 @@ mod tests { assert_cardinality("(ot){3,4}"); assert_cardinality("(t){1,3}"); assert_cardinality("(ot?d){1,4}"); - assert_cardinality("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_cardinality( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); Ok(()) } @@ -220,17 +227,13 @@ mod tests { println!("{}", regex); let regex = RegularExpression::new(regex).unwrap(); - let cardinality = regex.get_cardinality(); - - let mut automaton = regex.to_automaton().unwrap(); - - if !automaton.is_cyclic() { - automaton = automaton.determinize().unwrap(); - } - - //automaton.to_dot(); + let cardinality = regex.cardinality(); - let expected = automaton.get_cardinality().unwrap(); + let automaton = regex.to_automaton().unwrap(); + // `cardinality` returns `Infinite` for cyclic automata without + // determinizing and only determinizes the finite (acyclic) + // non-deterministic ones internally. 
+ let expected = automaton.cardinality().unwrap(); assert_eq!(expected, cardinality); } diff --git a/src/regex/analyze/number_of_states.rs b/src/regex/analyze/number_of_states.rs index 90c1897..90d41ca 100644 --- a/src/regex/analyze/number_of_states.rs +++ b/src/regex/analyze/number_of_states.rs @@ -9,7 +9,7 @@ struct AbstractStateMetadata { } impl AbstractStateMetadata { - pub fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { + pub(crate) fn new(has_incoming_edges: bool, has_outgoing_edges: bool) -> Self { AbstractStateMetadata { has_incoming_edges, has_outgoing_edges, @@ -17,7 +17,7 @@ impl AbstractStateMetadata { } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct AbstractNFAMetadata { start: AbstractStateMetadata, accepted: Vec, @@ -25,7 +25,7 @@ struct AbstractNFAMetadata { } impl AbstractNFAMetadata { - pub fn new() -> Self { + pub(crate) fn new() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, true), accepted: vec![AbstractStateMetadata::new(true, false)], @@ -33,7 +33,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty_string() -> Self { + pub(crate) fn new_empty_string() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![AbstractStateMetadata::new(false, false)], @@ -41,7 +41,7 @@ impl AbstractNFAMetadata { } } - pub fn new_empty() -> Self { + pub(crate) fn new_empty() -> Self { AbstractNFAMetadata { start: AbstractStateMetadata::new(false, false), accepted: vec![], @@ -49,7 +49,7 @@ impl AbstractNFAMetadata { } } - pub fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn concat(&self, nfa: &AbstractNFAMetadata) -> Self { let start_state_and_accept_states_not_mergeable = nfa.start.has_incoming_edges && self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -68,7 +68,24 @@ impl AbstractNFAMetadata { } } - pub fn repeat(&self, min: u32, max_opt: &Option) -> Self { + pub(crate) fn repeat(&self, min: u32, max_opt: &Option) -> Self { + // r⁰ = {""} 
(the empty-string automaton, a single state). + if max_opt == &Some(0) { + return Self::new_empty_string(); + } + + // Unbounded with min >= 1: `repeat_mut` builds r{min,} = rᵐⁱⁿ · r*. + // Mirror that here (mandatory copies via merging concatenation, then a + // recursively-built star) so the predicted count stays consistent with + // the construction even when the start state has incoming edges. + if max_opt.is_none() && min >= 1 { + let mut acc = self.clone(); + for _ in 1..min { + acc = acc.concat(self); + } + return acc.concat(&self.repeat(0, &None)); + } + let start_state_not_mergeable = self.start.has_incoming_edges; let accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); let start_state_or_accept_states_not_mergeable = @@ -88,7 +105,13 @@ impl AbstractNFAMetadata { return_accepted.push(return_start.clone()); if max_opt.is_none() { let return_number_of_states = if !start_state_or_accept_states_not_mergeable { - self.number_of_states - 1 + // An automaton always has at least one state. Degenerate + // sub-expressions denoting {""} (e.g. an unsimplified + // `(a{0,0})*`) reach this point with a single state, and + // the merge discount must not drive the count to zero; + // every later `- 1` in this module relies on counts + // staying >= 1. + (self.number_of_states - 1).max(1) } else { self.number_of_states }; @@ -105,13 +128,30 @@ impl AbstractNFAMetadata { } let return_number_of_states = if let Some(max) = max_opt { - let mult = if start_state_not_mergeable && (accepted_not_mergeable || min == 0) { + // Mirror `repeat_mut`: rᵐⁱⁿ mandatory copies built by merging + // concatenation, then `max - max(min,1)` optional tail copies. A + // tail copy whose start has incoming edges is concatenated without + // merging (a fresh start state, so +`number_of_states`); otherwise + // it merges (+`number_of_states - 1`). 
+ let max = *max as usize; + let merge_cost = if start_state_not_mergeable && accepted_not_mergeable { + self.number_of_states + } else { + self.number_of_states - 1 + }; + let tail_cost = if start_state_not_mergeable { self.number_of_states } else { self.number_of_states - 1 }; - *max as usize * mult + 1 + if min == 0 { + let base = self.number_of_states + if start_state_not_mergeable { 1 } else { 0 }; + base + max.saturating_sub(1) * tail_cost + } else { + let mandatory = self.number_of_states + (min as usize - 1) * merge_cost; + mandatory + max.saturating_sub(min as usize) * tail_cost + } } else { let mult = if start_state_not_mergeable { self.number_of_states @@ -129,7 +169,7 @@ impl AbstractNFAMetadata { } } - pub fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { + pub(crate) fn alternate(&mut self, nfa: &AbstractNFAMetadata) -> Self { let self_start_state_not_mergeable = self.start.has_incoming_edges; let self_accepted_not_mergeable = self.accepted.iter().any(|s| s.has_outgoing_edges); @@ -156,19 +196,22 @@ impl AbstractNFAMetadata { AbstractNFAMetadata { start: return_start, accepted: return_accepted, - number_of_states: return_number_of_states, + // Both merge discounts can apply to two single-state {""} + // operands (e.g. `a{0,0}|b{0,0}`); clamp so the count never + // reaches zero (see `repeat`). + number_of_states: return_number_of_states.max(1), } } } impl RegularExpression { - pub fn get_number_of_states_in_nfa(&self) -> usize { + pub(crate) fn get_number_of_states_in_nfa(&self) -> usize { self.evaluate_number_of_states_in_nfa().number_of_states } fn evaluate_number_of_states_in_nfa(&self) -> AbstractNFAMetadata { match self { - RegularExpression::Character(_) => AbstractNFAMetadata::new(), + RegularExpression::Character(..) 
=> AbstractNFAMetadata::new(), RegularExpression::Repetition(regex, min, max_opt) => regex .evaluate_number_of_states_in_nfa() .repeat(*min, max_opt), @@ -222,6 +265,13 @@ mod tests { assert_number_of_states_in_nfa("(b*a){5,26}"); assert_number_of_states_in_nfa("(ba*){5,26}"); + // Unbounded with min >= 1 over a self-looping start (r{min,} = rᵐⁱⁿ·r*). + assert_number_of_states_in_nfa("(b*a){1,}"); + assert_number_of_states_in_nfa("(b*a){2,}"); + assert_number_of_states_in_nfa("(b*a){5,}"); + assert_number_of_states_in_nfa("(a*b){1,}"); + assert_number_of_states_in_nfa("(a*b){3,}"); + assert_number_of_states_in_nfa(""); assert_number_of_states_in_nfa("toto"); assert_number_of_states_in_nfa("A+B*"); @@ -261,10 +311,48 @@ mod tests { assert_number_of_states_in_nfa("q(ab|ca|ab|abc)x"); assert_number_of_states_in_nfa("a*(aad|ads|a)abc.*def.*ghi"); - assert_number_of_states_in_nfa("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}"); + assert_number_of_states_in_nfa( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + ); Ok(()) } + // Regression: directly-constructed (unsimplified) repetitions over {""} + // sub-expressions (shapes the string parser simplifies away but any user + // of the public enum can build) used to drive the abstract state count + // to zero, after which the merge discounts underflowed and panicked. + #[test] + fn degenerate_repetitions_do_not_underflow() { + use std::collections::VecDeque; + + let atom = RegularExpression::new("a").unwrap(); + // a{0,0} denotes {""} without being the canonical empty-string form. 
+ let empty_string = RegularExpression::Repetition(Box::new(atom), 0, Some(0)); + let star_of_alternation = RegularExpression::Repetition( + Box::new(RegularExpression::Alternation(vec![ + empty_string.clone(), + empty_string.clone(), + ])), + 0, + None, + ); + let star_of_concat = RegularExpression::Repetition( + Box::new(RegularExpression::Concat(VecDeque::from([ + empty_string.clone(), + RegularExpression::Repetition(Box::new(empty_string), 0, None), + ]))), + 0, + None, + ); + + for regex in [star_of_alternation, star_of_concat] { + let estimate = regex.get_number_of_states_in_nfa(); + assert!(estimate >= 1, "state estimate of {regex} must be >= 1"); + let automaton = regex.to_automaton().unwrap(); + assert!(automaton.number_of_states() >= 1); + } + } + fn assert_number_of_states_in_nfa(regex: &str) { println!("{}", regex); let regex = RegularExpression::new(regex).unwrap(); @@ -275,6 +363,6 @@ mod tests { let automaton = regex.to_automaton().unwrap(); - assert_eq!(automaton.get_number_of_states(), number_of_states_in_nfa); + assert_eq!(automaton.number_of_states(), number_of_states_in_nfa); } } diff --git a/src/regex/builder.rs b/src/regex/builder.rs index e8a354f..46367c7 100644 --- a/src/regex/builder.rs +++ b/src/regex/builder.rs @@ -1,54 +1,88 @@ -use ::regex::Regex; -use lazy_static::lazy_static; use regex_charclass::irange::range::AnyRange; use regex_syntax::ParserBuilder; use super::*; -lazy_static! { - static ref RE_FLAG_DETECTION: Regex = - Regex::new(r"\(\?[imsx]*-?[imsx]*\)").expect("Can not compile flag detection regex."); -} - impl RegularExpression { - pub fn new(regex: &str) -> Result { - if regex.is_empty() { + /// Parses and simplifies the provided pattern and returns the resulting [`RegularExpression`]. + pub fn new(pattern: &str) -> Result { + Self::parse(pattern, true) + } + + /// Parses the provided pattern and returns the resulting [`RegularExpression`]. If `simplify` is `true`, the expression is simplified during parsing. 
+ #[tracing::instrument(level = "debug", skip(pattern), fields(pattern_len = pattern.len()))] + pub fn parse(pattern: &str, simplify: bool) -> Result { + if pattern.is_empty() { return Ok(RegularExpression::new_empty_string()); } - if regex == "[]" { + if pattern == "[]" { return Ok(RegularExpression::new_empty()); } match ParserBuilder::new() .dot_matches_new_line(true) .build() - .parse(&Self::remove_flags(regex)) + .parse(&Self::remove_flags(pattern)) { - Ok(hir) => Self::convert_to_regex(&hir), + Ok(hir) => Self::convert_to_regex(&hir, simplify), Err(err) => Err(EngineError::RegexSyntaxError(err.to_string())), } } + /// Strips inline flag groups like `(?i)`, `(?m-s)` or `(?-s)` from the + /// pattern: the engine treats all characters uniformly, so the flags are + /// meaningless here. Equivalent to deleting every match of + /// `\(\?[imsx]*-?[imsx]*\)`; anything else (including non-capturing + /// groups `(?:...)`) is left untouched. fn remove_flags(regex: &str) -> String { - RE_FLAG_DETECTION.replace_all(regex, "").to_string() + let bytes = regex.as_bytes(); + let mut result = String::with_capacity(regex.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'(' && i + 1 < bytes.len() && bytes[i + 1] == b'?' { + let mut j = i + 2; + while j < bytes.len() && matches!(bytes[j], b'i' | b'm' | b's' | b'x') { + j += 1; + } + if j < bytes.len() && bytes[j] == b'-' { + j += 1; + while j < bytes.len() && matches!(bytes[j], b'i' | b'm' | b's' | b'x') { + j += 1; + } + } + if j < bytes.len() && bytes[j] == b')' { + // a flag group: skip it entirely + i = j + 1; + continue; + } + } + // not a flag group: copy the whole character (UTF-8 safe) + let char_len = regex[i..].chars().next().map(|c| c.len_utf8()).unwrap_or(1); + result.push_str(&regex[i..i + char_len]); + i += char_len; + } + result } + /// Creates a regular expression that matches all possible strings.
pub fn new_total() -> Self { RegularExpression::Repetition( - Box::new(RegularExpression::Character(Range::total())), + Box::new(RegularExpression::Character(CharRange::total())), 0, None, ) } + /// Creates a regular expression that matches the empty language. pub fn new_empty() -> Self { - RegularExpression::Character(Range::empty()) + RegularExpression::Character(CharRange::empty()) } + /// Creates a regular expression that matches only the empty string `""`. pub fn new_empty_string() -> Self { RegularExpression::Concat(VecDeque::new()) } - fn convert_to_regex(hir: &Hir) -> Result { + fn convert_to_regex(hir: &Hir, simplify: bool) -> Result { match hir.kind() { HirKind::Empty => Ok(RegularExpression::new_empty_string()), HirKind::Literal(literal) => { @@ -56,7 +90,7 @@ impl RegularExpression { if let Ok(string) = String::from_utf8(literal.0.clone().into_vec()) { for char in string.chars() { regex_concat = regex_concat.concat( - &RegularExpression::Character(Range::new_from_range( + &RegularExpression::Character(CharRange::new_from_range( Char::new(char)..=Char::new(char), )), true, @@ -80,15 +114,26 @@ impl RegularExpression { HirKind::Look(_) => Ok(RegularExpression::new_empty_string()), HirKind::Repetition(repetition) => { let (min, max) = (repetition.min, repetition.max); - Self::convert_to_regex(&repetition.sub).map(|v| v.repeat(min, max)) + let regex = Self::convert_to_regex(&repetition.sub, simplify)?; + Ok(if simplify { + regex.repeat(min, max) + } else { + RegularExpression::Repetition(Box::new(regex), min, max) + }) } - HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub), + HirKind::Capture(capture) => Self::convert_to_regex(&capture.sub, simplify), HirKind::Concat(concat) => { let mut concat_regex = RegularExpression::Concat(VecDeque::with_capacity(concat.len())); for c in concat { - let concat_value = Self::convert_to_regex(c)?; - concat_regex = concat_regex.concat(&concat_value, true); + let concat_value = Self::convert_to_regex(c, 
simplify)?; + if simplify { + concat_regex = concat_regex.concat(&concat_value, true); + } else if let RegularExpression::Concat(values) = concat_regex { + let mut values = values.clone(); + values.push_back(concat_value); + concat_regex = RegularExpression::Concat(values); + } } Ok(concat_regex) } @@ -96,32 +141,38 @@ impl RegularExpression { let mut alternation_regex = RegularExpression::Alternation(Vec::with_capacity(alternation.len())); for a in alternation { - let alternation_value = Self::convert_to_regex(a)?; - alternation_regex = alternation_regex.union(&alternation_value); + let alternation_value = Self::convert_to_regex(a, simplify)?; + if simplify { + alternation_regex = alternation_regex.union(&alternation_value); + } else if let RegularExpression::Alternation(values) = alternation_regex { + let mut values = values.clone(); + values.push(alternation_value); + alternation_regex = RegularExpression::Alternation(values); + } } Ok(alternation_regex) } } } - fn to_range_unicode(class_unicode: &ClassUnicode) -> Range { + fn to_range_unicode(class_unicode: &ClassUnicode) -> CharRange { let mut new_range = Vec::with_capacity(class_unicode.ranges().len()); for range in class_unicode.ranges() { new_range.push(AnyRange::from( Char::new(range.start())..=Char::new(range.end()), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } - fn to_range_bytes(class_bytes: &ClassBytes) -> Range { + fn to_range_bytes(class_bytes: &ClassBytes) -> CharRange { let mut new_range = Vec::with_capacity(class_bytes.ranges().len()); for range in class_bytes.ranges() { new_range.push(AnyRange::from( Char::new(range.start() as char)..=Char::new(range.end() as char), )); } - Range::new_from_ranges(&new_range) + CharRange::new_from_ranges(&new_range) } } @@ -129,6 +180,79 @@ impl RegularExpression { mod tests { use crate::regex::RegularExpression; + // The hand-rolled flag stripper must delete exactly the matches of + // `\(\?[imsx]*-?[imsx]*\)` (the regex 
it replaced) and nothing else. + #[test] + fn remove_flags_strips_flag_groups_only() { + let strip = RegularExpression::remove_flags; + + assert_eq!(strip("(?i)a"), "a"); + assert_eq!(strip("a(?m-s)b"), "ab"); + assert_eq!(strip("a(?-s)b"), "ab"); + assert_eq!(strip("(?imsx)(?)a(?i-)"), "a"); + + // Non-flag constructs are untouched. + assert_eq!(strip("(?:ab|c)d"), "(?:ab|c)d"); + assert_eq!(strip("(a?)b"), "(a?)b"); + assert_eq!(strip("a(?i-s"), "a(?i-s"); // unterminated: not a flag group + assert_eq!(strip("héllo(?i)é"), "hélloé"); // multi-byte safe + } + + // The variants are freely constructible (open enum); invalid bounds are + // rejected at the conversion boundary instead. + #[test] + fn to_automaton_rejects_invalid_repetition_bounds() { + use crate::error::EngineError; + + let a = RegularExpression::new("a").unwrap(); + let invalid = RegularExpression::Repetition(Box::new(a.clone()), 5, Some(2)); + assert_eq!( + invalid.to_automaton().unwrap_err(), + EngineError::InvalidRepetitionBounds(5, 2) + ); + + // Nested invalid repetitions are caught by the recursion. + let nested = RegularExpression::Concat([a.clone(), invalid].into()); + assert_eq!( + nested.to_automaton().unwrap_err(), + EngineError::InvalidRepetitionBounds(5, 2) + ); + + // The simplifying combinators must not panic on invalid trees either + // (regression: the affix factoring of `r{1,0}` used to underflow). + let degenerate = RegularExpression::Repetition(Box::new(a.clone()), 1, Some(0)); + let _ = a.union(&degenerate); + let _ = a.concat(&degenerate, true); + } + + // Regression (found by the proptest generators): singleton + // Alternation/Concat wrappers print transparently, so quantified + // expressions must be parenthesized by looking through them: + // `((.a))*` used to print as `.a*` instead of `(.a)*`, changing the + // language.
+ #[test] + fn display_parenthesizes_through_singleton_wrappers() { + use regex_charclass::char::Char; + + let dot = RegularExpression::Character(crate::CharRange::total()); + let a = RegularExpression::Character(crate::CharRange::new_from_range( + Char::new('a')..=Char::new('a'), + )); + let wrapped = + RegularExpression::Alternation(vec![RegularExpression::Concat([dot, a].into())]); + let star = RegularExpression::Repetition(Box::new(wrapped), 0, None); + assert_eq!(star.to_string(), "(.a)*"); + + // The printed pattern must denote the same language as the tree. + let reparsed = RegularExpression::parse(&star.to_string(), false).unwrap(); + assert!( + star.to_automaton() + .unwrap() + .equivalent(&reparsed.to_automaton().unwrap()) + .unwrap() + ); + } + #[test] fn test_parse() -> Result<(), String> { assert_parse("abc+"); @@ -246,24 +370,35 @@ mod tests { let regex_parsed = RegularExpression::new(".").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(automaton.match_string("\t")); - assert!(automaton.match_string("\n")); - assert!(automaton.match_string("\r")); + assert!(automaton.is_match("a")); + assert!(automaton.is_match("\t")); + assert!(automaton.is_match("\n")); + assert!(automaton.is_match("\r")); let regex_parsed = RegularExpression::new("(?i)a").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("a")); - assert!(!automaton.match_string("A")); + assert!(automaton.is_match("a")); + assert!(!automaton.is_match("A")); let regex_parsed = RegularExpression::new("a(?i)a(?-s).").unwrap(); let automaton = regex_parsed.to_automaton().unwrap(); - assert!(automaton.match_string("aa\n")); - assert!(!automaton.match_string("aAb")); + assert!(automaton.is_match("aa\n")); + assert!(!automaton.is_match("aAb")); assert!(RegularExpression::new("\\1").is_err()); + + let two_chars = RegularExpression::new("..") + .unwrap() + .to_automaton() + .unwrap(); + 
+ assert!(two_chars.is_match("aé")); + assert!(two_chars.is_match("éa")); + assert!(two_chars.is_match("éé")); + assert!(!two_chars.is_match("é")); + assert!(!two_chars.is_match("aéa")); + Ok(()) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 176612f..228e5e4 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,7 +1,6 @@ use std::{cmp, collections::VecDeque, fmt::Display}; -use crate::Range; -use execution_profile::ThreadLocalParams; +use crate::execution_profile::ExecutionProfile; use regex_charclass::CharacterClass; use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind}; @@ -12,15 +11,40 @@ use super::*; mod analyze; mod builder; mod operation; -#[cfg(feature = "serde")] -mod serializer; -/// Represent a regular expression. +/// Represents a regular expression. +/// +/// The variants are public and freely constructible and matchable. Values +/// can also be built with the parser ([`new`](Self::new) / +/// [`parse`](Self::parse)) or the simplifying combinators +/// ([`concat`](Self::concat), [`union`](Self::union), +/// [`repeat`](Self::repeat)). A directly-constructed repetition whose +/// maximum is below its minimum denotes no valid language and is rejected +/// with [`EngineError::InvalidRepetitionBounds`] when converted by +/// [`to_automaton`](Self::to_automaton). +/// +/// ``` +/// use regexsolver::regex::RegularExpression; +/// +/// let regex = RegularExpression::new("a{2,3}").unwrap(); +/// if let RegularExpression::Repetition(inner, min, max) = &regex { +/// assert_eq!((*min, *max), (2, Some(3))); +/// assert_eq!(inner.to_string(), "a"); +/// } +/// ``` #[derive(Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] +#[must_use = "regular expressions are immutable; operations return a new expression"] pub enum RegularExpression { - Character(Range), + /// A single character drawn from the given range; an empty range denotes + /// the empty language `[]`. + Character(CharRange), + /// `r{min,max}`; `None` means unbounded.
Expected invariant: `max >= min` + /// when bounded (checked by [`to_automaton`](Self::to_automaton)). Repetition(Box, u32, Option), + /// The concatenation of the parts in order; no parts denotes the empty + /// string `""`. Concat(VecDeque), + /// The union of the parts; no parts denotes the empty language `[]`. Alternation(Vec), } @@ -44,21 +68,17 @@ impl Display for RegularExpression { multiplicator_part = String::from("?"); } else if let Some(max) = max_opt { if max == min { - multiplicator_part = format!("{{{}}}", max); + multiplicator_part = format!("{{{max}}}"); } else { - multiplicator_part = format!("{{{},{}}}", min, max); + multiplicator_part = format!("{{{min},{max}}}"); } } else { - multiplicator_part = format!("{{{},}}", min); + multiplicator_part = format!("{{{min},}}"); } - match **regular_expression { - RegularExpression::Repetition(_, _, _) => { - format!("({}){}", regex_part, multiplicator_part) - } - RegularExpression::Concat(_) => { - format!("({}){}", regex_part, multiplicator_part) - } - _ => format!("{}{}", regex_part, multiplicator_part), + if RegularExpression::quantifier_needs_parens(regular_expression) { + format!("({regex_part}){multiplicator_part}") + } else { + format!("{regex_part}{multiplicator_part}") } } RegularExpression::Concat(concat) => { @@ -82,15 +102,42 @@ impl Display for RegularExpression { if alternation.len() == 1 { sb } else { - format!("({})", sb) + format!("({sb})") } } }; - write!(f, "{}", str) + write!(f, "{str}") } } impl RegularExpression { + /// Whether applying a quantifier to the printed form of `r` requires + /// wrapping it in a group. Singleton `Concat`/`Alternation` wrappers + /// print transparently, so the decision must look through them instead + /// of matching on the direct child's variant. + fn quantifier_needs_parens(r: &RegularExpression) -> bool { + match r { + // Prints as a single char or a [class]: one token. + RegularExpression::Character(..) => false, + RegularExpression::Repetition(..) 
=> true, + RegularExpression::Concat(parts) => match parts.len() { + 1 => Self::quantifier_needs_parens(&parts[0]), + // Covers both the empty concatenation (which prints as "" + // and needs the explicit group; `()*` is valid but a bare + // `*` is not) and real multi-part concatenations. + _ => true, + }, + RegularExpression::Alternation(parts) => match parts.len() { + // The empty alternation prints as "[]": one token. + 0 => false, + 1 => Self::quantifier_needs_parens(&parts[0]), + // Multi-part alternations print self-parenthesized. + _ => false, + }, + } + } + + /// Returns `true` if the regular expression matches the empty language. pub fn is_empty(&self) -> bool { match self { RegularExpression::Alternation(alternation) => alternation.is_empty(), @@ -99,6 +146,7 @@ impl RegularExpression { } } + /// Returns `true` if the regular expression matches only the empty string `""`. pub fn is_empty_string(&self) -> bool { match self { RegularExpression::Concat(concat) => concat.is_empty(), @@ -106,6 +154,7 @@ impl RegularExpression { } } + /// Returns `true` if the regular expression matches all possible strings. pub fn is_total(&self) -> bool { match self { RegularExpression::Repetition(regular_expression, min, max_opt) => { @@ -122,15 +171,23 @@ impl RegularExpression { } } + /// Converts the regular expression to an equivalent [`FastAutomaton`]. 
+ #[tracing::instrument(level = "trace", skip_all)] pub fn to_automaton(&self) -> Result { - if self.get_number_of_states_in_nfa() >= ThreadLocalParams::get_max_number_of_states() { - return Err(EngineError::AutomatonHasTooManyStates); - } + ExecutionProfile::get().assert_max_number_of_states(self.get_number_of_states_in_nfa())?; + match self { - RegularExpression::Character(range) => FastAutomaton::make_from_range(range), + RegularExpression::Character(range) => Ok(FastAutomaton::new_from_range(range)), RegularExpression::Repetition(regular_expression, min, max_opt) => { + // The variants are freely constructible; invalid bounds are + // rejected at this boundary instead. + if let Some(max) = max_opt + && max < min + { + return Err(EngineError::InvalidRepetitionBounds(*min, *max)); + } let mut automaton = regular_expression.to_automaton()?; - automaton.repeat(*min, *max_opt)?; + automaton.repeat_mut(*min, *max_opt)?; Ok(automaton) } RegularExpression::Concat(concat) => { @@ -138,15 +195,106 @@ impl RegularExpression { for c in concat.iter() { concats.push(c.to_automaton()?); } - FastAutomaton::concatenate(concats) + FastAutomaton::concat_all(&concats) } RegularExpression::Alternation(alternation) => { - let mut concats = Vec::with_capacity(alternation.len()); + let mut alternates = Vec::with_capacity(alternation.len()); for c in alternation.iter() { - concats.push(c.to_automaton()?); + alternates.push(c.to_automaton()?); + } + FastAutomaton::union_all(&alternates) + } + } + } + + /// Returns a heuristic score for the readability of the pattern. 
+ pub fn evaluate_complexity(&self) -> f64 { + let (score, depth, _) = self.eval_inner(); + score + Self::depth_penalty(depth) + } + + /// Returns: (score, max_depth, contains_repetition) + fn eval_inner(&self) -> (f64, usize, bool) { + match self { + RegularExpression::Character(range) => { + let len = range.to_regex().len() as f64; + // small, capped cost for raw length + let base = 1.0 + 0.05 * len.min(40.0); + (base, 1, false) + } + + RegularExpression::Repetition(inner, min, max_opt) => { + let (inner_score, inner_depth, inner_has_rep) = inner.eval_inner(); + + // multipliers tuned for readability impact + let mut m = match max_opt { + None => 1.6, + Some(max) if max > min => 1.3, + Some(max) if max == min && *min > 1 => 1.1, + _ => 1.0, + }; + + // nested quantifiers like (...+)+ are harder + if inner_has_rep { + m *= 1.5; } - FastAutomaton::alternation(concats) + + (inner_score * m, inner_depth + 1, true) } + + RegularExpression::Concat(items) => { + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for (i, it) in items.iter().enumerate() { + let (s, d, h) = it.eval_inner(); + sum += s; + if i > 0 { + // tiny discount: linear sequences are relatively easy to read + sum *= 0.98; + } + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + (sum, max_depth + 1, has_rep) + } + + RegularExpression::Alternation(branches) => { + if branches.is_empty() { + return (0.0, 1, false); + } + let mut sum = 0.0; + let mut max_depth = 0usize; + let mut has_rep = false; + + for b in branches { + let (s, d, h) = b.eval_inner(); + sum += s; + if d > max_depth { + max_depth = d; + } + has_rep |= h; + } + + // branching cost: more alternatives = harder to scan + let k = branches.len() as f64; + let multiplier = 1.0 + 0.15 * (k - 1.0); + + (sum * multiplier, max_depth + 1, has_rep) + } + } + } + + fn depth_penalty(depth: usize) -> f64 { + // no penalty up to depth 2, then quadratic growth + if depth <= 2 { + 0.0 + } else { + ((depth - 2) as 
f64).powi(2) * 0.8 } } } diff --git a/src/regex/operation/concat.rs b/src/regex/operation/concat.rs index 6907d9b..f32e5be 100644 --- a/src/regex/operation/concat.rs +++ b/src/regex/operation/concat.rs @@ -1,6 +1,22 @@ use super::*; impl RegularExpression { + /// Returns a regular expression that is the concatenation of all expressions in `regexes`. + #[tracing::instrument(level = "trace", skip_all)] + pub fn concat_all<'a, I: IntoIterator>( + regexes: I, + ) -> RegularExpression { + let mut result = RegularExpression::new_empty_string(); + + for other in regexes { + result = result.concat(other, true); + } + + result + } + + /// Returns a new regular expression representing the concatenation of `self` and `other`; `append_back` determines their order. + #[tracing::instrument(level = "trace", skip(self, other), fields(append_back = append_back))] pub fn concat(&self, other: &RegularExpression, append_back: bool) -> RegularExpression { if self.is_empty() || other.is_empty() { return RegularExpression::new_empty(); @@ -10,35 +26,19 @@ impl RegularExpression { return self.clone(); } - match (self, other) { - (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_concat_and_concat(self, other) - } else { - Self::opconcat_concat_and_concat(other, self) - } - } - (RegularExpression::Concat(_), _) => { - if append_back { - Self::opconcat_concat_and_other(self, other) - } else { - Self::opconcat_other_and_concat(other, self) - } - } - (_, RegularExpression::Concat(_)) => { - if append_back { - Self::opconcat_other_and_concat(self, other) - } else { - Self::opconcat_concat_and_other(other, self) - } - } - (_, _) => { - if append_back { - Self::opconcat_other_and_other(self, other) - } else { - Self::opconcat_other_and_other(other, self) - } + let (front, back) = if append_back { + (self, other) + } else { + (other, self) + }; + + match (front, back) { + (RegularExpression::Concat(..), RegularExpression::Concat(..)) => { + 
Self::opconcat_concat_and_concat(front, back) } + (RegularExpression::Concat(..), _) => Self::opconcat_concat_and_other(front, back), + (_, RegularExpression::Concat(..)) => Self::opconcat_other_and_concat(front, back), + (_, _) => Self::opconcat_other_and_other(front, back), } } @@ -69,12 +69,14 @@ impl RegularExpression { return merged; } - let mut vec = that_elements.clone(); - let that_index = 0; - - if let Some(merged) = Self::opconcat_can_be_merged(this, &that_elements[that_index]) { - vec[that_index] = merged; + // Clone the surviving elements only: the boundary element is + // either replaced by the merge or kept alongside `this`. + let mut vec: VecDeque; + if let Some(merged) = Self::opconcat_can_be_merged(this, &that_elements[0]) { + vec = that_elements.iter().skip(1).cloned().collect(); + vec.push_front(merged); } else { + vec = that_elements.clone(); vec.push_front(this.clone()); } @@ -101,12 +103,14 @@ impl RegularExpression { return merged; } - let mut vec = this_elements.clone(); + // Clone the surviving elements only (see opconcat_other_and_concat). let this_index = this_elements.len() - 1; - + let mut vec: VecDeque; if let Some(merged) = Self::opconcat_can_be_merged(&this_elements[this_index], that) { - vec[this_index] = merged; + vec = this_elements.iter().take(this_index).cloned().collect(); + vec.push_back(merged); } else { + vec = this_elements.clone(); vec.push_back(that.clone()); } @@ -139,15 +143,17 @@ impl RegularExpression { return merged; } - let mut vec = this_elements.clone(); + // Clone the surviving elements only (see opconcat_other_and_concat). 
let (this_index, that_index) = (this_elements.len() - 1, 0); - + let mut vec: VecDeque; if let Some(merged) = Self::opconcat_can_be_merged(&this_elements[this_index], &that_elements[that_index]) { - vec[this_index] = merged; + vec = this_elements.iter().take(this_index).cloned().collect(); + vec.push_back(merged); vec.extend(that_elements.iter().skip(1).cloned()); } else { + vec = this_elements.clone(); vec.extend(that_elements.iter().cloned()); } @@ -161,6 +167,24 @@ impl RegularExpression { } } + /// Merges the bounds of two adjacent repetitions of the same expression, + /// `r{a,b}r{c,d}` → `r{a+c,b+d}`. Returns `None` ("cannot be merged", + /// falling back to plain concatenation) when an addition would overflow. + fn merge_repetition_bounds( + this_min: u32, + this_max_opt: &Option, + that_min: u32, + that_max_opt: &Option, + ) -> Option<(u32, Option)> { + let new_min = this_min.checked_add(that_min)?; + let new_max_opt = if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { + Some(this_max.checked_add(*that_max)?) 
+ } else { + None + }; + Some((new_min, new_max_opt)) + } + fn opconcat_can_be_merged( this: &RegularExpression, that: &RegularExpression, @@ -171,24 +195,15 @@ impl RegularExpression { RegularExpression::Repetition(_, that_min, that_max_opt), ) = (this, that) { - let new_min = this_min + that_min; - let new_max_opt = - if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { - Some(this_max + that_max) - } else { - None - }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + let (new_min, new_max_opt) = Self::merge_repetition_bounds( + *this_min, + this_max_opt, + *that_min, + that_max_opt, + )?; + Some(this_regex.repeat(new_min, new_max_opt)) } else { - Some(RegularExpression::Repetition( - Box::new(this.clone()), - 2, - Some(2), - )) + Some(this.repeat(2, Some(2))) } } else if let ( RegularExpression::Repetition(this_regex, this_min, this_max_opt), @@ -196,53 +211,39 @@ impl RegularExpression { ) = (this, that) { if this_regex == that_regex { - let new_min = this_min + that_min; - let new_max_opt = - if let (Some(this_max), Some(that_max)) = (this_max_opt, that_max_opt) { - Some(this_max + that_max) - } else { - None - }; - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + let (new_min, new_max_opt) = Self::merge_repetition_bounds( + *this_min, + this_max_opt, + *that_min, + that_max_opt, + )?; + Some(this_regex.repeat(new_min, new_max_opt)) } else if let ( RegularExpression::Character(this_range), RegularExpression::Character(that_range), - ) = (*this_regex.clone(), *that_regex.clone()) + ) = (&**this_regex, &**that_regex) { - if this_range.contains_all(&that_range) && that_min == &0 && this_max_opt.is_none() - { - return Some(this.clone()); + if this_range.contains_all(that_range) && that_min == &0 && this_max_opt.is_none() { + Some(this.clone()) } else { - return None; + None } } else { - return None; + None } } else if let 
RegularExpression::Repetition(this_regex, this_min, this_max_opt) = this { if **this_regex == *that { - let new_min = this_min + 1; - let new_max_opt = this_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - this_regex.clone(), - new_min, - new_max_opt, - )) + let (new_min, new_max_opt) = + Self::merge_repetition_bounds(*this_min, this_max_opt, 1, &Some(1))?; + Some(this_regex.repeat(new_min, new_max_opt)) } else { None } } else if let RegularExpression::Repetition(that_regex, that_min, that_max_opt) = that { if **that_regex == *this { - let new_min = that_min + 1; - let new_max_opt = that_max_opt.as_ref().map(|this_max| this_max + 1); - Some(RegularExpression::Repetition( - that_regex.clone(), - new_min, - new_max_opt, - )) + let (new_min, new_max_opt) = + Self::merge_repetition_bounds(*that_min, that_max_opt, 1, &Some(1))?; + Some(that_regex.repeat(new_min, new_max_opt)) } else { None } @@ -256,6 +257,20 @@ impl RegularExpression { mod tests { use super::*; + // Regression: merging adjacent repetitions used to add bounds unchecked; + // huge (but valid) bounds must fall back to plain concatenation instead + // of overflowing. 
+ #[test] + fn concat_merge_bound_overflow_falls_back_to_concat() { + let a = RegularExpression::new("a").unwrap(); + let big = RegularExpression::Repetition(Box::new(a), u32::MAX, None); + let result = big.concat(&big, true); + assert!(matches!( + &result, + RegularExpression::Concat(parts) if parts.len() == 2 + )); + } + #[test] fn test_concat() -> Result<(), String> { assert_concat("xxx", "x{3}"); diff --git a/src/regex/operation/mod.rs b/src/regex/operation/mod.rs index 2baa587..0147fc9 100644 --- a/src/regex/operation/mod.rs +++ b/src/regex/operation/mod.rs @@ -1,236 +1,6 @@ use super::*; mod concat; +mod repeat; mod simplify; mod union; - -impl RegularExpression { - pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { - if self.is_total() { - return RegularExpression::new_total(); - } else if self.is_empty() { - return RegularExpression::new_empty(); - } else if self.is_empty_string() { - return Self::new_empty_string(); - } else if let Some(max) = max_opt { - if max < min || max == 0 { - return RegularExpression::new_empty_string(); - } else if min == 1 && max == 1 { - return self.clone(); - } - } - - match self { - RegularExpression::Repetition(regular_expression, o_min, o_max_opt) => { - let new_max = if let (Some(max), Some(o_max)) = (max_opt, o_max_opt) { - Some(max * o_max) - } else { - None - }; - - let o_min = *o_min; - if let Some(o_max) = o_max_opt { - let o_max = *o_max; - if o_min <= 1 || max_opt.is_some() && max_opt.unwrap() == min { - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - new_max, - ) - } else if o_min == o_max && o_min > 1 { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } else { - let r = ((o_max as f64) - 1f64) / ((o_max as f64) - (o_min as f64)); - if r > cmp::max(2, min) as f64 { - return RegularExpression::Repetition( - Box::new(self.clone()), - min, - max_opt, - ); - } - - RegularExpression::Repetition( - regular_expression.clone(), - min * o_min, - 
new_max, - ) - } - } else if o_max_opt.is_none() - || max_opt.is_some() && (max_opt.unwrap() == min || max_opt.unwrap() == 1) - || o_max_opt.is_some() && o_max_opt.unwrap() == 1 - || max_opt.is_none() && o_min == 0 - { - RegularExpression::Repetition(regular_expression.clone(), min * o_min, new_max) - } else { - RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) - } - } - _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), - } - } -} - -#[cfg(test)] -mod tests { - use regex_charclass::{char::Char, irange::RangeSet}; - - use crate::regex::RegularExpression; - - #[test] - fn test_parse_and_simplify() -> Result<(), String> { - assert_parse_and_simplify("(xxx)*", "(x{3})*"); - assert_parse_and_simplify("(x*){3}", "x*"); - assert_parse_and_simplify("(x+)?", "x*"); - assert_parse_and_simplify("(x?)+", "x*"); - assert_parse_and_simplify("(x{0,3})+", "x*"); - assert_parse_and_simplify("(x{2,3})+", "x{2,}"); - assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); - assert_parse_and_simplify("(x+)*", "x*"); - assert_parse_and_simplify(".*abc", ".*abc"); - assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); - assert_parse_and_simplify( - "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", - "a(bc(dg|fe)|mkv)*(abc){4,5}", - ); - assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); - assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); - assert_parse_and_simplify("(a|b)", "[ab]"); - assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); - assert_parse_and_simplify("(ab|ab)", "ab"); - assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); - assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); - assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); - assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); - assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); - assert_parse_and_simplify("((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", 
"(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}"); - Ok(()) - } - - fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { - let regex_parsed = RegularExpression::new(regex).unwrap(); - assert_eq!(regex_simplified, regex_parsed.to_string()); - } - - #[test] - fn test_repeat_simplify() -> Result<(), String> { - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 3, - Some(3), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(2), - 2, - Some(4), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(3), - 0, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 1, - Some(2), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 2, - Some(3), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 3, - Some(4), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 7, - Some(8), - 1, - None, - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - None, - 3, - Some(3), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 1, - None, - 0, - Some(1), - ); - - assert_repeat_simplify( - &RangeSet::new_from_range(Char::new('a')..=Char::new('a')), - 0, - Some(1), - 1, - None, - ); - - Ok(()) - } - - fn assert_repeat_simplify( - range: &RangeSet, - min1: u32, - max1: Option, - min2: u32, - max2: Option, - ) { - let repeat = RegularExpression::Repetition( - Box::new(RegularExpression::Repetition( - 
Box::new(RegularExpression::Character(range.clone())), - min1, - max1, - )), - min2, - max2, - ); - - let got = RegularExpression::new(&repeat.to_string()).unwrap(); - - println!("{} -> {}", repeat, got); - - let repeat = repeat.to_automaton().unwrap(); - - //repeat.to_dot(); - - let result = got.to_automaton().unwrap(); - - assert!(repeat.is_equivalent_of(&result).unwrap()); - } -} diff --git a/src/regex/operation/repeat.rs b/src/regex/operation/repeat.rs new file mode 100644 index 0000000..15da7e7 --- /dev/null +++ b/src/regex/operation/repeat.rs @@ -0,0 +1,294 @@ +use super::*; + +impl RegularExpression { + /// Computes the repetition of the expression between `min` and `max_opt` times; if `max_opt` is `None`, the repetition is unbounded. + /// + /// When `max_opt` is below `min` there is no valid repetition count and + /// the result is the empty language, consistently with + /// [`FastAutomaton::repeat`](crate::fast_automaton::FastAutomaton::repeat). + #[tracing::instrument(level = "trace", skip(self), fields(min = min, max_opt = tracing::field::debug(max_opt)))] + pub fn repeat(&self, min: u32, max_opt: Option) -> RegularExpression { + if self.is_total() { + return RegularExpression::new_total(); + } else if self.is_empty() { + return RegularExpression::new_empty(); + } else if self.is_empty_string() { + return Self::new_empty_string(); + } else if let Some(max) = max_opt { + if max < min { + // No valid repetition count: the language is empty. This + // matches `FastAutomaton::repeat`, which disagreed with the + // {""} previously returned here. 
+ return RegularExpression::new_empty(); + } else if max == 0 { + return RegularExpression::new_empty_string(); + } else if min == 1 && max == 1 { + return self.clone(); + } + } + + match self { + RegularExpression::Repetition(regular_expression, i_min, i_max_opt) => { + // Only collapse (r{i_min,i_max}){min,max} into + // r{min·i_min,max·i_max} when the bounds are gap-free AND the + // multiplications don't overflow; the nested form is always a + // correct fallback. + if Self::can_simplify_nested_repetition(*i_min, *i_max_opt, min, max_opt) { + let new_min = min.checked_mul(*i_min); + let new_max = match (max_opt, i_max_opt) { + (Some(o_max), Some(i_max)) => o_max.checked_mul(*i_max).map(Some), + _ => Some(None), + }; + if let (Some(new_min), Some(new_max)) = (new_min, new_max) { + return RegularExpression::Repetition( + regular_expression.clone(), + new_min, + new_max, + ); + } + } + RegularExpression::Repetition(Box::new(self.clone()), min, max_opt) + } + _ => RegularExpression::Repetition(Box::new(self.clone()), min, max_opt), + } + } + + /// Evaluate if the repetition `(r{i_min,i_max_opt}){o_min,o_max_opt}` can be simplified to `r{i_min*o_min,i_max_opt*o_max_opt}`. 
+ fn can_simplify_nested_repetition( + i_min: u32, + i_max_opt: Option, + o_min: u32, + o_max_opt: Option, + ) -> bool { + if let Some(o_max) = o_max_opt + && o_min == o_max + { + return true; + } + + if let Some(i_max) = i_max_opt { + // We check if there is any gap by resolving: + // o_min * i_max >= (o_min + 1) * i_min - 1 + // <=> o_min * (i_max - i_min) >= i_min - 1 + o_min.saturating_mul(i_max.saturating_sub(i_min)) >= i_min.saturating_sub(1) + } else if o_min > 0 { + true + } else { + i_min <= 1 + } + } +} + +#[cfg(test)] +mod tests { + + use regex_charclass::char::Char; + + use crate::{CharRange, regex::RegularExpression}; + + // Regression: the nested-repetition simplification used to multiply + // bounds unchecked; huge (but valid) bounds must fall back to the nested + // form instead of overflowing. + #[test] + fn repeat_bound_overflow_keeps_nested_form() { + let a = RegularExpression::new("a").unwrap(); + let inner = a.repeat(2, Some(2)); // a{2} + let outer = inner.repeat(u32::MAX, Some(u32::MAX)); // 2·u32::MAX overflows + assert!(matches!( + &outer, + RegularExpression::Repetition(r, u32::MAX, Some(u32::MAX)) + if matches!(&**r, RegularExpression::Repetition(..)) + )); + } + + // r{min,max} with max < min has no valid repetition count: the language + // is empty, consistently with `FastAutomaton::repeat` (the regex side + // used to return {""} instead). 
+ #[test] + fn repeat_with_max_below_min_is_empty() { + let a = RegularExpression::new("a").unwrap(); + assert!(a.repeat(5, Some(2)).is_empty()); + + let automaton = a.to_automaton().unwrap().repeat(5, Some(2)).unwrap(); + assert!(automaton.is_empty()); + } + + #[test] + fn test_parse_and_simplify() -> Result<(), String> { + assert_parse_and_simplify("(xxx)*", "(x{3})*"); + assert_parse_and_simplify("(x*){3}", "x*"); + assert_parse_and_simplify("(x+)?", "x*"); + assert_parse_and_simplify("(x?)+", "x*"); + assert_parse_and_simplify("(x{0,3})+", "x*"); + assert_parse_and_simplify("(x{2,3})+", "x{2,}"); + assert_parse_and_simplify("(x{7,9})+", "(x{7,9})+"); + assert_parse_and_simplify("(x+)*", "x*"); + assert_parse_and_simplify(".*abc", ".*abc"); + assert_parse_and_simplify(".*a(b|cd)", ".*a(b|cd)"); + assert_parse_and_simplify( + "a(bcfe|bcdg|mkv)*(abc){2,3}(abc){2}", + "a(bc(dg|fe)|mkv)*(abc){4,5}", + ); + assert_parse_and_simplify("((abc|fg)abc|(abc|fg)fg)", "(abc|fg){2}"); + assert_parse_and_simplify("(a{2}|a{3})", "a{2,3}"); + assert_parse_and_simplify("(a|b)", "[ab]"); + assert_parse_and_simplify("(ab|a|cd|b|ef)", "(b|ab?|cd|ef)"); + assert_parse_and_simplify("(ab|ab)", "ab"); + assert_parse_and_simplify("(ab)(ab)(ab)", "(ab){3}"); + assert_parse_and_simplify("aaaabbbbbccc", "a{4}b{5}c{3}"); + assert_parse_and_simplify("((ab))?(ab)(((ab)))((((ab)){3}))", "(ab){5,6}"); + assert_parse_and_simplify("(cd|ab)*(ab|cd)*", "(ab|cd)*"); + assert_parse_and_simplify(".*q(ab|ab|abc|ca)x", ".*q(abc?|ca)x"); + assert_parse_and_simplify( + "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q){1,100}", + "(q|(a|ads|a{2}d)*abc.*def.*uif(x|ads|a{2}d)*abc.*oxs.*def(ads|ax|a{2}d)*abc.*def.*ksd){1,100}", + ); + + assert_parse_and_simplify("(a{2,4}){2,4}", "a{4,16}"); + Ok(()) + } + + fn assert_parse_and_simplify(regex: &str, regex_simplified: &str) { + let regex_parsed = RegularExpression::new(regex).unwrap(); + assert_eq!(regex_simplified, 
regex_parsed.to_string()); + } + + #[test] + fn test_repeat_simplify() -> Result<(), String> { + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(2), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(3), + 0, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + Some(2), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 3, + Some(4), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 7, + Some(8), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + None, + 3, + Some(3), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 1, + None, + 0, + Some(1), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 0, + Some(1), + 1, + None, + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(4), + 2, + Some(4), + ); + + assert_repeat_simplify( + &CharRange::new_from_range(Char::new('a')..=Char::new('a')), + 2, + Some(3), + 2, + Some(2), + ); + + Ok(()) + } + + fn assert_repeat_simplify( + range: &CharRange, + min1: u32, + max1: Option, + min2: u32, + max2: Option, + ) { + let repeat = RegularExpression::Repetition( + Box::new(RegularExpression::Repetition( + 
Box::new(RegularExpression::Character(range.clone())), + min1, + max1, + )), + min2, + max2, + ); + + let got = RegularExpression::new(&repeat.to_string()).unwrap(); + + println!("{} -> {}", repeat, got); + + let repeat = repeat.to_automaton().unwrap(); + + //repeat.to_dot(); + + let result = got.to_automaton().unwrap(); + + assert!(repeat.equivalent(&result).unwrap()); + } +} diff --git a/src/regex/operation/simplify.rs b/src/regex/operation/simplify.rs index ae87087..be02823 100644 --- a/src/regex/operation/simplify.rs +++ b/src/regex/operation/simplify.rs @@ -1,32 +1,18 @@ use super::*; impl RegularExpression { + /// Returns a simplified version by eliminating redundant constructs and applying canonical reductions. + #[tracing::instrument(level = "trace", skip_all)] pub fn simplify(&self) -> Self { match self { - RegularExpression::Character(_) => self.clone(), + RegularExpression::Character(..) => self.clone(), RegularExpression::Repetition(regex, min, max_opt) => { - let regex = regex.simplify(); - match regex { - RegularExpression::Repetition( - simplified_regex, - simplified_min, - simplified_max_opt, - ) => { - let new_max = if let (Some(max), Some(simplified_max)) = - (max_opt, simplified_max_opt) - { - Some(max * simplified_max) - } else { - None - }; - RegularExpression::Repetition( - simplified_regex, - min * simplified_min, - new_max, - ) - } - _ => RegularExpression::Repetition(Box::new(regex), *min, *max_opt), - } + // Delegate to `repeat`, which guards the nested-repetition + // collapse with `can_simplify_nested_repetition`. Collapsing + // `(r{a,b}){c,d}` to `r{a*c,b*d}` unconditionally is unsound + // when the step lengths leave a gap (e.g. `(a{3,4}){1,2}` + // would wrongly widen to `a{3,8}`). 
+ regex.simplify().repeat(*min, *max_opt) } RegularExpression::Concat(elements) => { let elements: VecDeque<_> = diff --git a/src/regex/operation/union.rs b/src/regex/operation/union.rs index 8f5c1ae..bcc6e37 100644 --- a/src/regex/operation/union.rs +++ b/src/regex/operation/union.rs @@ -3,74 +3,98 @@ use std::collections::BTreeSet; use super::*; impl RegularExpression { + /// Returns a regular expression matching the union of `self` and `other`. + #[tracing::instrument(level = "trace", skip_all)] pub fn union(&self, other: &RegularExpression) -> RegularExpression { + Self::union_all([self, other]) + } + + /// Returns a regular expression that is the union of all expressions in `regexes`. + #[tracing::instrument(level = "trace", skip_all)] + pub fn union_all<'a, I: IntoIterator>( + regexes: I, + ) -> RegularExpression { + let mut result: Cow<'a, RegularExpression> = Cow::Owned(RegularExpression::new_empty()); + + for other in regexes { + result = result.union_(other); + + if result.is_total() { + break; + } + } + + result.into_owned() + } + + fn union_<'a>(&self, other: &'a RegularExpression) -> Cow<'a, RegularExpression> { if self.is_total() || other.is_total() { - return RegularExpression::new_total(); + return Cow::Owned(RegularExpression::new_total()); } else if self.is_empty() { - return other.clone(); + return Cow::Borrowed(other); } else if other.is_empty() || self == other { - return self.clone(); + return Cow::Owned(self.clone()); } else if other.is_empty_string() { - return self.clone().repeat(0, Some(1)); + return Cow::Owned(self.repeat(0, Some(1))); } else if self.is_empty_string() { - return other.clone().repeat(0, Some(1)); + return Cow::Owned(other.repeat(0, Some(1))); } - match (self, other) { + Cow::Owned(match (self, other) { ( RegularExpression::Character(self_range), RegularExpression::Character(other_range), ) => RegularExpression::Character(self_range.union(other_range)), - (RegularExpression::Character(_), RegularExpression::Repetition(_, _, 
_)) => { + (RegularExpression::Character(..), RegularExpression::Repetition(..)) => { Self::opunion_character_and_repetition(self, other) } - (RegularExpression::Character(_), RegularExpression::Concat(_)) => { + (RegularExpression::Character(..), RegularExpression::Concat(..)) => { Self::opunion_character_and_concat(self, other) } - (RegularExpression::Character(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Character(..), RegularExpression::Alternation(..)) => { Self::opunion_character_and_alternation(self, other) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Character(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Character(..)) => { Self::opunion_character_and_repetition(other, self) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Repetition(..), RegularExpression::Repetition(..)) => { Self::opunion_repetition_and_repetition(self, other) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Concat(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Concat(..)) => { Self::opunion_concat_and_repetition(other, self) } - (RegularExpression::Repetition(_, _, _), RegularExpression::Alternation(_)) => { + (RegularExpression::Repetition(..), RegularExpression::Alternation(..)) => { Self::opunion_repetition_and_alternation(self, other) } - (RegularExpression::Concat(_), RegularExpression::Character(_)) => { + (RegularExpression::Concat(..), RegularExpression::Character(..)) => { Self::opunion_character_and_concat(other, self) } - (RegularExpression::Concat(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Concat(..), RegularExpression::Repetition(..)) => { Self::opunion_concat_and_repetition(self, other) } - (RegularExpression::Concat(_), RegularExpression::Concat(_)) => { + (RegularExpression::Concat(..), RegularExpression::Concat(..)) => { Self::opunion_common_affixes(self, other) } - 
(RegularExpression::Concat(_), RegularExpression::Alternation(_)) => { + (RegularExpression::Concat(..), RegularExpression::Alternation(..)) => { Self::opunion_concat_and_alternation(self, other) } - (RegularExpression::Alternation(_), RegularExpression::Character(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Character(..)) => { Self::opunion_character_and_alternation(other, self) } - (RegularExpression::Alternation(_), RegularExpression::Repetition(_, _, _)) => { + (RegularExpression::Alternation(..), RegularExpression::Repetition(..)) => { Self::opunion_repetition_and_alternation(other, self) } - (RegularExpression::Alternation(_), RegularExpression::Concat(_)) => { + (RegularExpression::Alternation(..), RegularExpression::Concat(..)) => { Self::opunion_concat_and_alternation(other, self) } - (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(_)) => { - let mut new_alternation = other.clone(); + (RegularExpression::Alternation(self_elements), RegularExpression::Alternation(..)) => { + let mut new_alternation = Cow::Borrowed(other); for self_element in self_elements { - new_alternation = new_alternation.union(self_element); + new_alternation = new_alternation.union_(self_element); } - new_alternation + new_alternation.into_owned() } - } + }) } fn opunion_character_and_repetition( @@ -78,26 +102,19 @@ impl RegularExpression { that_repetition: &RegularExpression, ) -> RegularExpression { if let ( - RegularExpression::Character(_), + RegularExpression::Character(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_character, that_repetition) { if this_character == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { let mut alternate = vec![this_character.clone(), that_repetition.clone()]; alternate.sort_unstable(); 
RegularExpression::Alternation(alternate) } } else { - panic!( - "Not character and repetition {:?} {:?}", - this_character, that_repetition - ) + panic!("Not character and repetition {this_character:?} {that_repetition:?}") } } @@ -116,17 +133,17 @@ impl RegularExpression { if prefix.is_none() && suffix.is_none() { let mut alternate_elements = vec![self_regex, other_regex]; alternate_elements.sort_unstable(); - RegularExpression::Alternation(alternate_elements) + Cow::Owned(RegularExpression::Alternation(alternate_elements)) } else { - self_regex.union(&other_regex) + self_regex.union_(&other_regex) } } else { - RegularExpression::Repetition(Box::new(self_regex), 0, Some(1)) + Cow::Owned(self_regex.repeat(0, Some(1))) } } else if !other_regex.is_empty_string() { - RegularExpression::Repetition(Box::new(other_regex), 0, Some(1)) + Cow::Owned(other_regex.repeat(0, Some(1))) } else { - RegularExpression::new_empty_string() + Cow::Owned(RegularExpression::new_empty_string()) }; regex = regex.concat(&regex_from_alternate, true); @@ -153,10 +170,10 @@ impl RegularExpression { if let RegularExpression::Character(range) = element { set.insert(RegularExpression::Character(this_range.union(range))); had_character_union = true; - } else if matches!(element, RegularExpression::Repetition(_, _, _)) { + } else if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_character_and_repetition(this_character, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_character_union = true; } else { @@ -179,7 +196,7 @@ impl RegularExpression { this_character: &RegularExpression, that_concat: &RegularExpression, ) -> RegularExpression { - if let (RegularExpression::Character(_), RegularExpression::Concat(that_elements)) = + if let (RegularExpression::Character(..), RegularExpression::Concat(that_elements)) = (this_character, that_concat) { if
that_elements.len() == 1 && that_elements[0] == *this_character { @@ -197,16 +214,12 @@ impl RegularExpression { that_repetition: &RegularExpression, ) -> RegularExpression { if let ( - RegularExpression::Concat(_), + RegularExpression::Concat(..), RegularExpression::Repetition(that_regex, that_min, that_max_opt), ) = (this_concat, that_repetition) { if this_concat == &**that_regex && *that_min <= 2 { - RegularExpression::Repetition( - that_regex.clone(), - cmp::min(1, *that_min), - *that_max_opt, - ) + that_regex.repeat(cmp::min(1, *that_min), *that_max_opt) } else { Self::opunion_common_affixes(this_concat, that_repetition) } @@ -219,16 +232,16 @@ impl RegularExpression { this_concat: &RegularExpression, that_alternation: &RegularExpression, ) -> RegularExpression { - if let (RegularExpression::Concat(_), RegularExpression::Alternation(that_elements)) = + if let (RegularExpression::Concat(..), RegularExpression::Alternation(that_elements)) = (this_concat, that_alternation) { let mut set = BTreeSet::new(); let mut had_concat_union = false; for element in that_elements { - if matches!(element, RegularExpression::Repetition(_, _, _)) { + if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_concat_and_repetition(this_concat, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_concat_union = true; } else { @@ -262,18 +275,26 @@ impl RegularExpression { || this_max + 1 == *that_min || that_max + 1 == *this_min { - return RegularExpression::Repetition( - this_regex.clone(), + return this_regex.repeat( cmp::min(*this_min, *that_min), Some(cmp::max(*this_max, *that_max)), ); } } else { - return RegularExpression::Repetition( - this_regex.clone(), - cmp::min(*this_min, *that_min), - None, - ); + // At least one side is unbounded. 
The union collapses to + // r{min(m1,m2),} only when the ranges overlap or are + // adjacent (i.e. the unbounded side starts no later than + // one past the bounded side's end). Otherwise there is a + // gap (e.g. a? ∪ a{3,} must NOT become a*). + let mergeable = match (this_max_opt, that_max_opt) { + (None, None) => true, + (Some(this_max), None) => *that_min <= this_max.saturating_add(1), + (None, Some(that_max)) => *this_min <= that_max.saturating_add(1), + (Some(_), Some(_)) => unreachable!("handled above"), + }; + if mergeable { + return this_regex.repeat(cmp::min(*this_min, *that_min), None); + } } } @@ -295,38 +316,34 @@ impl RegularExpression { ) = (this_repetition, that_alternation) { if that_alternation == &**this_regex && *this_min <= 2 { - RegularExpression::Repetition( - this_regex.clone(), - cmp::min(1, *this_min), - *this_max_opt, - ) + this_regex.repeat(cmp::min(1, *this_min), *this_max_opt) } else { let mut set = BTreeSet::new(); let mut had_repetition_union = false; for element in that_elements { - if matches!(element, RegularExpression::Repetition(_, _, _)) { + if matches!(element, RegularExpression::Repetition(..)) { let repetition = Self::opunion_repetition_and_repetition(this_repetition, element); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { set.insert(element.clone()); } - } else if matches!(element, RegularExpression::Character(_)) { + } else if matches!(element, RegularExpression::Character(..)) { let repetition = Self::opunion_character_and_repetition(element, this_repetition); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { set.insert(element.clone()); } - } else if matches!(element, RegularExpression::Concat(_)) { + } else if matches!(element, 
RegularExpression::Concat(..)) { let repetition = Self::opunion_concat_and_repetition(element, this_repetition); - if matches!(repetition, RegularExpression::Repetition(_, _, _)) { + if matches!(repetition, RegularExpression::Repetition(..)) { set.insert(repetition); had_repetition_union = true; } else { @@ -351,9 +368,39 @@ impl RegularExpression { mod tests { use super::*; + // Regression: with an unbounded side, the repetition merge used to fire + // unconditionally, so `a? ∪ a{3,}` collapsed to `a*` even though `a{2}` is + // in neither operand. The merge is only sound when the unbounded range + // starts no later than one past the bounded range's end. + #[test] + fn union_does_not_merge_gapped_repetitions() { + let union = |x: &str, y: &str| { + RegularExpression::parse(x, false) + .unwrap() + .union(&RegularExpression::parse(y, false).unwrap()) + .to_string() + }; + + // Gapped: must stay alternations. + assert_eq!("(a?|a{3,})", union("a?", "a{3,}")); + assert_eq!("(a?|a{3,})", union("a{3,}", "a?")); + assert_eq!("(a{2}|a{5,})", union("a{2}", "a{5,}")); + + // Overlapping or adjacent: still merge. 
+ assert_eq!("a*", union("a?", "a{2,}")); + assert_eq!("a{2,}", union("a{2}", "a{3,}")); + assert_eq!("a*", union("a*", "a{3,}")); + assert_eq!("a{3,}", union("a{3,}", "a{5,}")); + } + #[test] fn test_union() -> Result<(), String> { assert_union("(a+|a+b)", "a+b?"); + assert_union("(a+|a*)", "a*"); + assert_union("(a?|a{0,2})", "a{0,2}"); + assert_union("(a{2,4}|a{1,3})", "a{1,4}"); + assert_union("(a{1,2}|a{3,4})", "a{1,4}"); + assert_union("(a{3,4}|a{1,2})", "a{1,4}"); Ok(()) } diff --git a/src/regex/serializer.rs b/src/regex/serializer.rs deleted file mode 100644 index 83fd99f..0000000 --- a/src/regex/serializer.rs +++ /dev/null @@ -1,28 +0,0 @@ -use serde::{de, Deserializer, Serializer}; - -use super::*; - -impl serde::Serialize for RegularExpression { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - serializer.serialize_str(&self.to_string()) - } -} - -impl<'de> serde::Deserialize<'de> for RegularExpression { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let regex_string = match String::deserialize(deserializer) { - Ok(str) => str, - Err(err) => return Err(err), - }; - match RegularExpression::new(®ex_string) { - Ok(regex) => Ok(regex), - Err(err) => Err(de::Error::custom(err.to_string())), - } - } -} diff --git a/src/tokenizer/embed_automaton.rs b/src/tokenizer/embed_automaton.rs deleted file mode 100644 index 79697dd..0000000 --- a/src/tokenizer/embed_automaton.rs +++ /dev/null @@ -1,213 +0,0 @@ -use token::TokenError; - -use crate::{error::EngineError, fast_automaton::condition::Condition}; - -use self::token::range_token::RangeToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_embedding(&self) -> Vec { - let mut vec = vec![]; - - let mut worklist = VecDeque::new(); - let mut seen = IntSet::default(); - - worklist.push_front(self.automaton.get_start_state()); - - while let Some(current_state) = worklist.pop_back() { - if !vec.is_empty() { - // separator - 
vec.push(AutomatonToken::SeparatorState) - } - seen.insert(current_state); - - // state - let embedded_state = - AutomatonToken::State(*self.state_to_token.get(¤t_state).unwrap()); - vec.push(embedded_state); - - if self.automaton.is_accepted(¤t_state) { - // accept state - vec.push(AutomatonToken::AcceptState) - } - - for (to_state, condition) in self - .automaton - .transitions_from_state_enumerate_iter(¤t_state) - { - if condition.is_empty() { - continue; - } - let embedded_state = - AutomatonToken::State(*self.state_to_token.get(to_state).unwrap()); - vec.push(embedded_state); - - if condition.is_total() { - vec.push(AutomatonToken::Range(RangeToken::Total)); - } else { - let range = condition - .to_range(self.automaton.get_spanning_set()) - .expect("It should be possible to convert the condition to range."); - self.range_tokenizer - .range_to_embedding(&range) - .unwrap() - .iter() - .for_each(|&e| { - vec.push(AutomatonToken::Range(e)); - }); - } - - if !seen.contains(to_state) { - worklist.push_front(*to_state); - } - } - } - - vec - } - - pub fn from_embedding(&self, vec: &Vec) -> Result { - let mut automaton = FastAutomaton::new_empty(); - automaton.apply_new_spanning_set(self.automaton.get_spanning_set())?; - - let mut from_state = None; - let mut to_state = None; - let mut range = Range::empty(); - for token in vec { - match token { - AutomatonToken::Range(r) => { - range = range.union(self.range_tokenizer.token_to_range(r).unwrap()); - } - AutomatonToken::State(s) => { - while !automaton.has_state((*s).into()) { - automaton.new_state(); - } - if let Some(fs) = from_state { - if let Some(ts) = to_state { - Self::apply_transition(&mut automaton, fs, ts, &range)?; - range = Range::empty(); - } - to_state = Some((*s).into()); - } else { - from_state = Some((*s).into()); - } - } - AutomatonToken::AcceptState => { - automaton.accept(from_state.unwrap()); - } - AutomatonToken::SeparatorState => { - if let Some(to_state) = to_state { - Self::apply_transition( - 
&mut automaton, - from_state.unwrap(), - to_state, - &range, - )?; - } - from_state = None; - to_state = None; - range = Range::empty(); - } - _ => return Err(EngineError::TokenError(TokenError::UnknownToken)), - }; - } - if let Some(to_state) = to_state { - Self::apply_transition(&mut automaton, from_state.unwrap(), to_state, &range)?; - } - Ok(automaton) - } - - fn apply_transition( - automaton: &mut FastAutomaton, - from_state: State, - to_state: State, - range: &Range, - ) -> Result<(), EngineError> { - let condition = Condition::from_range(range, automaton.get_spanning_set())?; - automaton.add_transition_to(from_state, to_state, &condition); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use embed_automaton::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion_for_fair_and_ai("(a|b)"); - assert_embedding_convertion_for_fair_and_ai("(|a)"); - assert_embedding_convertion_for_fair_and_ai(".*ab"); - assert_embedding_convertion_for_fair_and_ai("toto"); - assert_embedding_convertion_for_fair_and_ai(".{2,3}"); - assert_embedding_convertion_for_fair_and_ai("q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair_and_ai(".*q(ab|ca|ab|abc)x"); - assert_embedding_convertion_for_fair( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - ); - assert_embedding_convertion_for_fair("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"); - - Ok(()) - } - - fn assert_embedding_convertion_for_fair(regex: 
&str) { - assert_embedding_convertion(regex, true); - } - - fn assert_embedding_convertion_for_fair_and_ai(regex: &str) { - assert_embedding_convertion(regex, false); - } - - fn assert_embedding_convertion(regex: &str, ignore_ai: bool) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_embedding(); - - // FAIR - let embedding_u16 = AutomatonToken::to_fair_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u16 - .iter() - .map(|&t| AutomatonToken::from_fair_token(t)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); - - if !ignore_ai { - // AI - let embedding_u8 = AutomatonToken::to_ai_tokens(&embedding).unwrap(); - let embedding: Vec = embedding_u8 - .iter() - .map(|&t| AutomatonToken::from_ai_token(t)) - .collect(); - - let unembedded_automaton = tokenizer.from_embedding(&embedding).unwrap(); - - assert!(automaton - .subtraction(&unembedded_automaton) - .unwrap() - .is_empty()); - assert!(unembedded_automaton - .subtraction(&automaton) - .unwrap() - .is_empty()); - } - } -} diff --git a/src/tokenizer/embed_regex.rs b/src/tokenizer/embed_regex.rs deleted file mode 100644 index cb581e6..0000000 --- a/src/tokenizer/embed_regex.rs +++ /dev/null @@ -1,307 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_token::RegexToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_embedding(&self, regex: &RegularExpression) -> Vec { - let mut vec = self.to_regex_embedding_vec(regex); - - Self::append_counter_if_necessary(&mut vec); - - vec - } - - fn append_counter_if_necessary(vec: &mut Vec) { - if let Some(last) = 
vec.last() { - match last { - RegexToken::RepetitionNone => {} - RegexToken::Repetition(_) => {} - RegexToken::EndGroup => {} - RegexToken::StartGroup => {} - RegexToken::Alternation => {} - RegexToken::Error => todo!(), - _ => { - vec.push(RegexToken::Repetition(1)); - } - }; - } - } - - fn to_regex_embedding_vec(&self, regex: &RegularExpression) -> Vec { - let mut vec = vec![]; - - match regex { - RegularExpression::Character(range) => { - self.range_tokenizer - .range_to_embedding(range) - .unwrap() - .into_iter() - .for_each(|t| vec.push(RegexToken::Range(t))); - } - RegularExpression::Repetition(regex, min, max_opt) => { - if matches!( - **regex, - RegularExpression::Repetition(_, _, _) | RegularExpression::Concat(_) - ) { - vec.push(RegexToken::StartGroup); - vec.extend(self.to_regex_embedding_vec(regex)); - vec.push(RegexToken::EndGroup); - } else { - vec.extend(self.to_regex_embedding_vec(regex)); - } - - vec.push(RegexToken::Repetition(*min as u16)); - - if let Some(max) = max_opt { - if max != min { - vec.push(RegexToken::Repetition(*max as u16)); - } - } else { - vec.push(RegexToken::RepetitionNone); - } - } - RegularExpression::Concat(elements) => { - for element in elements { - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - } - } - RegularExpression::Alternation(elements) => { - vec.push(RegexToken::StartGroup); - - for i in 0..elements.len() { - let element = &elements[i]; - vec.extend(self.to_regex_embedding_vec(element)); - Self::append_counter_if_necessary(&mut vec); - if i < elements.len() - 1 { - vec.push(RegexToken::Alternation); - } - } - - vec.push(RegexToken::EndGroup); - } - } - - vec - } - - pub fn from_regex_embedding( - &self, - vec: &[RegexToken], - ) -> Result { - let mut regex_groups = vec![(RegularExpression::new_empty_string(), false)]; - let mut current_range: Option = None; - let mut current_min = None; - for i in 0..vec.len() { - let token = vec[i]; - let current_group = 
regex_groups.len() - 1; - match token { - RegexToken::Range(range_token) => { - let range = self.range_tokenizer.token_to_range(&range_token).unwrap(); - if let Some(curr_range) = ¤t_range { - current_range = Some(curr_range.union(range)); - } else { - current_range = Some(range.clone()); - } - } - RegexToken::StartGroup => { - regex_groups.push((RegularExpression::new_empty_string(), false)); - } - RegexToken::EndGroup => { - if current_group == 0 { - return Err(TokenError::SyntaxError); - } - if i == vec.len() - 1 || !matches!(vec[i + 1], RegexToken::Repetition(_)) { - let alternation: bool = regex_groups[current_group].1; - Self::pop_regex_group(&mut regex_groups, &None, &None); - if alternation { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - } - } - RegexToken::Alternation => { - if regex_groups[current_group].1 { - Self::pop_regex_group(&mut regex_groups, &None, &None); - } - regex_groups.push((RegularExpression::new_empty_string(), true)); - } - RegexToken::RepetitionNone => { - if current_min.is_some() { - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - ¤t_min, - &None, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, ¤t_min, &None); - } - current_min = None; - } else { - return Err(TokenError::SyntaxError); - } - } - RegexToken::Repetition(count) => { - if current_min.is_some() - || i == vec.len() - 1 - || !matches!(vec[i + 1], RegexToken::Repetition(_)) - && !matches!(vec[i + 1], RegexToken::RepetitionNone) - { - let min; - let max; - if current_min.is_some() { - min = current_min; - max = Some(count as u32); - } else { - min = Some(count as u32); - max = Some(count as u32); - } - if let Some(range) = ¤t_range { - Self::add_regex( - &mut regex_groups, - &min, - &max, - &RegularExpression::Character(range.clone()), - false, - ); - current_range = None; - } else { - Self::pop_regex_group(&mut regex_groups, &min, &max); - } - 
current_min = None; - } else { - current_min = Some(count as u32); - } - } - _ => return Err(TokenError::UnknownToken), - }; - } - - Ok(regex_groups[0].0.clone()) - } - - fn pop_regex_group( - regex_groups: &mut Vec<(RegularExpression, bool)>, - current_min: &Option, - current_max: &Option, - ) -> bool { - if regex_groups.len() <= 1 { - return false; - } - - let popped_group = regex_groups.pop().unwrap(); - Self::add_regex( - regex_groups, - current_min, - current_max, - &popped_group.0, - popped_group.1, - ); - true - } - - fn add_regex( - regex_groups: &mut [(RegularExpression, bool)], - current_min: &Option, - current_max: &Option, - regex: &RegularExpression, - alternation: bool, - ) { - let current_group = regex_groups.len() - 1; - let regex_to_use = if let Some(min) = current_min { - if min == &1 && current_max.is_some() { - if current_max.unwrap() == 1 { - regex.clone() - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - RegularExpression::Repetition(Box::new(regex.clone()), *min, *current_max) - } - } else { - regex.clone() - }; - - if alternation { - regex_groups[current_group].0 = regex_groups[current_group].0.union(®ex_to_use); - } else { - regex_groups[current_group].0 = - regex_groups[current_group].0.concat(®ex_to_use, true); - } - } -} - -#[cfg(test)] -mod tests { - use embed_regex::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(".*"); - assert_embedding_convertion("(a|b)"); - assert_embedding_convertion("(|a)"); - assert_embedding_convertion(".*ab"); - assert_embedding_convertion("[a-e]{3}"); - assert_embedding_convertion("[a-e]{3}efg"); - assert_embedding_convertion("toto"); - assert_embedding_convertion(".{2,3}"); - assert_embedding_convertion("q(abc?|ca)x"); - assert_embedding_convertion(".*q(abc?|ca)x"); - assert_embedding_convertion("(abc){3,6}"); - 
assert_embedding_convertion("((|a)abd+){3}"); - /*assert_embedding_convertion( - "((aad|ads|a)*abc.*def.*uif(aad|ads|x)*abc.*oxs.*def(aad|ads|ax)*abc.*def.*ksd|q)", - );*/ - Ok(()) - } - - fn assert_embedding_convertion(regex: &str) { - let regex = RegularExpression::new(regex).unwrap(); - println!("{}", regex); - - let automaton = regex.to_automaton().unwrap().determinize().unwrap(); - //automaton.to_dot(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_embedding(®ex); - - //println!("{:?}", embedding); - - // FAIR - let embedding_u16 = RegexToken::to_fair_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u16 - .iter() - .map(|&t| RegexToken::from_fair_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - - // AI - let embedding_u8 = RegexToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_regex = tokenizer.from_regex_embedding(&embedding).unwrap(); - assert_eq!(regex, unembedded_regex); - } -} diff --git a/src/tokenizer/embed_regex_operations.rs b/src/tokenizer/embed_regex_operations.rs deleted file mode 100644 index 4dcb19f..0000000 --- a/src/tokenizer/embed_regex_operations.rs +++ /dev/null @@ -1,119 +0,0 @@ -use token::TokenError; - -use crate::regex::RegularExpression; - -use self::token::regex_operations_token::RegexOperationsToken; - -use super::*; - -impl Tokenizer<'_> { - pub fn to_regex_operations_embedding( - &self, - regex_operations: &[(bool, RegularExpression)], - ) -> Vec { - let mut vec = vec![]; - - for (not, regex) in regex_operations { - if !vec.is_empty() { - vec.push(RegexOperationsToken::And); - } - if *not { - vec.push(RegexOperationsToken::Not); - } - - vec.extend( - self.to_regex_embedding(regex) - .into_iter() - .map(RegexOperationsToken::RegexToken), - ); - } - - 
vec - } - - pub fn from_regex_operations_embedding( - &self, - vec: &[RegexOperationsToken], - ) -> Result, TokenError> { - let mut operations = vec![]; - let mut current_regex_not = false; - let mut current_regex_token = vec![]; - for token in vec { - match token { - RegexOperationsToken::RegexToken(regex_token) => { - current_regex_token.push(*regex_token) - } - RegexOperationsToken::And => { - let regex = self.from_regex_embedding(¤t_regex_token)?; - operations.push((current_regex_not, regex)); - current_regex_not = false; - current_regex_token.clear(); - } - RegexOperationsToken::Not => current_regex_not = true, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }; - } - - if !current_regex_token.is_empty() { - let regex = self.from_regex_embedding(¤t_regex_token)?; - operations.push((current_regex_not, regex)); - } - - Ok(operations) - } -} - -#[cfg(test)] -mod tests { - use embed_regex_operations::token::Token; - - use crate::regex::RegularExpression; - - use super::*; - - #[test] - fn test_tokenize() -> Result<(), String> { - assert_embedding_convertion(&[(false, "(a|b)")]); - assert_embedding_convertion(&[(false, "(|a)")]); - assert_embedding_convertion(&[(false, ".*ab")]); - assert_embedding_convertion(&[(true, "toto")]); - assert_embedding_convertion(&[(false, ".{2,3}")]); - assert_embedding_convertion(&[(false, "q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, ".*q(abc?|ca)x")]); - assert_embedding_convertion(&[(false, "(abc){3,6}")]); - assert_embedding_convertion(&[(true, "((|a)abd+){3}")]); - - assert_embedding_convertion(&[(false, ".*a.*"), (false, ".*b.*"), (true, ".*abc.*")]); - Ok(()) - } - - fn assert_embedding_convertion(operations: &[(bool, &str)]) { - let mut automaton = FastAutomaton::new_total(); - let operations: Vec<(bool, RegularExpression)> = operations - .iter() - .map(|(not, regex)| { - let regex = RegularExpression::new(regex).unwrap(); - automaton = 
automaton.intersection(®ex.to_automaton().unwrap()).unwrap(); - (*not, regex) - }) - .collect(); - - let tokenizer = Tokenizer::new(&automaton); - let embedding = tokenizer.to_regex_operations_embedding(&operations); - - // AI - let embedding_u8: Vec = RegexOperationsToken::to_ai_tokens(&embedding).unwrap(); - assert_eq!( - embedding, - embedding_u8 - .iter() - .map(|&t| RegexOperationsToken::from_ai_token(t)) - .collect::>() - ); - - let unembedded_operations = tokenizer - .from_regex_operations_embedding(&embedding) - .unwrap(); - assert_eq!(operations, unembedded_operations); - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs deleted file mode 100644 index 2e3e4ed..0000000 --- a/src/tokenizer/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -use std::{cmp::Ordering, collections::VecDeque, vec}; - -use ahash::HashMapExt; -use crate::fast_automaton::spanning_set::SpanningSet; -use crate::Range; - -use crate::{ - fast_automaton::{FastAutomaton, State}, - IntMap, IntSet, -}; - -use self::{range_tokenizer::RangeTokenizer, token::automaton_token::AutomatonToken}; - -mod embed_automaton; -mod embed_regex; -mod embed_regex_operations; -pub mod range_tokenizer; -pub mod token; - -#[derive(Debug)] -pub struct Tokenizer<'a> { - range_tokenizer: RangeTokenizer<'a>, - automaton: &'a FastAutomaton, - state_to_token: IntMap, -} - -impl Tokenizer<'_> { - pub fn new(automaton: &FastAutomaton) -> Tokenizer<'_> { - let mut worklist = VecDeque::with_capacity(automaton.get_number_of_states()); - let mut seen = IntSet::default(); - - worklist.push_front(automaton.get_start_state()); - - let mut state_counter: u16 = 0; - let mut state_to_token = IntMap::with_capacity(automaton.get_number_of_states()); - - while let Some(current_state) = worklist.pop_back() { - if !seen.insert(current_state) { - continue; - } - - state_to_token.insert(current_state, state_counter); - state_counter += 1; - - automaton - .transitions_from_state_enumerate_iter(¤t_state) - .filter(|(_, c)| !c.is_empty()) 
- .for_each(|(to_state, _)| { - if !seen.contains(to_state) { - worklist.push_front(*to_state); - } - }); - } - - Tokenizer { - range_tokenizer: RangeTokenizer::new(automaton.get_spanning_set()), - automaton, - state_to_token, - } - } - - pub fn get_number_of_spanning_ranges(&self) -> usize { - self.range_tokenizer.get_number_of_spanning_ranges() - } - - pub fn get_spanning_set(&self) -> &SpanningSet { - self.range_tokenizer.get_spanning_set() - } -} - -#[cfg(test)] -mod tests {} diff --git a/src/tokenizer/range_tokenizer.rs b/src/tokenizer/range_tokenizer.rs deleted file mode 100644 index 3950033..0000000 --- a/src/tokenizer/range_tokenizer.rs +++ /dev/null @@ -1,74 +0,0 @@ -use self::token::range_token::RangeToken; - -use super::*; - -#[derive(Debug)] -pub struct RangeTokenizer<'a> { - spanning_set: &'a SpanningSet, - total: Range, -} - -impl RangeTokenizer<'_> { - pub fn get_spanning_set(&self) -> &SpanningSet { - self.spanning_set - } - - pub fn new(spanning_set: &SpanningSet) -> RangeTokenizer<'_> { - let total = spanning_set.get_rest().complement(); - RangeTokenizer { - spanning_set, - total, - } - } - - pub fn range_to_embedding(&self, range: &Range) -> Option> { - if range == &self.total { - return Some(vec![RangeToken::Total]); - } else if !range.difference(&self.total).is_empty() { - return None; - } - - let mut vec = vec![]; - for (token, base) in self.spanning_set.get_spanning_ranges().enumerate() { - if range.contains_all(base) { - vec.push(RangeToken::Base(token)); - } - } - vec.sort_unstable(); - - Some(vec) - } - - pub fn embedding_to_range(&self, vec: &[RangeToken]) -> Option { - if vec.is_empty() { - return Some(Range::empty()); - } - - let mut range = Range::empty(); - if vec[0] == RangeToken::Total { - return Some(self.total.clone()); - } - - for token in vec { - if let Some(base) = self.token_to_range(token) { - range = range.union(base); - } else { - return None; - } - } - - Some(range) - } - - pub fn token_to_range(&self, token: &RangeToken) 
-> Option<&Range> { - match token { - RangeToken::Total => Some(&self.total), - RangeToken::Base(b) => self.spanning_set.get_spanning_range(*b), - RangeToken::Error => panic!("error token"), - } - } - - pub fn get_number_of_spanning_ranges(&self) -> usize { - self.spanning_set.get_number_of_spanning_ranges() - } -} diff --git a/src/tokenizer/token/automaton_token.rs b/src/tokenizer/token/automaton_token.rs deleted file mode 100644 index 215ffed..0000000 --- a/src/tokenizer/token/automaton_token.rs +++ /dev/null @@ -1,115 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum AutomatonToken { - Range(RangeToken), - State(u16), - AcceptState, - SeparatorState, - Error, -} - -impl Ord for AutomatonToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for AutomatonToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl AutomatonToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_STATE: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_ACCEPT_STATE: u8 = Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES; - const TK_AI_SEPARATOR_STATE: u8 = Self::TK_AI_ACCEPT_STATE + 1; - - pub const AI_MAX_NUMBER_OF_STATES: u8 = 100; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_SEPARATOR_STATE + 1; - - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_STATE: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_ACCEPT_STATE: u16 = Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES; - const TK_FAIR_SEPARATOR_STATE: u16 = Self::TK_FAIR_ACCEPT_STATE + 1; - - pub const FAIR_MAX_NUMBER_OF_STATES: u16 = 65_000; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_SEPARATOR_STATE + 1; -} - -impl Token for AutomatonToken { - fn from_ai_token(token: u8) -> AutomatonToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + 
RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - AutomatonToken::Range(RangeToken::from_ai_token(token)) - } else if (Self::TK_AI_STATE..Self::TK_AI_STATE + Self::AI_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State((token - Self::TK_AI_STATE) as u16) - } else if token == Self::TK_AI_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_AI_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_ai_token()?, - AutomatonToken::State(s) => { - let max = Self::AI_MAX_NUMBER_OF_STATES; - let s = *s as u8; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_AI_STATE - } - AutomatonToken::AcceptState => Self::TK_AI_ACCEPT_STATE, - AutomatonToken::SeparatorState => Self::TK_AI_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> AutomatonToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - AutomatonToken::Range(RangeToken::from_fair_token(token)) - } else if (Self::TK_FAIR_STATE..Self::TK_FAIR_STATE + Self::FAIR_MAX_NUMBER_OF_STATES) - .contains(&token) - { - AutomatonToken::State(token - Self::TK_FAIR_STATE) - } else if token == Self::TK_FAIR_ACCEPT_STATE { - AutomatonToken::AcceptState - } else if token == Self::TK_FAIR_SEPARATOR_STATE { - AutomatonToken::SeparatorState - } else { - AutomatonToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - AutomatonToken::Range(r) => r.to_fair_token()?, - AutomatonToken::State(s) => { - let max = Self::FAIR_MAX_NUMBER_OF_STATES; - let s = *s; - if s > max { - return Err(TokenError::TokenOutOfBound("State", max.into(), s.into())); - } - s + Self::TK_FAIR_STATE - } - AutomatonToken::AcceptState => Self::TK_FAIR_ACCEPT_STATE, - 
AutomatonToken::SeparatorState => Self::TK_FAIR_SEPARATOR_STATE, - AutomatonToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/mod.rs b/src/tokenizer/token/mod.rs deleted file mode 100644 index 2f28e32..0000000 --- a/src/tokenizer/token/mod.rs +++ /dev/null @@ -1,60 +0,0 @@ -use std::fmt::Display; - -use super::*; - -pub mod automaton_token; -pub mod range_token; -pub mod regex_operations_token; -pub mod regex_token; - -#[derive(Debug, PartialEq, Eq)] -pub enum TokenError { - TokenOutOfBound(&'static str, usize, usize), - UnknownToken, - SyntaxError, -} - -impl Display for TokenError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TokenError::TokenOutOfBound(token, expected, got) => write!( - f, - "TokenOutOfBound: {token}, expected: {expected}, got: {got}." - ), - TokenError::UnknownToken => write!(f, "UnknownToken"), - TokenError::SyntaxError => write!(f, "SyntaxError"), - } - } -} - -pub trait Token { - fn from_ai_token(token: u8) -> Self; - - fn to_ai_token(&self) -> Result; - - fn to_ai_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_ai_token()?); - } - Ok(vec) - } - - fn from_fair_token(token: u16) -> Self; - - fn to_fair_token(&self) -> Result; - - fn to_fair_tokens(tokens: &[Self]) -> Result, TokenError> - where - Self: Sized, - { - let mut vec = Vec::with_capacity(tokens.len()); - for token in tokens { - vec.push(token.to_fair_token()?); - } - Ok(vec) - } -} diff --git a/src/tokenizer/token/range_token.rs b/src/tokenizer/token/range_token.rs deleted file mode 100644 index 62a1753..0000000 --- a/src/tokenizer/token/range_token.rs +++ /dev/null @@ -1,92 +0,0 @@ -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RangeToken { - Total, - Base(usize), - Error, -} - -impl RangeToken { - const TK_AI_TOTAL: u8 = 0; - const TK_AI_BASE: u8 
= 1; - - pub const AI_MAX_NUMBER_OF_BASES: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES + 1; - - const TK_FAIR_TOTAL: u16 = 0; - const TK_FAIR_BASE: u16 = 1; - - pub const FAIR_MAX_NUMBER_OF_BASES: u16 = 127; - - pub const FAIR_VOCABULARY_SIZE: u16 = Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES + 1; -} - -impl Ord for RangeToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RangeToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Token for RangeToken { - fn from_ai_token(token: u8) -> RangeToken { - if token == Self::TK_AI_TOTAL { - RangeToken::Total - } else if (Self::TK_AI_BASE..Self::TK_AI_BASE + Self::AI_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_AI_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_AI_TOTAL, - RangeToken::Base(b) => { - let max = Self::AI_MAX_NUMBER_OF_BASES; - let b = *b as u8; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_AI_BASE - } - RangeToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> RangeToken { - if token == Self::TK_FAIR_TOTAL { - RangeToken::Total - } else if (Self::TK_FAIR_BASE..Self::TK_FAIR_BASE + Self::FAIR_MAX_NUMBER_OF_BASES) - .contains(&token) - { - RangeToken::Base((token - Self::TK_FAIR_BASE) as usize) - } else { - RangeToken::Error - } - } - - fn to_fair_token(&self) -> Result { - Ok(match self { - RangeToken::Total => Self::TK_FAIR_TOTAL, - RangeToken::Base(b) => { - let max = Self::FAIR_MAX_NUMBER_OF_BASES; - let b = *b as u16; - if b > max { - return Err(TokenError::TokenOutOfBound("Base", max.into(), b.into())); - } - b + Self::TK_FAIR_BASE - } - RangeToken::Error => return 
Err(TokenError::UnknownToken), - }) - } -} diff --git a/src/tokenizer/token/regex_operations_token.rs b/src/tokenizer/token/regex_operations_token.rs deleted file mode 100644 index 1074f7f..0000000 --- a/src/tokenizer/token/regex_operations_token.rs +++ /dev/null @@ -1,64 +0,0 @@ -use self::regex_token::RegexToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexOperationsToken { - RegexToken(RegexToken), - And, - Not, - Error, -} - -impl Ord for RegexOperationsToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_ai_token().unwrap()).cmp(&other.to_ai_token().unwrap()) - } -} - -impl PartialOrd for RegexOperationsToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexOperationsToken { - const TK_AI_REGEX_TOKEN: u8 = 0; - const TK_AI_AND: u8 = Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE; - const TK_AI_NOT: u8 = Self::TK_AI_AND + 1; - - pub const AI_VOCABULARY_SIZE: u8 = Self::TK_AI_NOT + 1; -} - -impl Token for RegexOperationsToken { - fn from_ai_token(token: u8) -> RegexOperationsToken { - if (Self::TK_AI_REGEX_TOKEN..Self::TK_AI_REGEX_TOKEN + RegexToken::AI_VOCABULARY_SIZE) - .contains(&token) - { - RegexOperationsToken::RegexToken(RegexToken::from_ai_token(token)) - } else if token == Self::TK_AI_AND { - RegexOperationsToken::And - } else if token == Self::TK_AI_NOT { - RegexOperationsToken::Not - } else { - RegexOperationsToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexOperationsToken::RegexToken(regex_token) => regex_token.to_ai_token()?, - RegexOperationsToken::And => Self::TK_AI_AND, - RegexOperationsToken::Not => Self::TK_AI_NOT, - RegexOperationsToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(_: u16) -> RegexOperationsToken { - panic!("A RegexOperationsToken does not have a FAIR representation.") - } - - fn to_fair_token(&self) -> Result { - panic!("A RegexOperationsToken does not have 
a FAIR representation.") - } -} diff --git a/src/tokenizer/token/regex_token.rs b/src/tokenizer/token/regex_token.rs deleted file mode 100644 index 2f4c2f2..0000000 --- a/src/tokenizer/token/regex_token.rs +++ /dev/null @@ -1,137 +0,0 @@ -use self::range_token::RangeToken; - -use super::*; - -#[derive(Debug, Eq, PartialEq, Clone, Copy)] -pub enum RegexToken { - Range(RangeToken), - StartGroup, - EndGroup, - Alternation, - RepetitionNone, - Repetition(u16), - Error, -} - -impl Ord for RegexToken { - fn cmp(&self, other: &Self) -> Ordering { - (self.to_fair_token().unwrap()).cmp(&other.to_fair_token().unwrap()) - } -} - -impl PartialOrd for RegexToken { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl RegexToken { - const TK_AI_RANGE: u8 = 0; - const TK_AI_START_GROUP: u8 = Self::TK_AI_RANGE + RangeToken::AI_VOCABULARY_SIZE; - const TK_AI_END_GROUP: u8 = Self::TK_AI_START_GROUP + 1; - const TK_AI_ALTERNATION: u8 = Self::TK_AI_END_GROUP + 1; - const TK_AI_REPETITION_NONE: u8 = Self::TK_AI_ALTERNATION + 1; - const TK_AI_REPETITION: u8 = Self::TK_AI_REPETITION_NONE + 1; - - pub const AI_MAX_NUMBER_OF_REPETITION: u8 = 10; - - pub const AI_VOCABULARY_SIZE: u8 = - Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION + 1; - - const TK_FAIR_RANGE: u16 = 0; - const TK_FAIR_START_GROUP: u16 = Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE; - const TK_FAIR_END_GROUP: u16 = Self::TK_FAIR_START_GROUP + 1; - const TK_FAIR_ALTERNATION: u16 = Self::TK_FAIR_END_GROUP + 1; - const TK_FAIR_REPETITION_NONE: u16 = Self::TK_FAIR_ALTERNATION + 1; - const TK_FAIR_REPETITION: u16 = Self::TK_FAIR_REPETITION_NONE + 1; - - pub const FAIR_MAX_NUMBER_OF_REPETITION: u16 = 1024; - - pub const FAIR_VOCABULARY_SIZE: u16 = - Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION + 1; -} - -impl Token for RegexToken { - fn from_ai_token(token: u8) -> RegexToken { - if (Self::TK_AI_RANGE..Self::TK_AI_RANGE + 
RangeToken::AI_VOCABULARY_SIZE).contains(&token) - { - RegexToken::Range(RangeToken::from_ai_token(token)) - } else if token == Self::TK_AI_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_AI_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_AI_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_AI_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_AI_REPETITION - ..Self::TK_AI_REPETITION + Self::AI_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition((token - Self::TK_AI_REPETITION) as u16) - } else { - RegexToken::Error - } - } - - fn to_ai_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_ai_token()?, - RegexToken::StartGroup => Self::TK_AI_START_GROUP, - RegexToken::EndGroup => Self::TK_AI_END_GROUP, - RegexToken::Alternation => Self::TK_AI_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_AI_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::AI_MAX_NUMBER_OF_REPETITION; - let r = *r as u8; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_AI_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } - - fn from_fair_token(token: u16) -> RegexToken { - if (Self::TK_FAIR_RANGE..Self::TK_FAIR_RANGE + RangeToken::FAIR_VOCABULARY_SIZE) - .contains(&token) - { - RegexToken::Range(RangeToken::from_fair_token(token)) - } else if token == Self::TK_FAIR_START_GROUP { - RegexToken::StartGroup - } else if token == Self::TK_FAIR_END_GROUP { - RegexToken::EndGroup - } else if token == Self::TK_FAIR_ALTERNATION { - RegexToken::Alternation - } else if token == Self::TK_FAIR_REPETITION_NONE { - RegexToken::RepetitionNone - } else if (Self::TK_FAIR_REPETITION - ..Self::TK_FAIR_REPETITION + Self::FAIR_MAX_NUMBER_OF_REPETITION) - .contains(&token) - { - RegexToken::Repetition(token - Self::TK_FAIR_REPETITION) - } else { - RegexToken::Error - } - } - - fn 
to_fair_token(&self) -> Result { - Ok(match self { - RegexToken::Range(r) => r.to_fair_token()?, - RegexToken::StartGroup => Self::TK_FAIR_START_GROUP, - RegexToken::EndGroup => Self::TK_FAIR_END_GROUP, - RegexToken::Alternation => Self::TK_FAIR_ALTERNATION, - RegexToken::RepetitionNone => Self::TK_FAIR_REPETITION_NONE, - RegexToken::Repetition(r) => { - let max = Self::FAIR_MAX_NUMBER_OF_REPETITION; - let r = *r; - if r > max { - return Err(TokenError::TokenOutOfBound("Repetition", max.into(), r.into())); - } - r + Self::TK_FAIR_REPETITION - } - RegexToken::Error => return Err(TokenError::UnknownToken), - }) - } -} diff --git a/tests/data/regex.txt b/tests/data/regex.txt index e5fb5df..c65eca9 100644 --- a/tests/data/regex.txt +++ b/tests/data/regex.txt @@ -1,3 +1,5 @@ +(a*,a*)? +(?:\s*,\s*(?:0|1|0?\.\d+))? [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f] a{2,3} (abc|fg){2} @@ -19,4 +21,59 @@ a+(ba+)* [0-9]+[A-Z]* ù -^\d$ \ No newline at end of file +^\d$ +foo +bar? +baz+ +qux* +quux{3} +quuux{2,5} +quuuux{0,4} +.* +[aeiou] +[^aeiou] +[a-zA-Z0-9] +[\dA-Fa-f] +[\w&&[^_]] +[[:alpha:]]+ +[\p{L}]+ +[0-9]{2,4} +[01]?\d +[1-9][0-9]* +(cat|dog|mouse) +(?:red|green|blue){2} +(gr(a|e)y){1,3} +((ab|cd)ef)+ +(a(b(c|d)e)f)+ +(a|b(c|d(e|f))){2,3} +(?:abc){0,} +(?:abc){1,} +(?:abc){2,5} +a++ +\.\*\?\+\(\)\[\]\{\}\\\| +\u0041\u0042\u0043 +\p{Greek}+ +\p{Sc} +[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[A-Za-z]{2,} +((25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]?\d?\d) +https?://[^\s/$.?#][^\s]* +\d{4}/\d{2}/\d{2} +\d{1,2}:\d{2}(:\d{2})? 
+<([A-Za-z][A-Za-z0-9]*)[^>]*?/> +\{(?:[^{}]|\{[^{}]*\})*\} +(?:\d[ -]*?){13,16} +#([A-Fa-f0-9]{8}) +(a|b|c|d|e|f|g|h|i|j){5} +(?:"[^"]*"|[^,]*)(?:,(?:"[^"]*"|[^,]*))* +([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2} +[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12} +[[:alnum:]&&[^0-9]] +[ \t]+ +[\r\n]+ +[^\t\r\n]+ +(a*,a*)* +#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3}) +\{(?:\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)(?:,\s*"[^"]*"\s*:\s*(?:\d+|"(...)" )\s*)*\} +rgba?\(\s*(?:\d{1,3}\s*,\s*){2}\d{1,3}(?:\s*,\s*(?:0|1|0?\.\d+))?\s*\) +[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)? +<\w+(?:\s+\w+(?:="[^"]*")?)*\s*/?> diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 1e572a9..a4c142d 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -9,31 +9,26 @@ use regexsolver::regex::RegularExpression; fn assert_regex(regex: &str) { let re = Regex::new(&format!("(?s)^{}$", regex)).unwrap(); - let regex = RegularExpression::new(regex).unwrap(); + let regex = RegularExpression::parse(regex, true).unwrap(); let automaton = regex.to_automaton().unwrap(); - let strings = automaton.generate_strings(500).unwrap(); + let strings = automaton.generate_strings(500, 0).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } - assert_eq!( - automaton.get_number_of_states(), - regex.get_number_of_states_in_nfa() - ); - let determinized_automaton = automaton.determinize().unwrap(); - let strings = determinized_automaton.generate_strings(500).unwrap(); + let strings = determinized_automaton.generate_strings(500, 0).unwrap(); for string in strings { assert!(re.is_match(&string), "'{string}'"); } - assert!(automaton.is_subset_of(&determinized_automaton).unwrap()); - assert!(determinized_automaton.is_subset_of(&automaton).unwrap()); - assert!(automaton.is_equivalent_of(&determinized_automaton).unwrap()); + assert!(automaton.subset(&determinized_automaton).unwrap()); + assert!(determinized_automaton.subset(&automaton).unwrap()); + 
assert!(automaton.equivalent(&determinized_automaton).unwrap()); - let regex_from_automaton = automaton.to_regex().unwrap(); + let regex_from_automaton = automaton.to_regex(); let automaton_from_regex = regex_from_automaton.to_automaton().unwrap(); - assert!(automaton.is_equivalent_of(&automaton_from_regex).unwrap()); + assert!(automaton.equivalent(&automaton_from_regex).unwrap()); } #[test] diff --git a/tests/proptest_strategies.rs b/tests/proptest_strategies.rs new file mode 100644 index 0000000..927ad7d --- /dev/null +++ b/tests/proptest_strategies.rs @@ -0,0 +1,1169 @@ +//! Property-based tests built on `proptest` strategies that generate random +//! DFAs, NFAs and regular expressions. +//! +//! # Coverage guarantee +//! +//! The strategies are parameterized by a fixed finite alphabet ([`ALPHABET`]) +//! and a maximum number of states ([`MAX_STATES`]). Within those bounds every +//! structure has a strictly positive probability of being generated: +//! +//! * [`arb_dfa`] — every deterministic automaton over the alphabet with +//! `1..=MAX_STATES` states (start state fixed to `0`, any accepting subset, +//! any total transition function) can be produced. A transition function maps +//! each `(state, letter)` to at most one target, which is exactly the +//! definition of a DFA, so the whole DFA space is covered. +//! * [`arb_nfa`] — every nondeterministic automaton is reachable: each ordered +//! `(from, to)` pair may carry any subset of the alphabet letters as its +//! label, plus optional epsilon transitions. Since labels may overlap, this +//! spans all NFAs (and, as a subset, all DFAs). +//! * [`arb_regex`] — every regular expression built from the four +//! [`RegularExpression`] variants up to the configured recursion depth and +//! bound sizes is reachable, including the empty language (`[]`) and `.`. +//! +//! The alphabet is small on purpose so the spaces are finite and the +//! determinization / set operations under test stay cheap. +//! +//! 
While *coverage* is uniform-in-support, the *distribution* is deliberately +//! shaped — the `inspect::stats` test measures the result and asserts floors: +//! +//! * Per-automaton edge/epsilon/accept densities are sampled, with the accept +//! density centered on ½ (where accepting/rejecting states are hardest to +//! merge, keeping minimal DFAs — and therefore the work done by minimize / +//! equivalence / state elimination — large). +//! * An optional "anchor" state is forced accepting so the empty language is +//! an occasional edge case instead of a fifth of the sample. +//! * A per-automaton acyclic mode (≈⅓ of cases) generates DAGs, whose finite +//! languages exercise the topological-sort paths of the cardinality and +//! max-length analyses that cyclic automata never reach. +//! * The character-class strategy favors single letters so that a `[]` +//! (empty-language) leaf does not collapse most expressions. +//! +//! All weights stay strictly inside (0, 1), preserving the +//! non-null-probability guarantee. + +use proptest::prelude::*; +use regex_charclass::char::Char; +use regexsolver::CharRange; +use regexsolver::cardinality::Cardinality; +use regexsolver::error::EngineError; +use regexsolver::execution_profile::ExecutionProfileBuilder; +use regexsolver::fast_automaton::FastAutomaton; +use regexsolver::regex::RegularExpression; + +/// Fixed alphabet the strategies draw transition labels from. +pub const ALPHABET: &[char] = &['a', 'b']; + +/// Maximum number of states a generated automaton can have. +pub const MAX_STATES: usize = 5; + +/// The single-character range for the `i`-th letter of `alphabet`. +fn letter_over(alphabet: &[char], i: usize) -> CharRange { + let c = Char::new(alphabet[i]); + CharRange::new_from_range(c..=c) +} + +/// The single-character range for the `i`-th [`ALPHABET`] letter. +fn letter(i: usize) -> CharRange { + letter_over(ALPHABET, i) +} + +/// Number of transition-label letters (one per [`ALPHABET`] letter). 
+fn num_bases() -> usize { + ALPHABET.len() +} + +/// Number of states, biased toward larger automata (the maximum of two uniform +/// draws). Tiny automata are still generated, but the n = 1 space is almost +/// entirely degenerate, so uniform sampling would waste a quarter of all cases +/// on it. +fn arb_num_states() -> impl Strategy { + (1usize..=MAX_STATES, 1usize..=MAX_STATES).prop_map(|(a, b)| a.max(b)) +} + +/// Builds an automaton over `alphabet` from a structural description using +/// only the public API. +/// +/// `accepts[s]` marks state `s` accepting; each `(from, to, mask)` adds a +/// transition whose label is the union of the letters selected by `mask` +/// (`add_transition_from_range` grows the automaton's spanning set as +/// needed); each `(from, to)` in `eps` adds an epsilon transition. The start +/// state is `0`. +fn build_over( + alphabet: &[char], + n: usize, + accepts: &[bool], + char_edges: &[(usize, usize, Vec)], + eps: &[(usize, usize)], +) -> FastAutomaton { + let mut a = FastAutomaton::new_empty(); + for _ in 1..n { + a.new_state(); + } + + for (s, &acc) in accepts.iter().enumerate() { + if acc { + a.accept(s); + } + } + + for (from, to, mask) in char_edges { + let mut range = CharRange::empty(); + for (i, &on) in mask.iter().enumerate() { + if on { + range = range.union(&letter_over(alphabet, i)); + } + } + a.add_transition_from_range(*from, *to, &range) + .expect("adding a union of alphabet letters never fails"); + } + + // Epsilon transitions are added last: `add_epsilon_transition` eagerly + // folds the target's current transitions into the source. + for (from, to) in eps { + a.add_epsilon_transition(*from, *to); + } + + a +} + +/// [`build_over`] with the default [`ALPHABET`]. 
+fn build( + n: usize, + accepts: &[bool], + char_edges: &[(usize, usize, Vec)], + eps: &[(usize, usize)], +) -> FastAutomaton { + build_over(ALPHABET, n, accepts, char_edges, eps) +} + +/// Strategy producing every DFA over the alphabet with `1..=MAX_STATES` states. +/// +/// Determinism is structural: for each state and each letter we choose at most +/// one target, so transitions leaving a state always carry disjoint labels. +/// +/// An edge density and an accept density are sampled per automaton (both +/// bounded away from 0 and 1, so every DFA keeps a positive probability). +/// Mostly-total transition functions keep the states connected, which makes +/// degenerate (∅ / {""}) languages the exception rather than the rule. The +/// accept density is centered on ½ because that is where accepting/rejecting +/// states are hardest to merge, i.e. where minimal DFAs stay large. +/// +/// A per-automaton `acyclic` flag (≈⅓ of cases) remaps every chosen target +/// into the forward range `from+1..n`, producing a DAG and therefore a +/// **finite** language. Without it nearly every random automaton contains a +/// cycle, and the finite-language paths of the cardinality and max-length +/// analyses go untested. Cyclic mode still reaches every DFA, so coverage is +/// preserved. +pub fn arb_dfa() -> impl Strategy { + ( + arb_num_states(), + 0.6f64..0.97, + 0.25f64..0.75, + prop::bool::weighted(0.35), + ) + .prop_flat_map(|(n, edge_density, accept_density, acyclic)| { + let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // An "anchor" state forced accepting most of the time: without it + // the whole accept vector samples all-false often enough that the + // empty language eats a fifth of the sample. Non-start states are + // preferred — anchoring the start only inflates the {""} corner. + // The `None` branch keeps every accept subset (incl. all-false) + // reachable. 
+ let anchor = prop::option::weighted(0.85, 1usize.min(n - 1)..n); + // transition function: tf[state][base] = optional target state + let tf = prop::collection::vec( + prop::collection::vec(prop::option::weighted(edge_density, 0usize..n), num_bases()), + n, + ); + (Just(n), accepts, anchor, tf, Just(acyclic)) + }) + .prop_map(|(n, mut accepts, anchor, tf, acyclic)| { + if let Some(k) = anchor { + accepts[k] = true; + } + let nb = num_bases(); + let mut edges: Vec<(usize, usize, Vec)> = Vec::new(); + for (from, row) in tf.iter().enumerate() { + // Group the bases by their chosen target so each (from, to) + // edge gets a single, disjoint-from-its-siblings label. + let mut by_target: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); + for (base, target) in row.iter().enumerate() { + if let Some(t) = target { + let t = if acyclic { + if from + 1 >= n { + // the last state of a DAG has no outgoing edge + continue; + } + // remap into the forward range; every forward + // target keeps a positive probability + from + 1 + (*t % (n - from - 1)) + } else { + *t + }; + by_target.entry(t).or_insert_with(|| vec![false; nb])[base] = true; + } + } + for (to, mask) in by_target { + edges.push((from, to, mask)); + } + } + build(n, &accepts, &edges, &[]) + }) +} + +/// Strategy producing every NFA over the alphabet with `1..=MAX_STATES` states +/// (overlapping labels allowed, plus optional epsilon transitions). +/// +/// Instead of a fixed per-bit probability (which blobs large automata and +/// starves small ones), a per-state branching target is sampled and converted +/// into a bit density of `target / (n · |Σ|)`, so the *local* structure is +/// comparable across sizes. Epsilon and accept densities are sampled too. All +/// densities stay strictly inside (0, 1), so every NFA keeps a positive +/// probability. 
+/// +/// As in [`arb_dfa`], the accept density is centered on ½ and a per-automaton +/// `acyclic` flag (≈⅓ of cases) keeps only forward (`from < to`) edges, +/// producing finite languages; the density is rescaled to the smaller target +/// pool so the out-degree stays comparable. Epsilon transitions are kept rare +/// because [`FastAutomaton::add_epsilon_transition`] eagerly folds the target +/// state into the source, which merges languages and shrinks minimal DFAs. +pub fn arb_nfa() -> impl Strategy { + arb_nfa_over(ALPHABET) +} + +/// [`arb_nfa`] generalized to an arbitrary alphabet, so two operands of a +/// binary operation can be generated over *different* alphabets — the only +/// way to exercise `SpanningSet::merge` and the `ConditionConverter` +/// re-projection (same-alphabet operands share an identical spanning set and +/// the conversion is the identity). +pub fn arb_nfa_over(alphabet: &'static [char]) -> impl Strategy { + ( + arb_num_states(), + 1.0f64..2.8, + 0.02f64..0.12, + 0.25f64..0.75, + prop::bool::weighted(0.35), + ) + .prop_flat_map( + move |(n, target_out_degree, eps_density, accept_density, acyclic)| { + // In acyclic mode only the upper triangle of the matrix survives, + // so the average target pool is half as big. 
+ let effective_targets = if acyclic { + (n as f64 / 2.0).max(1.0) + } else { + n as f64 + }; + let label_density = (target_out_degree + / (effective_targets * alphabet.len() as f64)) + .clamp(0.02, 0.95); + let accepts = prop::collection::vec(prop::bool::weighted(accept_density), n); + // see arb_dfa: keeps the empty language an edge case, not a fifth + // of the sample + let anchor = prop::option::weighted(0.85, 1usize.min(n - 1)..n); + // labels[from][to] = mask over the alphabet letters + let labels = prop::collection::vec( + prop::collection::vec( + prop::collection::vec(prop::bool::weighted(label_density), alphabet.len()), + n, + ), + n, + ); + // eps[from][to] = whether an epsilon transition is present + let eps = prop::collection::vec( + prop::collection::vec(prop::bool::weighted(eps_density), n), + n, + ); + (Just(n), accepts, anchor, labels, eps, Just(acyclic)) + }, + ) + .prop_map(move |(n, mut accepts, anchor, labels, eps, acyclic)| { + if let Some(k) = anchor { + accepts[k] = true; + } + let mut char_edges = Vec::new(); + for (from, row) in labels.iter().enumerate() { + for (to, mask) in row.iter().enumerate() { + if acyclic && to <= from { + continue; + } + if mask.iter().any(|&b| b) { + char_edges.push((from, to, mask.clone())); + } + } + } + let mut eps_edges = Vec::new(); + for (from, row) in eps.iter().enumerate() { + for (to, &on) in row.iter().enumerate() { + let backward = acyclic && to <= from; + if on && from != to && !backward { + eps_edges.push((from, to)); + } + } + } + build_over(alphabet, n, &accepts, &char_edges, &eps_edges) + }) +} + +/// Strategy for a character class: any subset of the alphabet (including the +/// empty language) and, occasionally, the total range `.`. +/// +/// Single letters dominate: the unbiased subset mask would produce the empty +/// class `[]` a quarter of the time, and a single `[]` anywhere in a +/// concatenation collapses the whole expression to the empty language. 
The +/// mask branch keeps every subset (including `[]`) at a positive probability. +fn arb_charrange() -> impl Strategy { + prop_oneof![ + 8 => (0..ALPHABET.len()).prop_map(letter), + 3 => prop::collection::vec(any::(), ALPHABET.len()).prop_map(|mask| { + let mut r = CharRange::empty(); + for (i, &on) in mask.iter().enumerate() { + if on { + r = r.union(&letter(i)); + } + } + r + }), + 1 => Just(CharRange::total()), + ] +} + +/// Strategy producing regular expressions over the four [`RegularExpression`] +/// variants up to a bounded recursion depth. +/// +/// Repetition gets the heaviest weight: it is the variant that feeds the +/// `{n,m}` expansion, the simplifier and the loop handling of state +/// elimination, and stacking it (`(a*){2}`-style nesting) is where those +/// paths historically break. Concat and alternation still keep substantial +/// weight so all shapes appear. +pub fn arb_regex() -> impl Strategy { + let leaf = arb_charrange().prop_map(RegularExpression::Character); + leaf.prop_recursive(4, 48, 3, |inner| { + prop_oneof![ + 3 => (inner.clone(), 0u32..=2, 0u32..=2, any::()).prop_map( + |(r, min, extra, has_max)| { + // max is min + extra, so the bounds are always valid + let max = if has_max { Some(min + extra) } else { None }; + RegularExpression::Repetition(Box::new(r), min, max) + } + ), + 2 => prop::collection::vec(inner.clone(), 1..=3) + .prop_map(|v| RegularExpression::Concat(v.into())), + 2 => prop::collection::vec(inner, 1..=3).prop_map(RegularExpression::Alternation), + ] + }) +} + +/// Runs `f` under a bounded execution profile, returning `None` when the +/// operation legitimately exceeds the state/time budget (which is not a bug). 
+fn bounded Result>(f: F) -> Option { + ExecutionProfileBuilder::new() + .max_number_of_states(8192) + .execution_timeout(3000) + .build() + .run(|| match f() { + Ok(v) => Some(v), + Err(EngineError::AutomatonHasTooManyStates) + | Err(EngineError::OperationTimeOutError) => None, + Err(e) => panic!("unexpected engine error: {e:?}"), + }) +} + +fn determinized(a: &FastAutomaton) -> Option { + bounded(|| a.determinize().map(|c| c.into_owned())) +} + +fn complemented(a: &FastAutomaton) -> Option { + bounded(|| { + let mut c = a.clone(); + c.complement()?; + Ok(c) + }) +} + +/// All strings up to length `max_len` over `alphabet` (plus the empty +/// string). +fn probes_over(alphabet: &[char], max_len: usize) -> Vec { + let mut all = vec![String::new()]; + let mut frontier = vec![String::new()]; + for _ in 0..max_len { + let mut next = Vec::new(); + for w in &frontier { + for &c in alphabet { + let mut s = w.clone(); + s.push(c); + next.push(s); + } + } + all.extend(next.iter().cloned()); + frontier = next; + } + all +} + +/// All strings up to length 4 over [`ALPHABET`] (plus the empty string). +fn probes() -> Vec { + probes_over(ALPHABET, 4) +} + +/// Asserts that intersection / union / difference of `a` and `b` agree with +/// the boolean combination of the operands on every probe string. +fn assert_set_ops_membership( + a: &FastAutomaton, + b: &FastAutomaton, + probes: &[String], +) -> Result<(), TestCaseError> { + if let Some(inter) = bounded(|| a.intersection(b)) { + for s in probes { + prop_assert_eq!( + inter.is_match(s), + a.is_match(s) && b.is_match(s), + "intersection membership for {:?}", + s + ); + } + } + if let Some(union) = bounded(|| a.union(b)) { + for s in probes { + prop_assert_eq!( + union.is_match(s), + a.is_match(s) || b.is_match(s), + "union membership for {:?}", + s + ); + } + } + // `difference` determinizes the subtrahend itself. 
+ if let Some(diff) = bounded(|| a.difference(b)) { + for s in probes { + prop_assert_eq!( + diff.is_match(s), + a.is_match(s) && !b.is_match(s), + "difference membership for {:?}", + s + ); + } + } + Ok(()) +} + +/// Decomposition oracle for repetition: `s` is in L(a){min,max} iff `s` +/// splits into k pieces, each in L(a), for some valid k. Piece counts +/// saturate at `min` once they can only grow (relevant for unbounded +/// maxima and for "" ∈ L(a), which allows padding with empty pieces). +fn repeat_decomposition_oracle(a: &FastAutomaton, s: &str, min: u32, max: Option) -> bool { + let min = min as usize; + let cap = max.map(|m| m as usize).unwrap_or(min).max(min); + let accepts_empty = a.is_match(""); + let len = s.len(); + + // reach[i][k]: the prefix of length i splits into exactly k pieces + // (k saturated at cap + 1 to keep the table finite). + let k_slots = cap + 2; + let mut reach = vec![vec![false; k_slots]; len + 1]; + reach[0][0] = true; + for i in 0..=len { + for k in 0..k_slots { + if !reach[i][k] { + continue; + } + let next_k = (k + 1).min(cap + 1); + // Pad with an empty piece. + if accepts_empty { + reach[i][next_k] = true; + } + // Consume a non-empty piece. + for j in i + 1..=len { + if a.is_match(&s[i..j]) { + reach[j][next_k] = true; + } + } + } + } + + let k_ok = |k: usize| { + k >= min + && match max { + Some(m) => k <= m as usize, + None => true, + } + }; + (0..k_slots).any(|k| reach[len][k] && k_ok(k)) +} + +proptest! { + #![proptest_config(ProptestConfig::with_cases(192))] + + /// The DFA strategy really does produce deterministic automata. + #[test] + fn dfa_strategy_is_deterministic(a in arb_dfa()) { + prop_assert!(a.is_deterministic(), "arb_dfa produced a non-deterministic automaton"); + } + + /// `a` and `determinize(a)` accept the same language. 
+ #[test] + fn determinize_preserves_language(a in arb_nfa()) { + if let Some(d) = determinized(&a) { + prop_assert!(d.is_deterministic()); + if let Some(eq) = bounded(|| a.equivalent(&d)) { + prop_assert!(eq, "determinize changed the language"); + } + } + } + + /// Minimizing a DFA preserves its language. + #[test] + fn minimize_preserves_language(a in arb_nfa()) { + if let Some(d) = determinized(&a) { + let mut m = d.clone(); + if bounded(|| m.minimize()).is_some() + && let Some(eq) = bounded(|| d.equivalent(&m)) + { + prop_assert!(eq, "minimize changed the language"); + } + } + } + + /// Complement laws: membership flips, `a ∩ ¬a = ∅`, `a ∪ ¬a = Σ*`. + #[test] + fn complement_laws(a in arb_dfa()) { + let d = match determinized(&a) { Some(d) => d, None => return Ok(()) }; + let c = match complemented(&d) { Some(c) => c, None => return Ok(()) }; + + for s in probes() { + prop_assert_eq!(d.is_match(&s), !c.is_match(&s), "complement membership for {:?}", s); + } + + if let Some(inter) = bounded(|| d.intersection(&c)) + && let Some(empty) = bounded(|| inter.equivalent(&FastAutomaton::new_empty())) + { + prop_assert!(empty, "a ∩ ¬a is not empty"); + } + if let Some(union) = bounded(|| d.union(&c)) + && let Some(total) = bounded(|| union.equivalent(&FastAutomaton::new_total())) + { + prop_assert!(total, "a ∪ ¬a is not total"); + } + } + + /// Membership of intersection / union / difference matches the boolean + /// combination of the operands on every probe string. + #[test] + fn set_ops_membership(a in arb_nfa(), b in arb_nfa()) { + assert_set_ops_membership(&a, &b, &probes())?; + } + + /// Set operations across operands built over *overlapping but different* + /// alphabets ({a,b} vs {b,c}): the operands carry different spanning + /// sets, so `SpanningSet::merge` and the `ConditionConverter` + /// re-projection do real work (same-alphabet pairs convert via the + /// identity). The shared letter `b` keeps the intersections non-trivial. 
+ #[test] + fn set_ops_membership_overlapping_alphabets( + a in arb_nfa_over(&['a', 'b']), + b in arb_nfa_over(&['b', 'c']), + ) { + assert_set_ops_membership(&a, &b, &probes_over(&['a', 'b', 'c'], 4))?; + } + + /// Set operations across operands built over *disjoint* alphabets + /// ({a,b} vs {c,d}): the merged spanning set shares no base with either + /// source, the most extreme re-projection. The intersection collapses to + /// at most {""} — itself a worthwhile edge case. + #[test] + fn set_ops_membership_disjoint_alphabets( + a in arb_nfa_over(&['a', 'b']), + b in arb_nfa_over(&['c', 'd']), + ) { + assert_set_ops_membership(&a, &b, &probes_over(&['a', 'b', 'c', 'd'], 3))?; + } + + /// `length` and `cardinality` agree with brute-force enumeration. + /// The probes cover *every* string up to length 4, so they are exactly + /// the language whenever the maximum length is ≤ 4, and a complete + /// census of its short strings otherwise. + #[test] + fn length_cardinality_match_brute_force(a in arb_nfa()) { + let (min, max) = a.length(); + let matched_lengths: Vec = probes() + .iter() + .filter(|s| a.is_match(s)) + .map(|s| s.chars().count() as u32) + .collect(); + + // Minimum: any string of length ≤ 4 is a probe, so a language with + // min ≤ 4 has a matched probe of exactly that length. + match (min, matched_lengths.iter().min()) { + (Some(min_len), Some(&shortest)) => { + prop_assert_eq!(min_len, shortest, "min length disagrees with enumeration"); + } + (Some(min_len), None) => { + prop_assert!(min_len > 4, "min ≤ 4 but no probe matched"); + } + (None, Some(_)) => prop_assert!(false, "empty language matched a probe"), + (None, None) => {} + } + + if let Some(max_len) = max + && max_len <= 4 + { + // The probes enumerate the whole language. 
+ prop_assert_eq!( + Some(max_len), + matched_lengths.iter().max().copied(), + "max length disagrees with enumeration" + ); + if let Some(cardinality) = bounded(|| a.cardinality()) { + prop_assert_eq!( + cardinality, + Cardinality::Integer(matched_lengths.len() as u32), + "cardinality disagrees with enumeration" + ); + } + } else if max.is_none() + && min.is_some() + && let Some(cardinality) = bounded(|| a.cardinality()) + { + // A cycle on an accepting path means infinitely many strings. + prop_assert_eq!( + cardinality, + Cardinality::Infinite, + "infinite language with non-infinite cardinality" + ); + } + } + + /// `FastAutomaton::concat` agrees with the split-membership oracle: + /// s ∈ L(a)·L(b) iff some split s = u·v has u ∈ L(a) and v ∈ L(b). + #[test] + fn automaton_concat_matches_split_oracle(a in arb_nfa(), b in arb_nfa()) { + if let Some(concat) = bounded(|| a.concat(&b)) { + for s in probes() { + let expected = + (0..=s.len()).any(|i| a.is_match(&s[..i]) && b.is_match(&s[i..])); + prop_assert_eq!( + concat.is_match(&s), expected, + "concat membership for {:?}", s + ); + } + } + } + + /// `FastAutomaton::repeat` agrees with a decomposition oracle computed by + /// dynamic programming over (position, piece-count) — independent of the + /// engine's own repeat construction (which the regex route would reuse). 
+ #[test] + fn automaton_repeat_matches_decomposition_oracle( + a in arb_nfa(), + min in 0u32..3, + extra in 0u32..2, + unbounded in any::(), + ) { + let max = if unbounded { None } else { Some(min + extra) }; + if let Some(repeated) = bounded(|| a.repeat(min, max)) { + for s in probes() { + let expected = repeat_decomposition_oracle(&a, &s, min, max); + prop_assert_eq!( + repeated.is_match(&s), expected, + "repeat({}, {:?}) membership for {:?}", min, max, s + ); + } + } + } + + /// `Term::union` / `Term::intersection` over more than 3 operands (the + /// parallel dispatch path when the `parallel` feature is on) agree with + /// sequential pairwise folds. + #[test] + fn many_operand_term_ops_match_pairwise_folds( + a in arb_nfa(), b in arb_nfa(), c in arb_nfa(), d in arb_nfa(), e in arb_nfa(), + ) { + use regexsolver::Term; + + let operands: Vec = [&b, &c, &d, &e] + .into_iter() + .map(|x| Term::from_automaton(x.clone())) + .collect(); + let first = Term::from_automaton(a.clone()); + + if let Some(many) = bounded(|| { + Ok(first.union(&operands)?.to_automaton()?.into_owned()) + }) && let Some(pairwise) = bounded(|| { + let mut acc = a.clone(); + for x in [&b, &c, &d, &e] { + acc = acc.union(x)?; + } + Ok(acc) + }) && let Some(eq) = bounded(|| many.equivalent(&pairwise)) { + prop_assert!(eq, "5-operand union disagrees with pairwise folds"); + } + + if let Some(many) = bounded(|| { + Ok(first.intersection(&operands)?.to_automaton()?.into_owned()) + }) && let Some(pairwise) = bounded(|| { + let mut acc = a.clone(); + for x in [&b, &c, &d, &e] { + acc = acc.intersection(x)?; + } + Ok(acc) + }) && let Some(eq) = bounded(|| many.equivalent(&pairwise)) { + prop_assert!(eq, "5-operand intersection disagrees with pairwise folds"); + } + } + + /// `subset` and `equivalent` agree: mutual subset iff equivalent; both are + /// reflexive. 
+    #[test]
+    fn subset_equivalent_consistency(a in arb_nfa(), b in arb_nfa()) {
+        if let Some(refl) = bounded(|| a.equivalent(&a)) {
+            prop_assert!(refl, "equivalent is not reflexive");
+        }
+        if let Some(refl) = bounded(|| a.subset(&a)) {
+            prop_assert!(refl, "subset is not reflexive");
+        }
+        if let (Some(ab), Some(ba), Some(eq)) = (
+            bounded(|| a.subset(&b)),
+            bounded(|| b.subset(&a)),
+            bounded(|| a.equivalent(&b)),
+        ) {
+            prop_assert_eq!(ab && ba, eq, "mutual subset disagrees with equivalent");
+        }
+    }
+
+    /// `a -> regex -> a` round-trips: the regular expression extracted from an
+    /// automaton compiles back to an equivalent automaton.
+    #[test]
+    fn automaton_to_regex_roundtrip(a in arb_nfa()) {
+        let r = a.to_regex();
+        if let Some(a2) = bounded(|| r.to_automaton())
+            && let Some(eq) = bounded(|| a.equivalent(&a2))
+        {
+            prop_assert!(eq, "automaton -> regex -> automaton changed the language: {}", r);
+        }
+    }
+
+    /// Regular expressions round-trip through an automaton and agree with the
+    /// reference `regex` crate on every probe string.
+    #[test]
+    fn regex_roundtrip_and_oracle(r in arb_regex()) {
+        let a = match bounded(|| r.to_automaton()) { Some(a) => a, None => return Ok(()) };
+
+        // regex -> automaton -> regex -> automaton preserves the language.
+        let r2 = a.to_regex();
+        if let Some(a2) = bounded(|| r2.to_automaton())
+            && let Some(eq) = bounded(|| a.equivalent(&a2))
+        {
+            prop_assert!(eq, "regex round-trip changed the language: {} -> {}", r, r2);
+        }
+
+        // Cross-check membership against the standard regex engine (anchored,
+        // dot-matches-newline). Patterns denoting the empty language ("[]") are
+        // rejected by the `regex` crate, so we only compare when it accepts the
+        // pattern.
+        let pattern = r.to_string();
+        if let Ok(re) = regex::Regex::new(&format!("(?s)^(?:{})$", pattern)) {
+            for s in probes() {
+                prop_assert_eq!(
+                    a.is_match(&s), re.is_match(&s),
+                    "pattern {:?} disagrees with reference engine on {:?}", pattern, s
+                );
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod inspect {
+    use super::*;
+    use proptest::strategy::{Strategy, ValueTree};
+    use proptest::test_runner::TestRunner;
+    use regex_charclass::CharacterClass;
+
+    fn samples<S: Strategy>(strat: S, n: usize) -> Vec<S::Value> {
+        let mut runner = TestRunner::deterministic();
+        (0..n)
+            .map(|_| strat.new_tree(&mut runner).unwrap().current())
+            .collect()
+    }
+
+    #[test]
+    fn show_regex() {
+        for (i, r) in samples(arb_regex(), 30).into_iter().enumerate() {
+            println!("regex[{i:02}] = {r}");
+        }
+    }
+
+    #[test]
+    fn show_dfa() {
+        for (i, a) in samples(arb_dfa(), 30).into_iter().enumerate() {
+            println!("dfa[{i:02}] det={} Graphviz={}", a.is_deterministic(), a);
+        }
+    }
+
+    #[test]
+    fn show_nfa() {
+        for (i, a) in samples(arb_nfa(), 30).into_iter().enumerate() {
+            println!("nfa[{i:02}] det={} Graphviz={}", a.is_deterministic(), a);
+        }
+    }
+
+    /// The language class of a generated entity, ordered from degenerate to
+    /// rich.
+    ///
+    /// `Empty`, `EmptyString` and `Total` are the corners of the language
+    /// lattice: useful as occasional edge cases (they hit the `is_empty` /
+    /// complement / difference fast paths) but they exercise nothing else.
+    /// `Finite` languages take the topological-sort path of the cardinality
+    /// and max-length analyses; `Infinite` ones take the cycle paths of state
+    /// elimination and repeat synthesis. A quality sample needs both in bulk.
+    #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+    enum LangClass {
+        Empty,
+        EmptyString,
+        Total,
+        Finite,
+        Infinite,
+    }
+
+    /// Reduces an automaton to its canonical minimal DFA.
+    fn minimal_dfa(a: &FastAutomaton) -> FastAutomaton {
+        let mut m = determinized(a).expect("generated automata always determinize in budget");
+        bounded(|| m.minimize()).expect("generated automata always minimize in budget");
+        m
+    }
+
+    /// Classifies the language of a **minimal DFA**.
+    ///
+    /// Unlike the trivial-vs-"interesting" split this used to be, the
+    /// non-trivial bulk is split into finite and infinite languages, which
+    /// exercise disjoint code paths (see [`LangClass`]).
+    fn classify(m: &FastAutomaton) -> LangClass {
+        if m.is_empty() {
+            LangClass::Empty
+        } else if m.is_empty_string() {
+            LangClass::EmptyString
+        } else if m.is_total() {
+            // exact on a DFA
+            LangClass::Total
+        } else if m.length().1.is_some() {
+            LangClass::Finite
+        } else {
+            LangClass::Infinite
+        }
+    }
+
+    /// Canonical fingerprint of a language: the minimal DFA, renumbered in
+    /// BFS order with the outgoing transitions of each state sorted by label.
+    /// Minimal DFAs are unique up to isomorphism, so two automata share a key
+    /// iff they accept the same language — this is what lets the stats count
+    /// *distinct* languages instead of distinct syntax trees.
+    fn language_key(m: &FastAutomaton) -> String {
+        use std::fmt::Write;
+        let ss = m.spanning_set();
+        let mut order = vec![m.start_state()];
+        let mut ids = std::collections::HashMap::new();
+        ids.insert(m.start_state(), 0usize);
+        let mut key = String::new();
+        let mut i = 0;
+        while i < order.len() {
+            let s = order[i];
+            i += 1;
+            // In a DFA the labels leaving a state are disjoint, hence unique,
+            // so sorting by label gives a deterministic traversal order.
+            let mut out: Vec<(String, usize)> = m
+                .transitions_from_vec(s)
+                .into_iter()
+                .map(|(c, t)| {
+                    (
+                        c.to_range(ss)
+                            .expect("condition always converts to a range")
+                            .to_regex(),
+                        t,
+                    )
+                })
+                .collect();
+            out.sort();
+            write!(key, "{}", if m.is_accepted(s) { 'A' } else { 'r' }).unwrap();
+            for (label, t) in out {
+                let id = match ids.get(&t) {
+                    Some(&id) => id,
+                    None => {
+                        let id = order.len();
+                        ids.insert(t, id);
+                        order.push(t);
+                        id
+                    }
+                };
+                write!(key, " {label}>{id}").unwrap();
+            }
+            key.push(';');
+        }
+        key
+    }
+
+    /// Everything we measure about one generated automaton.
+    struct Measure {
+        // Structure of the generated entity itself.
+        edges: usize,
+        multi_base_edges: usize,
+        deterministic: bool,
+        // Properties of its *language*, computed on the minimal DFA.
+        class: LangClass,
+        minimal_states: usize,
+        accepts_empty_string: bool,
+        key: String,
+    }
+
+    fn measure(a: &FastAutomaton) -> Measure {
+        let mut edges = 0;
+        let mut multi_base_edges = 0;
+        for s in a.states_vec() {
+            for (cond, _) in a.transitions_from_vec(s) {
+                edges += 1;
+                if cond.binary_representation().iter().filter(|&&b| b).count() > 1 {
+                    multi_base_edges += 1;
+                }
+            }
+        }
+        let m = minimal_dfa(a);
+        Measure {
+            edges,
+            multi_base_edges,
+            deterministic: a.is_deterministic(),
+            class: classify(&m),
+            minimal_states: m.number_of_states(),
+            accepts_empty_string: m.is_match(""),
+            key: language_key(&m),
+        }
+    }
+
+    /// Aggregated quality metrics over a sample; what the strategies are
+    /// evaluated (and asserted) on.
+    struct Quality {
+        /// Share of `Empty` + `EmptyString` + `Total` languages. Wanted as a
+        /// small minority: present (they are real edge cases) but not eating
+        /// the sample.
+        degenerate_pct: f64,
+        finite_pct: f64,
+        infinite_pct: f64,
+        /// Distinct languages (by canonical minimal DFA) over sample size.
+        /// Duplicates re-test the same language and are wasted cases.
+        distinct_pct: f64,
+        /// Average minimal-DFA size: the number of Myhill-Nerode classes is
+        /// what minimize / equivalence / state elimination actually scale
+        /// with, so this — not the raw state count — is language complexity.
+        avg_minimal_states: f64,
+        /// Share of languages needing a minimal DFA of ≥ 3 states.
+        rich_pct: f64,
+        accepts_empty_string_pct: f64,
+        /// Share of transitions whose condition spans more than one base of
+        /// the spanning set (exercises the bitvector paths beyond single
+        /// bits).
+        multi_base_edge_pct: f64,
+        /// Share of genuinely nondeterministic automata (only meaningful for
+        /// the NFA strategy: a "NFA" that is already deterministic never
+        /// exercises subset construction).
+        nondeterministic_pct: f64,
+    }
+
+    fn quality(name: &str, measures: &[Measure]) -> Quality {
+        let n = measures.len() as f64;
+        let count = |f: &dyn Fn(&Measure) -> bool| {
+            100.0 * measures.iter().filter(|m| f(m)).count() as f64 / n
+        };
+
+        let distinct: std::collections::HashSet<&str> =
+            measures.iter().map(|m| m.key.as_str()).collect();
+        let edges: usize = measures.iter().map(|m| m.edges).sum();
+        let multi: usize = measures.iter().map(|m| m.multi_base_edges).sum();
+
+        let mut histogram = std::collections::BTreeMap::new();
+        for m in measures {
+            *histogram.entry(m.minimal_states).or_insert(0usize) += 1;
+        }
+
+        let q = Quality {
+            degenerate_pct: count(&|m| {
+                matches!(
+                    m.class,
+                    LangClass::Empty | LangClass::EmptyString | LangClass::Total
+                )
+            }),
+            finite_pct: count(&|m| m.class == LangClass::Finite),
+            infinite_pct: count(&|m| m.class == LangClass::Infinite),
+            distinct_pct: 100.0 * distinct.len() as f64 / n,
+            avg_minimal_states: measures.iter().map(|m| m.minimal_states).sum::<usize>() as f64 / n,
+            rich_pct: count(&|m| m.minimal_states >= 3),
+            accepts_empty_string_pct: count(&|m| m.accepts_empty_string),
+            multi_base_edge_pct: 100.0 * multi as f64 / edges.max(1) as f64,
+            nondeterministic_pct: count(&|m| !m.deterministic),
+        };
+
+        println!(
+            "{name}: degenerate {:>4.1}% (∅ {:.1}% | {{\"\"}} {:.1}% | Σ* {:.1}%) | finite {:>4.1}% | infinite {:>4.1}%",
+            q.degenerate_pct,
+            count(&|m| m.class == LangClass::Empty),
+            count(&|m| m.class == LangClass::EmptyString),
+            count(&|m| m.class == LangClass::Total),
+            q.finite_pct,
+            q.infinite_pct,
+        );
+        println!(
+            "{name}: distinct languages {:>4.1}% | accepts \"\" {:>4.1}% | nondet {:>4.1}% | multi-base edges {:>4.1}%",
+            q.distinct_pct,
+            q.accepts_empty_string_pct,
+            q.nondeterministic_pct,
+            q.multi_base_edge_pct,
+        );
+        println!(
+            "{name}: minimal-DFA states avg {:.2}, ≥3 {:>4.1}%, histogram {:?}",
+            q.avg_minimal_states, q.rich_pct, histogram,
+        );
+        q
+    }
+
+    /// Operator coverage of a generated regular expression; the round-trip
+    /// (state elimination) and simplification code paths are keyed on these
+    /// shapes.
+    #[derive(Default)]
+    struct RegexFacets {
+        unbounded_repetition: bool,
+        bounded_repetition: bool,
+        nested_repetition: bool,
+        alternation: bool,
+        multi_char_class: bool,
+    }
+
+    fn regex_facets(r: &RegularExpression, inside_repetition: bool, f: &mut RegexFacets) {
+        match r {
+            RegularExpression::Character(range) => {
+                let letters = (0..ALPHABET.len())
+                    .filter(|&i| !range.intersection(&letter(i)).is_empty())
+                    .count();
+                if letters > 1 || range.is_total() {
+                    f.multi_char_class = true;
+                }
+            }
+            RegularExpression::Repetition(inner, _, max) => {
+                if max.is_some() {
+                    f.bounded_repetition = true;
+                } else {
+                    f.unbounded_repetition = true;
+                }
+                if inside_repetition {
+                    f.nested_repetition = true;
+                }
+                regex_facets(inner, true, f);
+            }
+            RegularExpression::Concat(parts) => {
+                for p in parts {
+                    regex_facets(p, inside_repetition, f);
+                }
+            }
+            RegularExpression::Alternation(parts) => {
+                f.alternation = true;
+                for p in parts {
+                    regex_facets(p, inside_repetition, f);
+                }
+            }
+        }
+    }
+
+    /// Quantitative quality summary over a larger sample, with floors the
+    /// strategies must keep. The sample runner is deterministic, so the
+    /// numbers — and therefore the assertions — are reproducible.
+    #[test]
+    fn stats() {
+        const N: usize = 300;
+
+        let regexes = samples(arb_regex(), N);
+        let regex_measures: Vec<Measure> = regexes
+            .iter()
+            .map(|r| measure(&r.to_automaton().expect("small regexes always convert")))
+            .collect();
+        let regex_q = quality("regex", &regex_measures);
+
+        let avg_len = regexes.iter().map(|r| r.to_string().len()).sum::<usize>() as f64 / N as f64;
+        let mut facet_counts = [0usize; 5];
+        for r in &regexes {
+            let mut f = RegexFacets::default();
+            regex_facets(r, false, &mut f);
+            for (i, hit) in [
+                f.unbounded_repetition,
+                f.bounded_repetition,
+                f.nested_repetition,
+                f.alternation,
+                f.multi_char_class,
+            ]
+            .into_iter()
+            .enumerate()
+            {
+                facet_counts[i] += hit as usize;
+            }
+        }
+        let fpct = |i: usize| 100.0 * facet_counts[i] as f64 / N as f64;
+        println!(
+            "regex: avg pattern length {avg_len:.1} | unbounded-rep {:.1}% | bounded-rep {:.1}% | nested-rep {:.1}% | alternation {:.1}% | multi-char class {:.1}%",
+            fpct(0),
+            fpct(1),
+            fpct(2),
+            fpct(3),
+            fpct(4),
+        );
+
+        let dfa_q = quality(
+            "dfa ",
+            &samples(arb_dfa(), N)
+                .iter()
+                .map(measure)
+                .collect::<Vec<_>>(),
+        );
+        let nfa_q = quality(
+            "nfa ",
+            &samples(arb_nfa(), N)
+                .iter()
+                .map(measure)
+                .collect::<Vec<_>>(),
+        );
+
+        for (name, q) in [("regex", &regex_q), ("dfa", &dfa_q), ("nfa", &nfa_q)] {
+            // The trivial corner languages should be present but a minority.
+            assert!(
+                q.degenerate_pct < 25.0,
+                "{name}: too many degenerate languages"
+            );
+            // Both bulk classes must be well represented.
+            assert!(
+                q.finite_pct >= 15.0,
+                "{name}: finite languages under-represented"
+            );
+            assert!(
+                q.infinite_pct >= 15.0,
+                "{name}: infinite languages under-represented"
+            );
+            // The sample must not keep re-testing the same languages.
+            assert!(
+                q.distinct_pct >= 45.0,
+                "{name}: not enough distinct languages"
+            );
+            // Language complexity: minimal DFAs must not collapse to 1-2 states.
+            assert!(q.rich_pct >= 35.0, "{name}: minimal DFAs too small");
+            // Both "" ∈ L and "" ∉ L need bulk representation.
+            assert!(
+                (20.0..=80.0).contains(&q.accepts_empty_string_pct),
+                "{name}: empty-string acceptance unbalanced"
+            );
+            // Conditions spanning several bases must show up regularly.
+            assert!(
+                q.multi_base_edge_pct >= 10.0,
+                "{name}: multi-base conditions too rare"
+            );
+        }
+        // An NFA strategy that mostly produces DFAs never exercises subset
+        // construction.
+        assert!(
+            nfa_q.nondeterministic_pct >= 50.0,
+            "nfa: mostly deterministic"
+        );
+    }
+}
diff --git a/tests/readme_examples.rs b/tests/readme_examples.rs
new file mode 100644
index 0000000..2fc5807
--- /dev/null
+++ b/tests/readme_examples.rs
@@ -0,0 +1,80 @@
+//! Keeps the README's examples honest: these tests are the README snippets,
+//! verbatim. If one fails, update the README.
+
+use regexsolver::Term;
+use regexsolver::error::EngineError;
+
+#[test]
+fn readme_automaton_building_example() -> Result<(), EngineError> {
+    use regex_charclass::char::Char;
+    use regexsolver::CharRange;
+    use regexsolver::fast_automaton::FastAutomaton;
+
+    // Build an automaton matching "[a-c][0-9]*" by hand:
+    let mut automaton = FastAutomaton::new_empty();
+    let s1 = automaton.new_state();
+    automaton.accept(s1);
+
+    let a_to_c = CharRange::new_from_range(Char::new('a')..=Char::new('c'));
+    let digits = CharRange::new_from_range(Char::new('0')..=Char::new('9'));
+    automaton.add_transition_from_range(0, s1, &a_to_c)?;
+    automaton.add_transition_from_range(s1, s1, &digits)?;
+
+    assert!(automaton.is_match("b42"));
+    assert!(!automaton.is_match("4b"));
+    assert_eq!(automaton.to_regex().to_string(), "[a-c][0-9]*");
+
+    Ok(())
+}
+
+#[test]
+fn readme_hero_example() -> Result<(), EngineError> {
+    let a = Term::from_pattern("(ab|xy){2}")?;
+    let b = Term::from_pattern(".*xy")?;
+
+    // Which strings match BOTH patterns? Get the answer as a regex:
+    let both = a.intersection([&b])?;
+    assert_eq!(both.to_pattern(), "(ab|xy)xy");
+
+    // Test a concrete string against the result (matching is anchored):
+    assert!(both.matches("abxy")?);
+
+    // ...and sample them:
+    assert_eq!(both.generate_strings(2, 0)?, ["xyxy", "abxy"]);
+
+    Ok(())
+}
+
+#[test]
+fn readme_regular_expression_example() -> Result<(), EngineError> {
+    use regexsolver::cardinality::Cardinality;
+    use regexsolver::regex::RegularExpression;
+
+    // A validation pattern for an order id, e.g. "ORD-2024-12345".
+    let pattern = RegularExpression::new("ORD-20[0-9]{2}-[0-9]{4,6}")?;
+
+    // How long can matching ids get? Size your database column accordingly.
+    assert_eq!(pattern.length(), (Some(13), Some(15)));
+
+    // How many distinct ids does the pattern allow?
+    assert_eq!(pattern.cardinality(), Cardinality::Integer(111_000_000));
+
+    // The AST is a plain enum: walk it to lint patterns, e.g. reject
+    // validation rules that accept unboundedly long input.
+    fn has_unbounded_repetition(regex: &RegularExpression) -> bool {
+        match regex {
+            RegularExpression::Character(_) => false,
+            RegularExpression::Repetition(inner, _, max) => {
+                max.is_none() || has_unbounded_repetition(inner)
+            }
+            RegularExpression::Concat(parts) => parts.iter().any(has_unbounded_repetition),
+            RegularExpression::Alternation(parts) => parts.iter().any(has_unbounded_repetition),
+        }
+    }
+    assert!(!has_unbounded_repetition(&pattern));
+    assert!(has_unbounded_repetition(&RegularExpression::new(
+        ".*@example\\.com"
+    )?));
+
+    Ok(())
+}
diff --git a/tests/state_elimination_quality.rs b/tests/state_elimination_quality.rs
new file mode 100644
index 0000000..789204c
--- /dev/null
+++ b/tests/state_elimination_quality.rs
@@ -0,0 +1,69 @@
+//! Measures the quality of automaton→regex conversion (state elimination)
+//! over the shared corpus. Not a pass/fail test of absolute numbers — it
+//! prints aggregate metrics so a heuristic change can be compared before/after
+//! (`cargo test --test state_elimination_quality -- --ignored --nocapture`),
+//! while still asserting that every conversion round-trips (correctness).
+
+use std::{
+    fs::File,
+    io::{BufRead, BufReader},
+};
+
+use regexsolver::regex::RegularExpression;
+
+#[test]
+#[ignore = "measurement harness; run explicitly with --ignored --nocapture"]
+fn measure_state_elimination_quality() {
+    let file = File::open("tests/data/regex.txt").unwrap();
+    let reader = BufReader::new(file);
+
+    let mut count = 0usize;
+    let mut total_complexity_nfa = 0.0f64;
+    let mut total_len_nfa = 0usize;
+    let mut total_complexity_dfa = 0.0f64;
+    let mut total_len_dfa = 0usize;
+
+    for line in reader.lines() {
+        let line = line.unwrap();
+        if line.trim().is_empty() {
+            continue;
+        }
+        let input = match RegularExpression::parse(&line, true) {
+            Ok(r) => r,
+            Err(_) => continue,
+        };
+        let automaton = input.to_automaton().unwrap();
+
+        // NFA-derived conversion.
+        let out_nfa = automaton.to_regex();
+        assert!(
+            automaton
+                .equivalent(&out_nfa.to_automaton().unwrap())
+                .unwrap(),
+            "NFA round-trip mismatch for {line:?} -> {out_nfa}"
+        );
+        total_complexity_nfa += out_nfa.evaluate_complexity();
+        total_len_nfa += out_nfa.to_string().chars().count();
+
+        // DFA-derived conversion.
+        let dfa = automaton.determinize().unwrap();
+        let out_dfa = dfa.to_regex();
+        assert!(
+            dfa.equivalent(&out_dfa.to_automaton().unwrap()).unwrap(),
+            "DFA round-trip mismatch for {line:?} -> {out_dfa}"
+        );
+        total_complexity_dfa += out_dfa.evaluate_complexity();
+        total_len_dfa += out_dfa.to_string().chars().count();
+
+        count += 1;
+    }
+
+    println!("=== state elimination quality over {count} patterns ===");
+    println!("NFA: total_complexity = {total_complexity_nfa:.3}, total_len = {total_len_nfa}");
+    println!("DFA: total_complexity = {total_complexity_dfa:.3}, total_len = {total_len_dfa}");
+    println!(
+        "SUM: total_complexity = {:.3}, total_len = {}",
+        total_complexity_nfa + total_complexity_dfa,
+        total_len_nfa + total_len_dfa
+    );
+}