matchers/
lib.rs

1//! Regex matchers on character and byte streams.
2//!
3//! ## Overview
4//!
5//! The [`regex`] crate implements regular expression matching on strings and byte
6//! arrays. However, in order to match the output of implementations of `fmt::Debug`
7//! and `fmt::Display`, or by any code which writes to an instance of `fmt::Write`
8//! or `io::Write`, it is necessary to first allocate a buffer, write to that
9//! buffer, and then match the buffer against a regex.
10//!
11//! In cases where it is not necessary to extract substrings, but only to test whether
12//! or not output matches a regex, it is not strictly necessary to allocate and
13//! write this output to a buffer. This crate provides a simple interface on top of
14//! the lower-level [`regex-automata`] library that implements `fmt::Write` and
15//! `io::Write` for regex patterns. This may be used to test whether streaming
16//! output matches a pattern without buffering that output.
17//!
18//! Users who need to extract substrings based on a pattern or who already have
19//! buffered data should probably use the [`regex`] crate instead.
20//!
21//! ## Syntax
22//!
23//! This crate uses the same [regex syntax][syntax] of the `regex-automata` crate.
24//!
25//! [`regex`]: https://crates.io/crates/regex
26//! [`regex-automata`]: https://crates.io/crates/regex-automata
27//! [syntax]: https://docs.rs/regex-automata/0.1.7/regex_automata/#syntax
28
29use regex_automata::{dense, DenseDFA, SparseDFA, StateID, DFA};
30use std::{fmt, io, marker::PhantomData, str::FromStr};
31
32pub use regex_automata::Error;
33
34/// A compiled match pattern that can match multipe inputs, or return a
35/// [`Matcher`] that matches a single input.
36///
37/// [`Matcher`]: ../struct.Matcher.html
38#[derive(Debug, Clone)]
39pub struct Pattern<S = usize, A = DenseDFA<Vec<S>, S>>
40where
41    S: StateID,
42    A: DFA<ID = S>,
43{
44    automaton: A,
45}
46
47/// A reference to a [`Pattern`] that matches a single input.
48///
49/// [`Pattern`]: ../struct.Pattern.html
50#[derive(Debug, Clone)]
51pub struct Matcher<'a, S = usize, A = DenseDFA<&'a [S], S>>
52where
53    S: StateID,
54    A: DFA<ID = S>,
55{
56    automaton: A,
57    state: S,
58    _lt: PhantomData<&'a ()>,
59}
60
61// === impl Pattern ===
62
63impl Pattern {
64    /// Returns a new `Pattern` for the given regex, or an error if the regex
65    /// was invalid.
66    ///
67    /// The returned `Pattern` will match occurances of the pattern which start
68    /// at *any* in a byte or character stream — the pattern may be preceded by
69    /// any number of non-matching characters. Essentially, it will behave as
70    /// though the regular expression started with a `.*?`, which enables a
71    /// match to appear anywhere. If this is not the desired behavior, use
72    /// [`Pattern::new_anchored`] instead.
73    ///
74    /// For example:
75    /// ```
76    /// use matchers::Pattern;
77    ///
78    /// // This pattern matches any number of `a`s followed by a `b`.
79    /// let pattern = Pattern::new("a+b").expect("regex is not invalid");
80    ///
81    /// // Of course, the pattern matches an input where the entire sequence of
82    /// // characters matches the pattern:
83    /// assert!(pattern.display_matches(&"aaaaab"));
84    ///
85    /// // And, since the pattern is unanchored, it will also match the
86    /// // sequence when it's followed by non-matching characters:
87    /// assert!(pattern.display_matches(&"hello world! aaaaab"));
88    /// ```
89    pub fn new(pattern: &str) -> Result<Self, Error> {
90        let automaton = DenseDFA::new(pattern)?;
91        Ok(Pattern { automaton })
92    }
93
94    /// Returns a new `Pattern` anchored at the beginning of the input stream,
95    /// or an error if the regex was invalid.
96    ///
97    /// The returned `Pattern` will *only* match an occurence of the pattern in
98    /// an input sequence if the first character or byte in the input matches
99    /// the pattern. If this is not the desired behavior, use [`Pattern::new`]
100    /// instead.
101    ///
102    /// For example:
103    /// ```
104    /// use matchers::Pattern;
105    ///
106    /// // This pattern matches any number of `a`s followed by a `b`.
107    /// let pattern = Pattern::new_anchored("a+b")
108    ///     .expect("regex is not invalid");
109    ///
110    /// // The pattern matches an input where the entire sequence of
111    /// // characters matches the pattern:
112    /// assert!(pattern.display_matches(&"aaaaab"));
113    ///
114    /// // Since the pattern is anchored, it will *not* match an input that
115    /// // begins with non-matching characters:
116    /// assert!(!pattern.display_matches(&"hello world! aaaaab"));
117    ///
118    /// // ...however, if we create a pattern beginning with `.*?`, it will:
119    /// let pattern2 = Pattern::new_anchored(".*?a+b")
120    ///     .expect("regex is not invalid");
121    /// assert!(pattern2.display_matches(&"hello world! aaaaab"));
122    /// ```
123    pub fn new_anchored(pattern: &str) -> Result<Self, Error> {
124        let automaton = dense::Builder::new().anchored(true).build(pattern)?;
125        Ok(Pattern { automaton })
126    }
127}
128
129impl FromStr for Pattern {
130    type Err = Error;
131    fn from_str(s: &str) -> Result<Self, Self::Err> {
132        Self::new(s)
133    }
134}
135
136impl<S, A> Pattern<S, A>
137where
138    S: StateID,
139    A: DFA<ID = S>,
140    Self: for<'a> ToMatcher<'a, S>,
141{
142    /// Returns `true` if this pattern matches the given string.
143    #[inline]
144    pub fn matches(&self, s: &impl AsRef<str>) -> bool {
145        self.matcher().matches(s)
146    }
147
148    /// Returns `true` if this pattern matches the formatted output of the given
149    /// type implementing `fmt::Debug`.
150    ///
151    /// For example:
152    /// ```rust
153    /// use matchers::Pattern;
154    ///
155    /// #[derive(Debug)]
156    /// pub struct Hello {
157    ///     to: &'static str,
158    /// }
159    ///
160    /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap();
161    ///
162    /// let hello_world = Hello { to: "World" };
163    /// assert!(pattern.debug_matches(&hello_world));
164    ///
165    /// let hello_sf = Hello { to: "San Francisco" };
166    /// assert_eq!(pattern.debug_matches(&hello_sf), false);
167    ///
168    /// let hello_washington = Hello { to: "Washington" };
169    /// assert!(pattern.debug_matches(&hello_washington));
170    /// ```
171    #[inline]
172    pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool {
173        self.matcher().debug_matches(d)
174    }
175
176    /// Returns `true` if this pattern matches the formatted output of the given
177    /// type implementing `fmt::Display`.
178    ///
179    /// For example:
180    /// ```rust
181    /// # use std::fmt;
182    /// use matchers::Pattern;
183    ///
184    /// #[derive(Debug)]
185    /// pub struct Hello {
186    ///     to: &'static str,
187    /// }
188    ///
189    /// impl fmt::Display for Hello {
190    ///     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
191    ///         write!(f, "Hello {}", self.to)
192    ///     }
193    /// }
194    ///
195    /// let pattern = Pattern::new("Hello [Ww].+").unwrap();
196    ///
197    /// let hello_world = Hello { to: "world" };
198    /// assert!(pattern.display_matches(&hello_world));
199    /// assert_eq!(pattern.debug_matches(&hello_world), false);
200    ///
201    /// let hello_sf = Hello { to: "San Francisco" };
202    /// assert_eq!(pattern.display_matches(&hello_sf), false);
203    ///
204    /// let hello_washington = Hello { to: "Washington" };
205    /// assert!(pattern.display_matches(&hello_washington));
206    /// ```
207    #[inline]
208    pub fn display_matches(&self, d: &impl fmt::Display) -> bool {
209        self.matcher().display_matches(d)
210    }
211
212    /// Returns either a `bool` indicating whether or not this pattern matches the
213    /// data read from the provided `io::Read` stream, or an `io::Error` if an
214    /// error occurred reading from the stream.
215    #[inline]
216    pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> {
217        self.matcher().read_matches(io)
218    }
219}
220
221// === impl Matcher ===
222
223impl<'a, S, A> Matcher<'a, S, A>
224where
225    S: StateID,
226    A: DFA<ID = S>,
227{
228    fn new(automaton: A) -> Self {
229        let state = automaton.start_state();
230        Self {
231            automaton,
232            state,
233            _lt: PhantomData,
234        }
235    }
236
237    #[inline]
238    fn advance(&mut self, input: u8) {
239        self.state = unsafe {
240            // It's safe to call `next_state_unchecked` since the matcher may
241            // only be constructed by a `Pattern`, which, in turn,can only be
242            // constructed with a valid DFA.
243            self.automaton.next_state_unchecked(self.state, input)
244        };
245    }
246
247    /// Returns `true` if this `Matcher` has matched any input that has been
248    /// provided.
249    #[inline]
250    pub fn is_matched(&self) -> bool {
251        self.automaton.is_match_state(self.state)
252    }
253
254    /// Returns `true` if this pattern matches the formatted output of the given
255    /// type implementing `fmt::Debug`.
256    pub fn matches(mut self, s: &impl AsRef<str>) -> bool {
257        for &byte in s.as_ref().as_bytes() {
258            self.advance(byte);
259            if self.automaton.is_dead_state(self.state) {
260                return false;
261            }
262        }
263        self.is_matched()
264    }
265
266    /// Returns `true` if this pattern matches the formatted output of the given
267    /// type implementing `fmt::Debug`.
268    pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool {
269        use std::fmt::Write;
270        write!(&mut self, "{:?}", d).expect("matcher write impl should not fail");
271        self.is_matched()
272    }
273
274    /// Returns `true` if this pattern matches the formatted output of the given
275    /// type implementing `fmt::Display`.
276    pub fn display_matches(mut self, d: &impl fmt::Display) -> bool {
277        use std::fmt::Write;
278        write!(&mut self, "{}", d).expect("matcher write impl should not fail");
279        self.is_matched()
280    }
281
282    /// Returns either a `bool` indicating whether or not this pattern matches the
283    /// data read from the provided `io::Read` stream, or an `io::Error` if an
284    /// error occurred reading from the stream.
285    pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> {
286        for r in io.bytes() {
287            self.advance(r?);
288            if self.automaton.is_dead_state(self.state) {
289                return Ok(false);
290            }
291        }
292        Ok(self.is_matched())
293    }
294}
295
296impl<'a, S, A> fmt::Write for Matcher<'a, S, A>
297where
298    S: StateID,
299    A: DFA<ID = S>,
300{
301    fn write_str(&mut self, s: &str) -> fmt::Result {
302        for &byte in s.as_bytes() {
303            self.advance(byte);
304            if self.automaton.is_dead_state(self.state) {
305                break;
306            }
307        }
308        Ok(())
309    }
310}
311
312impl<'a, S, A> io::Write for Matcher<'a, S, A>
313where
314    S: StateID,
315    A: DFA<ID = S>,
316{
317    fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> {
318        let mut i = 0;
319        for &byte in bytes {
320            self.advance(byte);
321            i += 1;
322            if self.automaton.is_dead_state(self.state) {
323                break;
324            }
325        }
326        Ok(i)
327    }
328
329    fn flush(&mut self) -> Result<(), io::Error> {
330        Ok(())
331    }
332}
333
334pub trait ToMatcher<'a, S>
335where
336    Self: crate::sealed::Sealed,
337    S: StateID + 'a,
338{
339    type Automaton: DFA<ID = S>;
340    fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton>;
341}
342
343impl<S> crate::sealed::Sealed for Pattern<S, DenseDFA<Vec<S>, S>> where S: StateID {}
344
345impl<'a, S> ToMatcher<'a, S> for Pattern<S, DenseDFA<Vec<S>, S>>
346where
347    S: StateID + 'a,
348{
349    type Automaton = DenseDFA<&'a [S], S>;
350    fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
351        Matcher::new(self.automaton.as_ref())
352    }
353}
354
355impl<'a, S> ToMatcher<'a, S> for Pattern<S, SparseDFA<Vec<u8>, S>>
356where
357    S: StateID + 'a,
358{
359    type Automaton = SparseDFA<&'a [u8], S>;
360    fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
361        Matcher::new(self.automaton.as_ref())
362    }
363}
364
365impl<S> crate::sealed::Sealed for Pattern<S, SparseDFA<Vec<u8>, S>> where S: StateID {}
366
367mod sealed {
368    pub trait Sealed {}
369}
370
371#[cfg(test)]
372mod test {
373    use super::*;
374
375    struct Str<'a>(&'a str);
376    struct ReadStr<'a>(io::Cursor<&'a [u8]>);
377
378    impl<'a> fmt::Debug for Str<'a> {
379        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380            write!(f, "{}", self.0)
381        }
382    }
383
384    impl<'a> fmt::Display for Str<'a> {
385        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
386            write!(f, "{}", self.0)
387        }
388    }
389
390    impl<'a> io::Read for ReadStr<'a> {
391        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
392            self.0.read(buf)
393        }
394    }
395
396    impl Str<'static> {
397        fn hello_world() -> Self {
398            Self::new("hello world")
399        }
400    }
401
402    impl<'a> Str<'a> {
403        fn new(s: &'a str) -> Self {
404            Str(s)
405        }
406
407        fn to_reader(self) -> ReadStr<'a> {
408            ReadStr(io::Cursor::new(self.0.as_bytes()))
409        }
410    }
411
412    fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
413        let pat = new_pattern("hello world").unwrap();
414        assert!(pat.debug_matches(&Str::hello_world()));
415
416        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
417        assert!(pat.debug_matches(&Str::hello_world()));
418
419        let pat = new_pattern("goodbye world").unwrap();
420        assert_eq!(pat.debug_matches(&Str::hello_world()), false);
421    }
422
423    fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
424        let pat = new_pattern("hello world").unwrap();
425        assert!(pat.display_matches(&Str::hello_world()));
426
427        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
428        assert!(pat.display_matches(&Str::hello_world()));
429
430        let pat = new_pattern("goodbye world").unwrap();
431        assert_eq!(pat.display_matches(&Str::hello_world()), false);
432    }
433
434    fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
435        let pat = new_pattern("hello world").unwrap();
436        assert!(pat
437            .read_matches(Str::hello_world().to_reader())
438            .expect("no io error should occur"));
439
440        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
441        assert!(pat
442            .read_matches(Str::hello_world().to_reader())
443            .expect("no io error should occur"));
444
445        let pat = new_pattern("goodbye world").unwrap();
446        assert_eq!(
447            pat.read_matches(Str::hello_world().to_reader())
448                .expect("no io error should occur"),
449            false
450        );
451    }
452
453    fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
454        let pat = new_pattern("a+b").unwrap();
455        assert!(pat.debug_matches(&Str::new("ab")));
456        assert!(pat.debug_matches(&Str::new("aaaab")));
457        assert!(pat.debug_matches(&Str::new("aaaaaaaaaab")));
458        assert_eq!(pat.debug_matches(&Str::new("b")), false);
459        assert_eq!(pat.debug_matches(&Str::new("abb")), false);
460        assert_eq!(pat.debug_matches(&Str::new("aaaaabb")), false);
461    }
462
463    mod anchored {
464        use super::*;
465        #[test]
466        fn debug_matches() {
467            test_debug_matches(Pattern::new_anchored)
468        }
469
470        #[test]
471        fn display_matches() {
472            test_display_matches(Pattern::new_anchored)
473        }
474
475        #[test]
476        fn reader_matches() {
477            test_reader_matches(Pattern::new_anchored)
478        }
479
480        #[test]
481        fn debug_rep_patterns() {
482            test_debug_rep_patterns(Pattern::new_anchored)
483        }
484
485        // === anchored behavior =============================================
486        // Tests that anchored patterns match each input type only beginning at
487        // the first character.
488        fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) {
489            let pat = Pattern::new_anchored("a+b").unwrap();
490            assert!(f(&pat, Str::new("ab")));
491            assert!(f(&pat, Str::new("aaaab")));
492            assert!(f(&pat, Str::new("aaaaaaaaaab")));
493            assert!(!f(&pat, Str::new("bab")));
494            assert!(!f(&pat, Str::new("ffab")));
495            assert!(!f(&pat, Str::new("qqqqqqqaaaaab")));
496        }
497
498        #[test]
499        fn debug_is_anchored() {
500            test_is_anchored(|pat, input| pat.debug_matches(&input))
501        }
502
503        #[test]
504        fn display_is_anchored() {
505            test_is_anchored(|pat, input| pat.display_matches(&input));
506        }
507
508        #[test]
509        fn reader_is_anchored() {
510            test_is_anchored(|pat, input| {
511                pat.read_matches(input.to_reader())
512                    .expect("no io error occurs")
513            });
514        }
515
516        // === explicitly unanchored =========================================
517        // Tests that if an "anchored" pattern begins with `.*?`, it matches as
518        // though it was unanchored.
519        fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
520            let pat = Pattern::new_anchored(".*?a+b").unwrap();
521            assert!(f(&pat, Str::new("ab")));
522            assert!(f(&pat, Str::new("aaaab")));
523            assert!(f(&pat, Str::new("aaaaaaaaaab")));
524            assert!(f(&pat, Str::new("bab")));
525            assert!(f(&pat, Str::new("ffab")));
526            assert!(f(&pat, Str::new("qqqqqqqaaaaab")));
527        }
528
529        #[test]
530        fn debug_explicitly_unanchored() {
531            test_explicitly_unanchored(|pat, input| pat.debug_matches(&input))
532        }
533
534        #[test]
535        fn display_explicitly_unanchored() {
536            test_explicitly_unanchored(|pat, input| pat.display_matches(&input));
537        }
538
539        #[test]
540        fn reader_explicitly_unanchored() {
541            test_explicitly_unanchored(|pat, input| {
542                pat.read_matches(input.to_reader())
543                    .expect("no io error occurs")
544            });
545        }
546    }
547
548    mod unanchored {
549        use super::*;
550        #[test]
551        fn debug_matches() {
552            test_debug_matches(Pattern::new)
553        }
554
555        #[test]
556        fn display_matches() {
557            test_display_matches(Pattern::new)
558        }
559
560        #[test]
561        fn reader_matches() {
562            test_reader_matches(Pattern::new)
563        }
564
565        #[test]
566        fn debug_rep_patterns() {
567            test_debug_rep_patterns(Pattern::new)
568        }
569
570        // === anchored behavior =============================================
571        // Tests that unanchored patterns match anywhere in the input stream.
572        fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
573            let pat = Pattern::new("a+b").unwrap();
574            assert!(f(&pat, Str::new("ab")));
575            assert!(f(&pat, Str::new("aaaab")));
576            assert!(f(&pat, Str::new("aaaaaaaaaab")));
577            assert!(f(&pat, Str::new("bab")));
578            assert!(f(&pat, Str::new("ffab")));
579            assert!(f(&pat, Str::new("qqqfqqqqaaaaab")));
580        }
581
582        #[test]
583        fn debug_is_unanchored() {
584            test_is_unanchored(|pat, input| pat.debug_matches(&input))
585        }
586
587        #[test]
588        fn display_is_unanchored() {
589            test_is_unanchored(|pat, input| pat.display_matches(&input));
590        }
591
592        #[test]
593        fn reader_is_unanchored() {
594            test_is_unanchored(|pat, input| {
595                pat.read_matches(input.to_reader())
596                    .expect("no io error occurs")
597            });
598        }
599    }
600}