regex/regex/
bytes.rs

1use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec};
2
3use regex_automata::{meta, util::captures, Input, PatternID};
4
5use crate::{bytes::RegexBuilder, error::Error};
6
/// A compiled regular expression for searching haystacks of arbitrary bytes.
8///
9/// A `Regex` can be used to search haystacks, split haystacks into substrings
10/// or replace substrings in a haystack with a different substring. All
11/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
/// a pattern. To force an expression to match the whole string (or a prefix
13/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`).
14///
15/// Like the `Regex` type in the parent module, matches with this regex return
16/// byte offsets into the haystack. **Unlike** the parent `Regex` type, these
17/// byte offsets may not correspond to UTF-8 sequence boundaries since the
18/// regexes in this module can match arbitrary bytes.
19///
20/// The only methods that allocate new byte strings are the string replacement
21/// methods. All other methods (searching and splitting) return borrowed
22/// references into the haystack given.
23///
24/// # Example
25///
26/// Find the offsets of a US phone number:
27///
28/// ```
29/// use regex::bytes::Regex;
30///
31/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
32/// let m = re.find(b"phone: 111-222-3333").unwrap();
33/// assert_eq!(7..19, m.range());
34/// ```
35///
36/// # Example: extracting capture groups
37///
38/// A common way to use regexes is with capture groups. That is, instead of
39/// just looking for matches of an entire regex, parentheses are used to create
40/// groups that represent part of the match.
41///
42/// For example, consider a haystack with multiple lines, and each line has
43/// three whitespace delimited fields where the second field is expected to be
44/// a number and the third field a boolean. To make this convenient, we use
45/// the [`Captures::extract`] API to put the strings that match each group
46/// into a fixed size array:
47///
48/// ```
49/// use regex::bytes::Regex;
50///
51/// let hay = b"
52/// rabbit         54 true
53/// groundhog 2 true
54/// does not match
55/// fox   109    false
56/// ";
57/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap();
58/// let mut fields: Vec<(&[u8], i64, bool)> = vec![];
59/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) {
60///     // These unwraps are OK because our pattern is written in a way where
61///     // all matches for f2 and f3 will be valid UTF-8.
62///     let f2 = std::str::from_utf8(f2).unwrap();
63///     let f3 = std::str::from_utf8(f3).unwrap();
64///     fields.push((f1, f2.parse()?, f3.parse()?));
65/// }
66/// assert_eq!(fields, vec![
67///     (&b"rabbit"[..], 54, true),
68///     (&b"groundhog"[..], 2, true),
69///     (&b"fox"[..], 109, false),
70/// ]);
71///
72/// # Ok::<(), Box<dyn std::error::Error>>(())
73/// ```
74///
75/// # Example: matching invalid UTF-8
76///
77/// One of the reasons for searching `&[u8]` haystacks is that the `&[u8]`
78/// might not be valid UTF-8. Indeed, with a `bytes::Regex`, patterns that
79/// match invalid UTF-8 are explicitly allowed. Here's one example that looks
80/// for valid UTF-8 fields that might be separated by invalid UTF-8. In this
81/// case, we use `(?s-u:.)`, which matches any byte. Attempting to use it in a
82/// top-level `Regex` will result in the regex failing to compile. Notice also
83/// that we use `.` with Unicode mode enabled, in which case, only valid UTF-8
84/// is matched. In this way, we can build one pattern where some parts only
85/// match valid UTF-8 while other parts are more permissive.
86///
87/// ```
88/// use regex::bytes::Regex;
89///
90/// // F0 9F 92 A9 is the UTF-8 encoding for a Pile of Poo.
91/// let hay = b"\xFF\xFFfoo\xFF\xFF\xFF\xF0\x9F\x92\xA9\xFF";
92/// // An equivalent to '(?s-u:.)' is '(?-u:[\x00-\xFF])'.
93/// let re = Regex::new(r"(?s)(?-u:.)*?(?<f1>.+)(?-u:.)*?(?<f2>.+)").unwrap();
94/// let caps = re.captures(hay).unwrap();
95/// assert_eq!(&caps["f1"], &b"foo"[..]);
96/// assert_eq!(&caps["f2"], "💩".as_bytes());
97/// ```
#[derive(Clone)]
pub struct Regex {
    // The underlying `regex_automata` meta regex. The iterator-producing
    // methods on this type (`find_iter`, `captures_iter`, `split`, `splitn`)
    // call straight into it.
    pub(crate) meta: meta::Regex,
    // Shared pattern text.
    // NOTE(review): presumably the original pattern string that `as_str`
    // hands back to `Display`/`Debug` -- confirm where this field is set.
    pub(crate) pattern: Arc<str>,
}
103
104impl core::fmt::Display for Regex {
105    /// Shows the original regular expression.
106    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
107        write!(f, "{}", self.as_str())
108    }
109}
110
111impl core::fmt::Debug for Regex {
112    /// Shows the original regular expression.
113    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
114        f.debug_tuple("Regex").field(&self.as_str()).finish()
115    }
116}
117
118impl core::str::FromStr for Regex {
119    type Err = Error;
120
121    /// Attempts to parse a string into a regular expression
122    fn from_str(s: &str) -> Result<Regex, Error> {
123        Regex::new(s)
124    }
125}
126
127impl TryFrom<&str> for Regex {
128    type Error = Error;
129
130    /// Attempts to parse a string into a regular expression
131    fn try_from(s: &str) -> Result<Regex, Error> {
132        Regex::new(s)
133    }
134}
135
136impl TryFrom<String> for Regex {
137    type Error = Error;
138
139    /// Attempts to parse a string into a regular expression
140    fn try_from(s: String) -> Result<Regex, Error> {
141        Regex::new(&s)
142    }
143}
144
145/// Core regular expression methods.
146impl Regex {
147    /// Compiles a regular expression. Once compiled, it can be used repeatedly
148    /// to search, split or replace substrings in a haystack.
149    ///
150    /// Note that regex compilation tends to be a somewhat expensive process,
151    /// and unlike higher level environments, compilation is not automatically
152    /// cached for you. One should endeavor to compile a regex once and then
153    /// reuse it. For example, it's a bad idea to compile the same regex
154    /// repeatedly in a loop.
155    ///
156    /// # Errors
157    ///
158    /// If an invalid pattern is given, then an error is returned.
159    /// An error is also returned if the pattern is valid, but would
160    /// produce a regex that is bigger than the configured size limit via
161    /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by
162    /// default.)
163    ///
164    /// # Example
165    ///
166    /// ```
167    /// use regex::bytes::Regex;
168    ///
    /// // An invalid pattern because of an unclosed parenthesis
170    /// assert!(Regex::new(r"foo(bar").is_err());
171    /// // An invalid pattern because the regex would be too big
172    /// // because Unicode tends to inflate things.
173    /// assert!(Regex::new(r"\w{1000}").is_err());
174    /// // Disabling Unicode can make the regex much smaller,
175    /// // potentially by up to or more than an order of magnitude.
176    /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok());
177    /// ```
178    pub fn new(re: &str) -> Result<Regex, Error> {
179        RegexBuilder::new(re).build()
180    }
181
182    /// Returns true if and only if there is a match for the regex anywhere
183    /// in the haystack given.
184    ///
185    /// It is recommended to use this method if all you need to do is test
186    /// whether a match exists, since the underlying matching engine may be
187    /// able to do less work.
188    ///
189    /// # Example
190    ///
191    /// Test if some haystack contains at least one word with exactly 13
192    /// Unicode word characters:
193    ///
194    /// ```
195    /// use regex::bytes::Regex;
196    ///
197    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
198    /// let hay = b"I categorically deny having triskaidekaphobia.";
199    /// assert!(re.is_match(hay));
200    /// ```
201    #[inline]
202    pub fn is_match(&self, haystack: &[u8]) -> bool {
203        self.is_match_at(haystack, 0)
204    }
205
206    /// This routine searches for the first match of this regex in the
207    /// haystack given, and if found, returns a [`Match`]. The `Match`
208    /// provides access to both the byte offsets of the match and the actual
209    /// substring that matched.
210    ///
211    /// Note that this should only be used if you want to find the entire
212    /// match. If instead you just want to test the existence of a match,
213    /// it's potentially faster to use `Regex::is_match(hay)` instead of
214    /// `Regex::find(hay).is_some()`.
215    ///
216    /// # Example
217    ///
218    /// Find the first word with exactly 13 Unicode word characters:
219    ///
220    /// ```
221    /// use regex::bytes::Regex;
222    ///
223    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
224    /// let hay = b"I categorically deny having triskaidekaphobia.";
225    /// let mat = re.find(hay).unwrap();
226    /// assert_eq!(2..15, mat.range());
227    /// assert_eq!(b"categorically", mat.as_bytes());
228    /// ```
229    #[inline]
230    pub fn find<'h>(&self, haystack: &'h [u8]) -> Option<Match<'h>> {
231        self.find_at(haystack, 0)
232    }
233
234    /// Returns an iterator that yields successive non-overlapping matches in
235    /// the given haystack. The iterator yields values of type [`Match`].
236    ///
237    /// # Time complexity
238    ///
239    /// Note that since `find_iter` runs potentially many searches on the
240    /// haystack and since each search has worst case `O(m * n)` time
241    /// complexity, the overall worst case time complexity for iteration is
242    /// `O(m * n^2)`.
243    ///
244    /// # Example
245    ///
246    /// Find every word with exactly 13 Unicode word characters:
247    ///
248    /// ```
249    /// use regex::bytes::Regex;
250    ///
251    /// let re = Regex::new(r"\b\w{13}\b").unwrap();
252    /// let hay = b"Retroactively relinquishing remunerations is reprehensible.";
253    /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_bytes()).collect();
254    /// assert_eq!(matches, vec![
255    ///     &b"Retroactively"[..],
256    ///     &b"relinquishing"[..],
257    ///     &b"remunerations"[..],
258    ///     &b"reprehensible"[..],
259    /// ]);
260    /// ```
261    #[inline]
262    pub fn find_iter<'r, 'h>(&'r self, haystack: &'h [u8]) -> Matches<'r, 'h> {
263        Matches { haystack, it: self.meta.find_iter(haystack) }
264    }
265
266    /// This routine searches for the first match of this regex in the haystack
267    /// given, and if found, returns not only the overall match but also the
268    /// matches of each capture group in the regex. If no match is found, then
269    /// `None` is returned.
270    ///
271    /// Capture group `0` always corresponds to an implicit unnamed group that
272    /// includes the entire match. If a match is found, this group is always
273    /// present. Subsequent groups may be named and are numbered, starting
274    /// at 1, by the order in which the opening parenthesis appears in the
275    /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`,
276    /// `b` and `c` correspond to capture group indices `1`, `2` and `3`,
277    /// respectively.
278    ///
279    /// You should only use `captures` if you need access to the capture group
280    /// matches. Otherwise, [`Regex::find`] is generally faster for discovering
281    /// just the overall match.
282    ///
283    /// # Example
284    ///
285    /// Say you have some haystack with movie names and their release years,
286    /// like "'Citizen Kane' (1941)". It'd be nice if we could search for
287    /// strings looking like that, while also extracting the movie name and its
288    /// release year separately. The example below shows how to do that.
289    ///
290    /// ```
291    /// use regex::bytes::Regex;
292    ///
293    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
294    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
295    /// let caps = re.captures(hay).unwrap();
296    /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
297    /// assert_eq!(caps.get(1).unwrap().as_bytes(), b"Citizen Kane");
298    /// assert_eq!(caps.get(2).unwrap().as_bytes(), b"1941");
299    /// // You can also access the groups by index using the Index notation.
300    /// // Note that this will panic on an invalid index. In this case, these
301    /// // accesses are always correct because the overall regex will only
302    /// // match when these capture groups match.
303    /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
304    /// assert_eq!(&caps[1], b"Citizen Kane");
305    /// assert_eq!(&caps[2], b"1941");
306    /// ```
307    ///
308    /// Note that the full match is at capture group `0`. Each subsequent
309    /// capture group is indexed by the order of its opening `(`.
310    ///
311    /// We can make this example a bit clearer by using *named* capture groups:
312    ///
313    /// ```
314    /// use regex::bytes::Regex;
315    ///
316    /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap();
317    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
318    /// let caps = re.captures(hay).unwrap();
319    /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
320    /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
321    /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
322    /// // You can also access the groups by name using the Index notation.
323    /// // Note that this will panic on an invalid group name. In this case,
324    /// // these accesses are always correct because the overall regex will
325    /// // only match when these capture groups match.
326    /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
327    /// assert_eq!(&caps["title"], b"Citizen Kane");
328    /// assert_eq!(&caps["year"], b"1941");
329    /// ```
330    ///
331    /// Here we name the capture groups, which we can access with the `name`
332    /// method or the `Index` notation with a `&str`. Note that the named
333    /// capture groups are still accessible with `get` or the `Index` notation
334    /// with a `usize`.
335    ///
336    /// The `0`th capture group is always unnamed, so it must always be
337    /// accessed with `get(0)` or `[0]`.
338    ///
    /// Finally, one other way to get the matched substrings is with the
340    /// [`Captures::extract`] API:
341    ///
342    /// ```
343    /// use regex::bytes::Regex;
344    ///
345    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
346    /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
347    /// let (full, [title, year]) = re.captures(hay).unwrap().extract();
348    /// assert_eq!(full, b"'Citizen Kane' (1941)");
349    /// assert_eq!(title, b"Citizen Kane");
350    /// assert_eq!(year, b"1941");
351    /// ```
352    #[inline]
353    pub fn captures<'h>(&self, haystack: &'h [u8]) -> Option<Captures<'h>> {
354        self.captures_at(haystack, 0)
355    }
356
357    /// Returns an iterator that yields successive non-overlapping matches in
358    /// the given haystack. The iterator yields values of type [`Captures`].
359    ///
360    /// This is the same as [`Regex::find_iter`], but instead of only providing
    /// access to the overall match, each value yielded includes access to the
362    /// matches of all capture groups in the regex. Reporting this extra match
363    /// data is potentially costly, so callers should only use `captures_iter`
364    /// over `find_iter` when they actually need access to the capture group
365    /// matches.
366    ///
367    /// # Time complexity
368    ///
369    /// Note that since `captures_iter` runs potentially many searches on the
370    /// haystack and since each search has worst case `O(m * n)` time
371    /// complexity, the overall worst case time complexity for iteration is
372    /// `O(m * n^2)`.
373    ///
374    /// # Example
375    ///
376    /// We can use this to find all movie titles and their release years in
377    /// some haystack, where the movie is formatted like "'Title' (xxxx)":
378    ///
379    /// ```
380    /// use regex::bytes::Regex;
381    ///
382    /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap();
383    /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
384    /// let mut movies = vec![];
385    /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) {
386    ///     // OK because [0-9]{4} can only match valid UTF-8.
387    ///     let year = std::str::from_utf8(year).unwrap();
388    ///     movies.push((title, year.parse::<i64>()?));
389    /// }
390    /// assert_eq!(movies, vec![
391    ///     (&b"Citizen Kane"[..], 1941),
392    ///     (&b"The Wizard of Oz"[..], 1939),
393    ///     (&b"M"[..], 1931),
394    /// ]);
395    /// # Ok::<(), Box<dyn std::error::Error>>(())
396    /// ```
397    ///
398    /// Or with named groups:
399    ///
400    /// ```
401    /// use regex::bytes::Regex;
402    ///
403    /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap();
404    /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
405    /// let mut it = re.captures_iter(hay);
406    ///
407    /// let caps = it.next().unwrap();
408    /// assert_eq!(&caps["title"], b"Citizen Kane");
409    /// assert_eq!(&caps["year"], b"1941");
410    ///
411    /// let caps = it.next().unwrap();
412    /// assert_eq!(&caps["title"], b"The Wizard of Oz");
413    /// assert_eq!(&caps["year"], b"1939");
414    ///
415    /// let caps = it.next().unwrap();
416    /// assert_eq!(&caps["title"], b"M");
417    /// assert_eq!(&caps["year"], b"1931");
418    /// ```
419    #[inline]
420    pub fn captures_iter<'r, 'h>(
421        &'r self,
422        haystack: &'h [u8],
423    ) -> CaptureMatches<'r, 'h> {
424        CaptureMatches { haystack, it: self.meta.captures_iter(haystack) }
425    }
426
427    /// Returns an iterator of substrings of the haystack given, delimited by a
428    /// match of the regex. Namely, each element of the iterator corresponds to
429    /// a part of the haystack that *isn't* matched by the regular expression.
430    ///
431    /// # Time complexity
432    ///
433    /// Since iterators over all matches requires running potentially many
434    /// searches on the haystack, and since each search has worst case
435    /// `O(m * n)` time complexity, the overall worst case time complexity for
436    /// this routine is `O(m * n^2)`.
437    ///
438    /// # Example
439    ///
440    /// To split a string delimited by arbitrary amounts of spaces or tabs:
441    ///
442    /// ```
443    /// use regex::bytes::Regex;
444    ///
445    /// let re = Regex::new(r"[ \t]+").unwrap();
446    /// let hay = b"a b \t  c\td    e";
447    /// let fields: Vec<&[u8]> = re.split(hay).collect();
448    /// assert_eq!(fields, vec![
449    ///     &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
450    /// ]);
451    /// ```
452    ///
453    /// # Example: more cases
454    ///
455    /// Basic usage:
456    ///
457    /// ```
458    /// use regex::bytes::Regex;
459    ///
460    /// let re = Regex::new(r" ").unwrap();
461    /// let hay = b"Mary had a little lamb";
462    /// let got: Vec<&[u8]> = re.split(hay).collect();
463    /// assert_eq!(got, vec![
464    ///     &b"Mary"[..], &b"had"[..], &b"a"[..], &b"little"[..], &b"lamb"[..],
465    /// ]);
466    ///
467    /// let re = Regex::new(r"X").unwrap();
468    /// let hay = b"";
469    /// let got: Vec<&[u8]> = re.split(hay).collect();
470    /// assert_eq!(got, vec![&b""[..]]);
471    ///
472    /// let re = Regex::new(r"X").unwrap();
473    /// let hay = b"lionXXtigerXleopard";
474    /// let got: Vec<&[u8]> = re.split(hay).collect();
475    /// assert_eq!(got, vec![
476    ///     &b"lion"[..], &b""[..], &b"tiger"[..], &b"leopard"[..],
477    /// ]);
478    ///
479    /// let re = Regex::new(r"::").unwrap();
480    /// let hay = b"lion::tiger::leopard";
481    /// let got: Vec<&[u8]> = re.split(hay).collect();
482    /// assert_eq!(got, vec![&b"lion"[..], &b"tiger"[..], &b"leopard"[..]]);
483    /// ```
484    ///
485    /// If a haystack contains multiple contiguous matches, you will end up
486    /// with empty spans yielded by the iterator:
487    ///
488    /// ```
489    /// use regex::bytes::Regex;
490    ///
491    /// let re = Regex::new(r"X").unwrap();
492    /// let hay = b"XXXXaXXbXc";
493    /// let got: Vec<&[u8]> = re.split(hay).collect();
494    /// assert_eq!(got, vec![
495    ///     &b""[..], &b""[..], &b""[..], &b""[..],
496    ///     &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
497    /// ]);
498    ///
499    /// let re = Regex::new(r"/").unwrap();
500    /// let hay = b"(///)";
501    /// let got: Vec<&[u8]> = re.split(hay).collect();
502    /// assert_eq!(got, vec![&b"("[..], &b""[..], &b""[..], &b")"[..]]);
503    /// ```
504    ///
    /// Separators at the start or end of a haystack are neighbored by empty
    /// substrings.
507    ///
508    /// ```
509    /// use regex::bytes::Regex;
510    ///
511    /// let re = Regex::new(r"0").unwrap();
512    /// let hay = b"010";
513    /// let got: Vec<&[u8]> = re.split(hay).collect();
514    /// assert_eq!(got, vec![&b""[..], &b"1"[..], &b""[..]]);
515    /// ```
516    ///
517    /// When the regex can match the empty string, it splits at every byte
518    /// position in the haystack. This includes between all UTF-8 code units.
519    /// (The top-level [`Regex::split`](crate::Regex::split) will only split
520    /// at valid UTF-8 boundaries.)
521    ///
522    /// ```
523    /// use regex::bytes::Regex;
524    ///
525    /// let re = Regex::new(r"").unwrap();
526    /// let hay = "☃".as_bytes();
527    /// let got: Vec<&[u8]> = re.split(hay).collect();
528    /// assert_eq!(got, vec![
529    ///     &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..],
530    /// ]);
531    /// ```
532    ///
533    /// Contiguous separators (commonly shows up with whitespace), can lead to
534    /// possibly surprising behavior. For example, this code is correct:
535    ///
536    /// ```
537    /// use regex::bytes::Regex;
538    ///
539    /// let re = Regex::new(r" ").unwrap();
540    /// let hay = b"    a  b c";
541    /// let got: Vec<&[u8]> = re.split(hay).collect();
542    /// assert_eq!(got, vec![
543    ///     &b""[..], &b""[..], &b""[..], &b""[..],
544    ///     &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
545    /// ]);
546    /// ```
547    ///
548    /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want
549    /// to match contiguous space characters:
550    ///
551    /// ```
552    /// use regex::bytes::Regex;
553    ///
554    /// let re = Regex::new(r" +").unwrap();
555    /// let hay = b"    a  b c";
556    /// let got: Vec<&[u8]> = re.split(hay).collect();
557    /// // N.B. This does still include a leading empty span because ' +'
558    /// // matches at the beginning of the haystack.
559    /// assert_eq!(got, vec![&b""[..], &b"a"[..], &b"b"[..], &b"c"[..]]);
560    /// ```
561    #[inline]
562    pub fn split<'r, 'h>(&'r self, haystack: &'h [u8]) -> Split<'r, 'h> {
563        Split { haystack, it: self.meta.split(haystack) }
564    }
565
566    /// Returns an iterator of at most `limit` substrings of the haystack
567    /// given, delimited by a match of the regex. (A `limit` of `0` will return
568    /// no substrings.) Namely, each element of the iterator corresponds to a
569    /// part of the haystack that *isn't* matched by the regular expression.
570    /// The remainder of the haystack that is not split will be the last
571    /// element in the iterator.
572    ///
573    /// # Time complexity
574    ///
575    /// Since iterators over all matches requires running potentially many
576    /// searches on the haystack, and since each search has worst case
577    /// `O(m * n)` time complexity, the overall worst case time complexity for
578    /// this routine is `O(m * n^2)`.
579    ///
580    /// Although note that the worst case time here has an upper bound given
581    /// by the `limit` parameter.
582    ///
583    /// # Example
584    ///
585    /// Get the first two words in some haystack:
586    ///
587    /// ```
588    /// use regex::bytes::Regex;
589    ///
590    /// let re = Regex::new(r"\W+").unwrap();
591    /// let hay = b"Hey! How are you?";
592    /// let fields: Vec<&[u8]> = re.splitn(hay, 3).collect();
593    /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
594    /// ```
595    ///
    /// # Example: more cases
597    ///
598    /// ```
599    /// use regex::bytes::Regex;
600    ///
601    /// let re = Regex::new(r" ").unwrap();
602    /// let hay = b"Mary had a little lamb";
603    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
604    /// assert_eq!(got, vec![&b"Mary"[..], &b"had"[..], &b"a little lamb"[..]]);
605    ///
606    /// let re = Regex::new(r"X").unwrap();
607    /// let hay = b"";
608    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
609    /// assert_eq!(got, vec![&b""[..]]);
610    ///
611    /// let re = Regex::new(r"X").unwrap();
612    /// let hay = b"lionXXtigerXleopard";
613    /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
614    /// assert_eq!(got, vec![&b"lion"[..], &b""[..], &b"tigerXleopard"[..]]);
615    ///
616    /// let re = Regex::new(r"::").unwrap();
617    /// let hay = b"lion::tiger::leopard";
618    /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
619    /// assert_eq!(got, vec![&b"lion"[..], &b"tiger::leopard"[..]]);
620    ///
621    /// let re = Regex::new(r"X").unwrap();
622    /// let hay = b"abcXdef";
623    /// let got: Vec<&[u8]> = re.splitn(hay, 1).collect();
624    /// assert_eq!(got, vec![&b"abcXdef"[..]]);
625    ///
626    /// let re = Regex::new(r"X").unwrap();
627    /// let hay = b"abcdef";
628    /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
629    /// assert_eq!(got, vec![&b"abcdef"[..]]);
630    ///
631    /// let re = Regex::new(r"X").unwrap();
632    /// let hay = b"abcXdef";
633    /// let got: Vec<&[u8]> = re.splitn(hay, 0).collect();
634    /// assert!(got.is_empty());
635    /// ```
636    #[inline]
637    pub fn splitn<'r, 'h>(
638        &'r self,
639        haystack: &'h [u8],
640        limit: usize,
641    ) -> SplitN<'r, 'h> {
642        SplitN { haystack, it: self.meta.splitn(haystack, limit) }
643    }
644
645    /// Replaces the leftmost-first match in the given haystack with the
646    /// replacement provided. The replacement can be a regular string (where
647    /// `$N` and `$name` are expanded to match capture groups) or a function
648    /// that takes a [`Captures`] and returns the replaced string.
649    ///
650    /// If no match is found, then the haystack is returned unchanged. In that
651    /// case, this implementation will likely return a `Cow::Borrowed` value
652    /// such that no allocation is performed.
653    ///
654    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
655    /// to be equivalent to the `haystack` given.
656    ///
657    /// # Replacement string syntax
658    ///
659    /// All instances of `$ref` in the replacement string are replaced with
660    /// the substring corresponding to the capture group identified by `ref`.
661    ///
662    /// `ref` may be an integer corresponding to the index of the capture group
663    /// (counted by order of opening parenthesis where `0` is the entire match)
664    /// or it can be a name (consisting of letters, digits or underscores)
665    /// corresponding to a named capture group.
666    ///
667    /// If `ref` isn't a valid capture group (whether the name doesn't exist or
668    /// isn't a valid index), then it is replaced with the empty string.
669    ///
670    /// The longest possible name is used. For example, `$1a` looks up the
671    /// capture group named `1a` and not the capture group at index `1`. To
672    /// exert more precise control over the name, use braces, e.g., `${1}a`.
673    ///
674    /// To write a literal `$` use `$$`.
675    ///
676    /// # Example
677    ///
678    /// Note that this function is polymorphic with respect to the replacement.
679    /// In typical usage, this can just be a normal string:
680    ///
681    /// ```
682    /// use regex::bytes::Regex;
683    ///
684    /// let re = Regex::new(r"[^01]+").unwrap();
685    /// assert_eq!(re.replace(b"1078910", b""), &b"1010"[..]);
686    /// ```
687    ///
688    /// But anything satisfying the [`Replacer`] trait will work. For example,
    /// a closure of type `|&Captures| -> Vec<u8>` provides direct access to the
690    /// captures corresponding to a match. This allows one to access capturing
691    /// group matches easily:
692    ///
693    /// ```
694    /// use regex::bytes::{Captures, Regex};
695    ///
696    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
697    /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
698    ///     let mut buf = vec![];
699    ///     buf.extend_from_slice(&caps[2]);
700    ///     buf.push(b' ');
701    ///     buf.extend_from_slice(&caps[1]);
702    ///     buf
703    /// });
704    /// assert_eq!(result, &b"Bruce Springsteen"[..]);
705    /// ```
706    ///
707    /// But this is a bit cumbersome to use all the time. Instead, a simple
708    /// syntax is supported (as described above) that expands `$name` into the
709    /// corresponding capture group. Here's the last example, but using this
710    /// expansion technique with named capture groups:
711    ///
712    /// ```
713    /// use regex::bytes::Regex;
714    ///
715    /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
716    /// let result = re.replace(b"Springsteen, Bruce", b"$first $last");
717    /// assert_eq!(result, &b"Bruce Springsteen"[..]);
718    /// ```
719    ///
720    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
721    /// would produce the same result. To write a literal `$` use `$$`.
722    ///
723    /// Sometimes the replacement string requires use of curly braces to
724    /// delineate a capture group replacement when it is adjacent to some other
725    /// literal text. For example, if we wanted to join two words together with
726    /// an underscore:
727    ///
728    /// ```
729    /// use regex::bytes::Regex;
730    ///
731    /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap();
732    /// let result = re.replace(b"deep fried", b"${first}_$second");
733    /// assert_eq!(result, &b"deep_fried"[..]);
734    /// ```
735    ///
736    /// Without the curly braces, the capture group name `first_` would be
737    /// used, and since it doesn't exist, it would be replaced with the empty
738    /// string.
739    ///
740    /// Finally, sometimes you just want to replace a literal string with no
741    /// regard for capturing group expansion. This can be done by wrapping a
742    /// string with [`NoExpand`]:
743    ///
744    /// ```
745    /// use regex::bytes::{NoExpand, Regex};
746    ///
747    /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
748    /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
749    /// assert_eq!(result, &b"$2 $last"[..]);
750    /// ```
751    ///
752    /// Using `NoExpand` may also be faster, since the replacement string won't
753    /// need to be parsed for the `$` syntax.
754    #[inline]
755    pub fn replace<'h, R: Replacer>(
756        &self,
757        haystack: &'h [u8],
758        rep: R,
759    ) -> Cow<'h, [u8]> {
760        self.replacen(haystack, 1, rep)
761    }
762
763    /// Replaces all non-overlapping matches in the haystack with the
764    /// replacement provided. This is the same as calling `replacen` with
765    /// `limit` set to `0`.
766    ///
767    /// If no match is found, then the haystack is returned unchanged. In that
768    /// case, this implementation will likely return a `Cow::Borrowed` value
769    /// such that no allocation is performed.
770    ///
771    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
772    /// to be equivalent to the `haystack` given.
773    ///
774    /// The documentation for [`Regex::replace`] goes into more detail about
775    /// what kinds of replacement strings are supported.
776    ///
777    /// # Time complexity
778    ///
    /// Since iterating over all matches requires running potentially many
780    /// searches on the haystack, and since each search has worst case
781    /// `O(m * n)` time complexity, the overall worst case time complexity for
782    /// this routine is `O(m * n^2)`.
783    ///
784    /// # Fallibility
785    ///
786    /// If you need to write a replacement routine where any individual
787    /// replacement might "fail," doing so with this API isn't really feasible
788    /// because there's no way to stop the search process if a replacement
789    /// fails. Instead, if you need this functionality, you should consider
790    /// implementing your own replacement routine:
791    ///
792    /// ```
793    /// use regex::bytes::{Captures, Regex};
794    ///
795    /// fn replace_all<E>(
796    ///     re: &Regex,
797    ///     haystack: &[u8],
798    ///     replacement: impl Fn(&Captures) -> Result<Vec<u8>, E>,
799    /// ) -> Result<Vec<u8>, E> {
800    ///     let mut new = Vec::with_capacity(haystack.len());
801    ///     let mut last_match = 0;
802    ///     for caps in re.captures_iter(haystack) {
803    ///         let m = caps.get(0).unwrap();
804    ///         new.extend_from_slice(&haystack[last_match..m.start()]);
805    ///         new.extend_from_slice(&replacement(&caps)?);
806    ///         last_match = m.end();
807    ///     }
808    ///     new.extend_from_slice(&haystack[last_match..]);
809    ///     Ok(new)
810    /// }
811    ///
812    /// // Let's replace each word with the number of bytes in that word.
813    /// // But if we see a word that is "too long," we'll give up.
814    /// let re = Regex::new(r"\w+").unwrap();
815    /// let replacement = |caps: &Captures| -> Result<Vec<u8>, &'static str> {
816    ///     if caps[0].len() >= 5 {
817    ///         return Err("word too long");
818    ///     }
819    ///     Ok(caps[0].len().to_string().into_bytes())
820    /// };
821    /// assert_eq!(
822    ///     Ok(b"2 3 3 3?".to_vec()),
823    ///     replace_all(&re, b"hi how are you?", &replacement),
824    /// );
825    /// assert!(replace_all(&re, b"hi there", &replacement).is_err());
826    /// ```
827    ///
828    /// # Example
829    ///
830    /// This example shows how to flip the order of whitespace (excluding line
831    /// terminators) delimited fields, and normalizes the whitespace that
832    /// delimits the fields:
833    ///
834    /// ```
835    /// use regex::bytes::Regex;
836    ///
837    /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
838    /// let hay = b"
839    /// Greetings  1973
840    /// Wild\t1973
841    /// BornToRun\t\t\t\t1975
842    /// Darkness                    1978
843    /// TheRiver 1980
844    /// ";
845    /// let new = re.replace_all(hay, b"$2 $1");
846    /// assert_eq!(new, &b"
847    /// 1973 Greetings
848    /// 1973 Wild
849    /// 1975 BornToRun
850    /// 1978 Darkness
851    /// 1980 TheRiver
852    /// "[..]);
853    /// ```
854    #[inline]
855    pub fn replace_all<'h, R: Replacer>(
856        &self,
857        haystack: &'h [u8],
858        rep: R,
859    ) -> Cow<'h, [u8]> {
860        self.replacen(haystack, 0, rep)
861    }
862
863    /// Replaces at most `limit` non-overlapping matches in the haystack with
864    /// the replacement provided. If `limit` is `0`, then all non-overlapping
865    /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is
866    /// equivalent to `Regex::replacen(hay, 0, rep)`.
867    ///
868    /// If no match is found, then the haystack is returned unchanged. In that
869    /// case, this implementation will likely return a `Cow::Borrowed` value
870    /// such that no allocation is performed.
871    ///
872    /// When a `Cow::Borrowed` is returned, the value returned is guaranteed
873    /// to be equivalent to the `haystack` given.
874    ///
875    /// The documentation for [`Regex::replace`] goes into more detail about
876    /// what kinds of replacement strings are supported.
877    ///
878    /// # Time complexity
879    ///
    /// Since iterating over all matches requires running potentially many
881    /// searches on the haystack, and since each search has worst case
882    /// `O(m * n)` time complexity, the overall worst case time complexity for
883    /// this routine is `O(m * n^2)`.
884    ///
885    /// Although note that the worst case time here has an upper bound given
886    /// by the `limit` parameter.
887    ///
888    /// # Fallibility
889    ///
890    /// See the corresponding section in the docs for [`Regex::replace_all`]
891    /// for tips on how to deal with a replacement routine that can fail.
892    ///
893    /// # Example
894    ///
895    /// This example shows how to flip the order of whitespace (excluding line
896    /// terminators) delimited fields, and normalizes the whitespace that
897    /// delimits the fields. But we only do it for the first two matches.
898    ///
899    /// ```
900    /// use regex::bytes::Regex;
901    ///
902    /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
903    /// let hay = b"
904    /// Greetings  1973
905    /// Wild\t1973
906    /// BornToRun\t\t\t\t1975
907    /// Darkness                    1978
908    /// TheRiver 1980
909    /// ";
910    /// let new = re.replacen(hay, 2, b"$2 $1");
911    /// assert_eq!(new, &b"
912    /// 1973 Greetings
913    /// 1973 Wild
914    /// BornToRun\t\t\t\t1975
915    /// Darkness                    1978
916    /// TheRiver 1980
917    /// "[..]);
918    /// ```
919    #[inline]
920    pub fn replacen<'h, R: Replacer>(
921        &self,
922        haystack: &'h [u8],
923        limit: usize,
924        mut rep: R,
925    ) -> Cow<'h, [u8]> {
926        // If we know that the replacement doesn't have any capture expansions,
927        // then we can use the fast path. The fast path can make a tremendous
928        // difference:
929        //
930        //   1) We use `find_iter` instead of `captures_iter`. Not asking for
931        //      captures generally makes the regex engines faster.
932        //   2) We don't need to look up all of the capture groups and do
933        //      replacements inside the replacement string. We just push it
934        //      at each match and be done with it.
935        if let Some(rep) = rep.no_expansion() {
936            let mut it = self.find_iter(haystack).enumerate().peekable();
937            if it.peek().is_none() {
938                return Cow::Borrowed(haystack);
939            }
940            let mut new = Vec::with_capacity(haystack.len());
941            let mut last_match = 0;
942            for (i, m) in it {
943                new.extend_from_slice(&haystack[last_match..m.start()]);
944                new.extend_from_slice(&rep);
945                last_match = m.end();
946                if limit > 0 && i >= limit - 1 {
947                    break;
948                }
949            }
950            new.extend_from_slice(&haystack[last_match..]);
951            return Cow::Owned(new);
952        }
953
954        // The slower path, which we use if the replacement needs access to
955        // capture groups.
956        let mut it = self.captures_iter(haystack).enumerate().peekable();
957        if it.peek().is_none() {
958            return Cow::Borrowed(haystack);
959        }
960        let mut new = Vec::with_capacity(haystack.len());
961        let mut last_match = 0;
962        for (i, cap) in it {
963            // unwrap on 0 is OK because captures only reports matches
964            let m = cap.get(0).unwrap();
965            new.extend_from_slice(&haystack[last_match..m.start()]);
966            rep.replace_append(&cap, &mut new);
967            last_match = m.end();
968            if limit > 0 && i >= limit - 1 {
969                break;
970            }
971        }
972        new.extend_from_slice(&haystack[last_match..]);
973        Cow::Owned(new)
974    }
975}
976
977/// A group of advanced or "lower level" search methods. Some methods permit
978/// starting the search at a position greater than `0` in the haystack. Other
979/// methods permit reusing allocations, for example, when extracting the
980/// matches for capture groups.
981impl Regex {
982    /// Returns the end byte offset of the first match in the haystack given.
983    ///
984    /// This method may have the same performance characteristics as
    /// `is_match`. Behaviorally, it doesn't just report whether a match
986    /// occurs, but also the end offset for a match. In particular, the offset
987    /// returned *may be shorter* than the proper end of the leftmost-first
988    /// match that you would find via [`Regex::find`].
989    ///
990    /// Note that it is not guaranteed that this routine finds the shortest or
991    /// "earliest" possible match. Instead, the main idea of this API is that
992    /// it returns the offset at the point at which the internal regex engine
993    /// has determined that a match has occurred. This may vary depending on
994    /// which internal regex engine is used, and thus, the offset itself may
995    /// change based on internal heuristics.
996    ///
997    /// # Example
998    ///
999    /// Typically, `a+` would match the entire first sequence of `a` in some
1000    /// haystack, but `shortest_match` *may* give up as soon as it sees the
1001    /// first `a`.
1002    ///
1003    /// ```
1004    /// use regex::bytes::Regex;
1005    ///
1006    /// let re = Regex::new(r"a+").unwrap();
1007    /// let offset = re.shortest_match(b"aaaaa").unwrap();
1008    /// assert_eq!(offset, 1);
1009    /// ```
1010    #[inline]
1011    pub fn shortest_match(&self, haystack: &[u8]) -> Option<usize> {
1012        self.shortest_match_at(haystack, 0)
1013    }
1014
1015    /// Returns the same as `shortest_match`, but starts the search at the
1016    /// given offset.
1017    ///
1018    /// The significance of the starting point is that it takes the surrounding
1019    /// context into consideration. For example, the `\A` anchor can only match
1020    /// when `start == 0`.
1021    ///
1022    /// If a match is found, the offset returned is relative to the beginning
1023    /// of the haystack, not the beginning of the search.
1024    ///
1025    /// # Panics
1026    ///
1027    /// This panics when `start >= haystack.len() + 1`.
1028    ///
1029    /// # Example
1030    ///
1031    /// This example shows the significance of `start` by demonstrating how it
1032    /// can be used to permit look-around assertions in a regex to take the
1033    /// surrounding context into account.
1034    ///
1035    /// ```
1036    /// use regex::bytes::Regex;
1037    ///
1038    /// let re = Regex::new(r"\bchew\b").unwrap();
1039    /// let hay = b"eschew";
1040    /// // We get a match here, but it's probably not intended.
1041    /// assert_eq!(re.shortest_match(&hay[2..]), Some(4));
    /// // No match because the assertions take the context into account.
1043    /// assert_eq!(re.shortest_match_at(hay, 2), None);
1044    /// ```
1045    #[inline]
1046    pub fn shortest_match_at(
1047        &self,
1048        haystack: &[u8],
1049        start: usize,
1050    ) -> Option<usize> {
1051        let input =
1052            Input::new(haystack).earliest(true).span(start..haystack.len());
1053        self.meta.search_half(&input).map(|hm| hm.offset())
1054    }
1055
1056    /// Returns the same as [`Regex::is_match`], but starts the search at the
1057    /// given offset.
1058    ///
1059    /// The significance of the starting point is that it takes the surrounding
1060    /// context into consideration. For example, the `\A` anchor can only
1061    /// match when `start == 0`.
1062    ///
1063    /// # Panics
1064    ///
1065    /// This panics when `start >= haystack.len() + 1`.
1066    ///
1067    /// # Example
1068    ///
1069    /// This example shows the significance of `start` by demonstrating how it
1070    /// can be used to permit look-around assertions in a regex to take the
1071    /// surrounding context into account.
1072    ///
1073    /// ```
1074    /// use regex::bytes::Regex;
1075    ///
1076    /// let re = Regex::new(r"\bchew\b").unwrap();
1077    /// let hay = b"eschew";
1078    /// // We get a match here, but it's probably not intended.
1079    /// assert!(re.is_match(&hay[2..]));
    /// // No match because the assertions take the context into account.
1081    /// assert!(!re.is_match_at(hay, 2));
1082    /// ```
1083    #[inline]
1084    pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
1085        self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
1086    }
1087
1088    /// Returns the same as [`Regex::find`], but starts the search at the given
1089    /// offset.
1090    ///
1091    /// The significance of the starting point is that it takes the surrounding
1092    /// context into consideration. For example, the `\A` anchor can only
1093    /// match when `start == 0`.
1094    ///
1095    /// # Panics
1096    ///
1097    /// This panics when `start >= haystack.len() + 1`.
1098    ///
1099    /// # Example
1100    ///
1101    /// This example shows the significance of `start` by demonstrating how it
1102    /// can be used to permit look-around assertions in a regex to take the
1103    /// surrounding context into account.
1104    ///
1105    /// ```
1106    /// use regex::bytes::Regex;
1107    ///
1108    /// let re = Regex::new(r"\bchew\b").unwrap();
1109    /// let hay = b"eschew";
1110    /// // We get a match here, but it's probably not intended.
1111    /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4));
    /// // No match because the assertions take the context into account.
1113    /// assert_eq!(re.find_at(hay, 2), None);
1114    /// ```
1115    #[inline]
1116    pub fn find_at<'h>(
1117        &self,
1118        haystack: &'h [u8],
1119        start: usize,
1120    ) -> Option<Match<'h>> {
1121        let input = Input::new(haystack).span(start..haystack.len());
1122        self.meta.find(input).map(|m| Match::new(haystack, m.start(), m.end()))
1123    }
1124
1125    /// Returns the same as [`Regex::captures`], but starts the search at the
1126    /// given offset.
1127    ///
1128    /// The significance of the starting point is that it takes the surrounding
1129    /// context into consideration. For example, the `\A` anchor can only
1130    /// match when `start == 0`.
1131    ///
1132    /// # Panics
1133    ///
1134    /// This panics when `start >= haystack.len() + 1`.
1135    ///
1136    /// # Example
1137    ///
1138    /// This example shows the significance of `start` by demonstrating how it
1139    /// can be used to permit look-around assertions in a regex to take the
1140    /// surrounding context into account.
1141    ///
1142    /// ```
1143    /// use regex::bytes::Regex;
1144    ///
1145    /// let re = Regex::new(r"\bchew\b").unwrap();
1146    /// let hay = b"eschew";
1147    /// // We get a match here, but it's probably not intended.
1148    /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], b"chew");
    /// // No match because the assertions take the context into account.
1150    /// assert!(re.captures_at(hay, 2).is_none());
1151    /// ```
1152    #[inline]
1153    pub fn captures_at<'h>(
1154        &self,
1155        haystack: &'h [u8],
1156        start: usize,
1157    ) -> Option<Captures<'h>> {
1158        let input = Input::new(haystack).span(start..haystack.len());
1159        let mut caps = self.meta.create_captures();
1160        self.meta.captures(input, &mut caps);
1161        if caps.is_match() {
1162            let static_captures_len = self.static_captures_len();
1163            Some(Captures { haystack, caps, static_captures_len })
1164        } else {
1165            None
1166        }
1167    }
1168
1169    /// This is like [`Regex::captures`], but writes the byte offsets of each
1170    /// capture group match into the locations given.
1171    ///
1172    /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
1173    /// but does *not* store a reference to the haystack. This makes its API
1174    /// a bit lower level and less convenient. But in exchange, callers
1175    /// may allocate their own `CaptureLocations` and reuse it for multiple
1176    /// searches. This may be helpful if allocating a `Captures` shows up in a
1177    /// profile as too costly.
1178    ///
1179    /// To create a `CaptureLocations` value, use the
1180    /// [`Regex::capture_locations`] method.
1181    ///
1182    /// This also returns the overall match if one was found. When a match is
1183    /// found, its offsets are also always stored in `locs` at index `0`.
1184    ///
1185    /// # Example
1186    ///
1187    /// ```
1188    /// use regex::bytes::Regex;
1189    ///
1190    /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap();
1191    /// let mut locs = re.capture_locations();
1192    /// assert!(re.captures_read(&mut locs, b"id=foo123").is_some());
1193    /// assert_eq!(Some((0, 9)), locs.get(0));
1194    /// assert_eq!(Some((0, 2)), locs.get(1));
1195    /// assert_eq!(Some((3, 9)), locs.get(2));
1196    /// ```
1197    #[inline]
1198    pub fn captures_read<'h>(
1199        &self,
1200        locs: &mut CaptureLocations,
1201        haystack: &'h [u8],
1202    ) -> Option<Match<'h>> {
1203        self.captures_read_at(locs, haystack, 0)
1204    }
1205
1206    /// Returns the same as [`Regex::captures_read`], but starts the search at
1207    /// the given offset.
1208    ///
1209    /// The significance of the starting point is that it takes the surrounding
1210    /// context into consideration. For example, the `\A` anchor can only
1211    /// match when `start == 0`.
1212    ///
1213    /// # Panics
1214    ///
1215    /// This panics when `start >= haystack.len() + 1`.
1216    ///
1217    /// # Example
1218    ///
1219    /// This example shows the significance of `start` by demonstrating how it
1220    /// can be used to permit look-around assertions in a regex to take the
1221    /// surrounding context into account.
1222    ///
1223    /// ```
1224    /// use regex::bytes::Regex;
1225    ///
1226    /// let re = Regex::new(r"\bchew\b").unwrap();
1227    /// let hay = b"eschew";
1228    /// let mut locs = re.capture_locations();
1229    /// // We get a match here, but it's probably not intended.
1230    /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some());
    /// // No match because the assertions take the context into account.
1232    /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none());
1233    /// ```
1234    #[inline]
1235    pub fn captures_read_at<'h>(
1236        &self,
1237        locs: &mut CaptureLocations,
1238        haystack: &'h [u8],
1239        start: usize,
1240    ) -> Option<Match<'h>> {
1241        let input = Input::new(haystack).span(start..haystack.len());
1242        self.meta.search_captures(&input, &mut locs.0);
1243        locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end()))
1244    }
1245
1246    /// An undocumented alias for `captures_read_at`.
1247    ///
1248    /// The `regex-capi` crate previously used this routine, so to avoid
1249    /// breaking that crate, we continue to provide the name as an undocumented
1250    /// alias.
1251    #[doc(hidden)]
1252    #[inline]
1253    pub fn read_captures_at<'h>(
1254        &self,
1255        locs: &mut CaptureLocations,
1256        haystack: &'h [u8],
1257        start: usize,
1258    ) -> Option<Match<'h>> {
1259        self.captures_read_at(locs, haystack, start)
1260    }
1261}
1262
1263/// Auxiliary methods.
1264impl Regex {
1265    /// Returns the original string of this regex.
1266    ///
1267    /// # Example
1268    ///
1269    /// ```
1270    /// use regex::bytes::Regex;
1271    ///
1272    /// let re = Regex::new(r"foo\w+bar").unwrap();
1273    /// assert_eq!(re.as_str(), r"foo\w+bar");
1274    /// ```
1275    #[inline]
1276    pub fn as_str(&self) -> &str {
1277        &self.pattern
1278    }
1279
1280    /// Returns an iterator over the capture names in this regex.
1281    ///
1282    /// The iterator returned yields elements of type `Option<&str>`. That is,
1283    /// the iterator yields values for all capture groups, even ones that are
1284    /// unnamed. The order of the groups corresponds to the order of the group's
1285    /// corresponding opening parenthesis.
1286    ///
1287    /// The first element of the iterator always yields the group corresponding
1288    /// to the overall match, and this group is always unnamed. Therefore, the
1289    /// iterator always yields at least one group.
1290    ///
1291    /// # Example
1292    ///
1293    /// This shows basic usage with a mix of named and unnamed capture groups:
1294    ///
1295    /// ```
1296    /// use regex::bytes::Regex;
1297    ///
1298    /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
1299    /// let mut names = re.capture_names();
1300    /// assert_eq!(names.next(), Some(None));
1301    /// assert_eq!(names.next(), Some(Some("a")));
1302    /// assert_eq!(names.next(), Some(Some("b")));
1303    /// assert_eq!(names.next(), Some(None));
1304    /// // the '(?:.)' group is non-capturing and so doesn't appear here!
1305    /// assert_eq!(names.next(), Some(Some("c")));
1306    /// assert_eq!(names.next(), None);
1307    /// ```
1308    ///
1309    /// The iterator always yields at least one element, even for regexes with
1310    /// no capture groups and even for regexes that can never match:
1311    ///
1312    /// ```
1313    /// use regex::bytes::Regex;
1314    ///
1315    /// let re = Regex::new(r"").unwrap();
1316    /// let mut names = re.capture_names();
1317    /// assert_eq!(names.next(), Some(None));
1318    /// assert_eq!(names.next(), None);
1319    ///
1320    /// let re = Regex::new(r"[a&&b]").unwrap();
1321    /// let mut names = re.capture_names();
1322    /// assert_eq!(names.next(), Some(None));
1323    /// assert_eq!(names.next(), None);
1324    /// ```
1325    #[inline]
1326    pub fn capture_names(&self) -> CaptureNames<'_> {
1327        CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO))
1328    }
1329
    /// Returns the number of capture groups in this regex.
1331    ///
1332    /// This includes all named and unnamed groups, including the implicit
1333    /// unnamed group that is always present and corresponds to the entire
1334    /// match.
1335    ///
1336    /// Since the implicit unnamed group is always included in this length, the
1337    /// length returned is guaranteed to be greater than zero.
1338    ///
1339    /// # Example
1340    ///
1341    /// ```
1342    /// use regex::bytes::Regex;
1343    ///
1344    /// let re = Regex::new(r"foo").unwrap();
1345    /// assert_eq!(1, re.captures_len());
1346    ///
1347    /// let re = Regex::new(r"(foo)").unwrap();
1348    /// assert_eq!(2, re.captures_len());
1349    ///
1350    /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
1351    /// assert_eq!(5, re.captures_len());
1352    ///
1353    /// let re = Regex::new(r"[a&&b]").unwrap();
1354    /// assert_eq!(1, re.captures_len());
1355    /// ```
1356    #[inline]
1357    pub fn captures_len(&self) -> usize {
1358        self.meta.group_info().group_len(PatternID::ZERO)
1359    }
1360
1361    /// Returns the total number of capturing groups that appear in every
1362    /// possible match.
1363    ///
1364    /// If the number of capture groups can vary depending on the match, then
1365    /// this returns `None`. That is, a value is only returned when the number
1366    /// of matching groups is invariant or "static."
1367    ///
1368    /// Note that like [`Regex::captures_len`], this **does** include the
1369    /// implicit capturing group corresponding to the entire match. Therefore,
1370    /// when a non-None value is returned, it is guaranteed to be at least `1`.
1371    /// Stated differently, a return value of `Some(0)` is impossible.
1372    ///
1373    /// # Example
1374    ///
1375    /// This shows a few cases where a static number of capture groups is
1376    /// available and a few cases where it is not.
1377    ///
1378    /// ```
1379    /// use regex::bytes::Regex;
1380    ///
1381    /// let len = |pattern| {
1382    ///     Regex::new(pattern).map(|re| re.static_captures_len())
1383    /// };
1384    ///
1385    /// assert_eq!(Some(1), len("a")?);
1386    /// assert_eq!(Some(2), len("(a)")?);
1387    /// assert_eq!(Some(2), len("(a)|(b)")?);
1388    /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
1389    /// assert_eq!(None, len("(a)|b")?);
1390    /// assert_eq!(None, len("a|(b)")?);
1391    /// assert_eq!(None, len("(b)*")?);
1392    /// assert_eq!(Some(2), len("(b)+")?);
1393    ///
1394    /// # Ok::<(), Box<dyn std::error::Error>>(())
1395    /// ```
1396    #[inline]
1397    pub fn static_captures_len(&self) -> Option<usize> {
1398        self.meta.static_captures_len()
1399    }
1400
    /// Returns a freshly allocated set of capture locations that can
1402    /// be reused in multiple calls to [`Regex::captures_read`] or
1403    /// [`Regex::captures_read_at`].
1404    ///
1405    /// # Example
1406    ///
1407    /// ```
1408    /// use regex::bytes::Regex;
1409    ///
1410    /// let re = Regex::new(r"(.)(.)(\w+)").unwrap();
1411    /// let mut locs = re.capture_locations();
1412    /// assert!(re.captures_read(&mut locs, b"Padron").is_some());
1413    /// assert_eq!(locs.get(0), Some((0, 6)));
1414    /// assert_eq!(locs.get(1), Some((0, 1)));
1415    /// assert_eq!(locs.get(2), Some((1, 2)));
1416    /// assert_eq!(locs.get(3), Some((2, 6)));
1417    /// ```
1418    #[inline]
1419    pub fn capture_locations(&self) -> CaptureLocations {
1420        CaptureLocations(self.meta.create_captures())
1421    }
1422
1423    /// An alias for `capture_locations` to preserve backward compatibility.
1424    ///
1425    /// The `regex-capi` crate uses this method, so to avoid breaking that
1426    /// crate, we continue to export it as an undocumented API.
1427    #[doc(hidden)]
1428    #[inline]
1429    pub fn locations(&self) -> CaptureLocations {
1430        self.capture_locations()
1431    }
1432}
1433
1434/// Represents a single match of a regex in a haystack.
1435///
1436/// A `Match` contains both the start and end byte offsets of the match and the
1437/// actual substring corresponding to the range of those byte offsets. It is
1438/// guaranteed that `start <= end`. When `start == end`, the match is empty.
1439///
1440/// Unlike the top-level `Match` type, this `Match` type is produced by APIs
1441/// that search `&[u8]` haystacks. This means that the offsets in a `Match` can
1442/// point to anywhere in the haystack, including in a place that splits the
1443/// UTF-8 encoding of a Unicode scalar value.
1444///
/// The lifetime parameter `'h` refers to the lifetime of the haystack that
/// this match was produced from.
1447///
1448/// # Numbering
1449///
1450/// The byte offsets in a `Match` form a half-open interval. That is, the
1451/// start of the range is inclusive and the end of the range is exclusive.
1452/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
1453/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
1454/// `6` corresponds to `x`, which is one past the end of the match. This
1455/// corresponds to the same kind of slicing that Rust uses.
1456///
1457/// For more on why this was chosen over other schemes (aside from being
1458/// consistent with how Rust the language works), see [this discussion] and
1459/// [Dijkstra's note on a related topic][note].
1460///
1461/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
1462/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
1463///
1464/// # Example
1465///
1466/// This example shows the value of each of the methods on `Match` for a
1467/// particular search.
1468///
1469/// ```
1470/// use regex::bytes::Regex;
1471///
1472/// let re = Regex::new(r"\p{Greek}+").unwrap();
1473/// let hay = "Greek: αβγδ".as_bytes();
1474/// let m = re.find(hay).unwrap();
1475/// assert_eq!(7, m.start());
1476/// assert_eq!(15, m.end());
1477/// assert!(!m.is_empty());
1478/// assert_eq!(8, m.len());
1479/// assert_eq!(7..15, m.range());
1480/// assert_eq!("αβγδ".as_bytes(), m.as_bytes());
1481/// ```
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Match<'h> {
    // The haystack that was searched; `start..end` indexes into it.
    haystack: &'h [u8],
    // Inclusive byte offset at which the match begins.
    start: usize,
    // Exclusive byte offset at which the match ends.
    end: usize,
}

impl<'h> Match<'h> {
    /// Returns the byte offset in the haystack at which this match begins.
    /// The offset is inclusive: it is the position of the first byte that
    /// belongs to the match.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// Unlike the top-level `Match` type, the start offset may appear anywhere
    /// in the haystack. This includes between the code units of a UTF-8
    /// encoded Unicode scalar value.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the byte offset in the haystack at which this match ends. The
    /// offset is exclusive: it is the position of the byte immediately after
    /// the last byte of the match. This means that `&slice[start..end]` works
    /// as one would expect.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// Unlike the top-level `Match` type, the end offset may appear anywhere
    /// in the haystack. This includes between the code units of a UTF-8
    /// encoded Unicode scalar value.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns true if and only if this match has a length of zero.
    ///
    /// Note that an empty match can only occur when the regex itself can
    /// match the empty string. Here are some examples of regexes that can
    /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`,
    /// `(foo|\d+|quux)?`.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.end == self.start
    }

    /// Returns the length, in bytes, of this match.
    #[inline]
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Returns the half-open range `start..end` of byte offsets that this
    /// match covers in the haystack.
    #[inline]
    pub fn range(&self) -> core::ops::Range<usize> {
        core::ops::Range { start: self.start, end: self.end }
    }

    /// Returns the portion of the haystack that matched.
    #[inline]
    pub fn as_bytes(&self) -> &'h [u8] {
        &self.haystack[self.start..self.end]
    }

    /// Creates a new match from the given haystack and byte offsets.
    ///
    /// No validation is performed here; the offsets are stored as given.
    #[inline]
    fn new(haystack: &'h [u8], start: usize, end: usize) -> Match<'h> {
        Self { haystack, start, end }
    }
}
1555
1556impl<'h> core::fmt::Debug for Match<'h> {
1557    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1558        use regex_automata::util::escape::DebugHaystack;
1559
1560        let mut fmt = f.debug_struct("Match");
1561        fmt.field("start", &self.start)
1562            .field("end", &self.end)
1563            .field("bytes", &DebugHaystack(&self.as_bytes()));
1564
1565        fmt.finish()
1566    }
1567}
1568
/// Converts a match into the substring of the haystack that it matched.
///
/// The slice returned has the lifetime of the haystack, `'h`.
impl<'h> From<Match<'h>> for &'h [u8] {
    fn from(m: Match<'h>) -> &'h [u8] {
        m.as_bytes()
    }
}
1574
/// Converts a match into the range of byte offsets that it spans in the
/// haystack. The haystack itself is not retained.
impl<'h> From<Match<'h>> for core::ops::Range<usize> {
    fn from(m: Match<'h>) -> core::ops::Range<usize> {
        m.range()
    }
}
1580
1581/// Represents the capture groups for a single match.
1582///
1583/// Capture groups refer to parts of a regex enclosed in parentheses. They
1584/// can be optionally named. The purpose of capture groups is to be able to
1585/// reference different parts of a match based on the original pattern. In
1586/// essence, a `Captures` is a container of [`Match`] values for each group
1587/// that participated in a regex match. Each `Match` can be looked up by either
1588/// its capture group index or name (if it has one).
1589///
1590/// For example, say you want to match the individual letters in a 5-letter
1591/// word:
1592///
1593/// ```text
1594/// (?<first>\w)(\w)(?:\w)\w(?<last>\w)
1595/// ```
1596///
1597/// This regex has 4 capture groups:
1598///
1599/// * The group at index `0` corresponds to the overall match. It is always
1600/// present in every match and never has a name.
1601/// * The group at index `1` with name `first` corresponding to the first
1602/// letter.
1603/// * The group at index `2` with no name corresponding to the second letter.
1604/// * The group at index `3` with name `last` corresponding to the fifth and
1605/// last letter.
1606///
1607/// Notice that `(?:\w)` was not listed above as a capture group despite it
1608/// being enclosed in parentheses. That's because `(?:pattern)` is a special
1609/// syntax that permits grouping but *without* capturing. The reason for not
1610/// treating it as a capture is that tracking and reporting capture groups
1611/// requires additional state that may lead to slower searches. So using as few
1612/// capture groups as possible can help performance. (Although the difference
1613/// in performance of a couple of capture groups is likely immaterial.)
1614///
1615/// Values with this type are created by [`Regex::captures`] or
1616/// [`Regex::captures_iter`].
1617///
1618/// `'h` is the lifetime of the haystack that these captures were matched from.
1619///
1620/// # Example
1621///
1622/// ```
1623/// use regex::bytes::Regex;
1624///
1625/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap();
1626/// let caps = re.captures(b"toady").unwrap();
1627/// assert_eq!(b"toady", &caps[0]);
1628/// assert_eq!(b"t", &caps["first"]);
1629/// assert_eq!(b"o", &caps[2]);
1630/// assert_eq!(b"y", &caps["last"]);
1631/// ```
pub struct Captures<'h> {
    // The haystack that was searched. Group offsets index into this.
    haystack: &'h [u8],
    // The capture group spans, as recorded by regex-automata.
    caps: captures::Captures,
    // The value of `static_captures_len()` for the regex that produced this
    // match, if it has one. `Captures::extract` checks `N` against it.
    static_captures_len: Option<usize>,
}
1637
1638impl<'h> Captures<'h> {
1639    /// Returns the `Match` associated with the capture group at index `i`. If
1640    /// `i` does not correspond to a capture group, or if the capture group did
1641    /// not participate in the match, then `None` is returned.
1642    ///
1643    /// When `i == 0`, this is guaranteed to return a non-`None` value.
1644    ///
1645    /// # Examples
1646    ///
1647    /// Get the substring that matched with a default of an empty string if the
1648    /// group didn't participate in the match:
1649    ///
1650    /// ```
1651    /// use regex::bytes::Regex;
1652    ///
1653    /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
1654    /// let caps = re.captures(b"abc123").unwrap();
1655    ///
1656    /// let substr1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
1657    /// let substr2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
1658    /// assert_eq!(substr1, b"123");
1659    /// assert_eq!(substr2, b"");
1660    /// ```
1661    #[inline]
1662    pub fn get(&self, i: usize) -> Option<Match<'h>> {
1663        self.caps
1664            .get_group(i)
1665            .map(|sp| Match::new(self.haystack, sp.start, sp.end))
1666    }
1667
1668    /// Returns the `Match` associated with the capture group named `name`. If
1669    /// `name` isn't a valid capture group or it refers to a group that didn't
1670    /// match, then `None` is returned.
1671    ///
1672    /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime
1673    /// matches the lifetime of the haystack in this `Captures` value.
1674    /// Conversely, the substring returned by `caps["name"]` has a lifetime
1675    /// of the `Captures` value, which is likely shorter than the lifetime of
1676    /// the haystack. In some cases, it may be necessary to use this method to
1677    /// access the matching substring instead of the `caps["name"]` notation.
1678    ///
1679    /// # Examples
1680    ///
1681    /// Get the substring that matched with a default of an empty string if the
1682    /// group didn't participate in the match:
1683    ///
1684    /// ```
1685    /// use regex::bytes::Regex;
1686    ///
1687    /// let re = Regex::new(
1688    ///     r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))",
1689    /// ).unwrap();
1690    /// let caps = re.captures(b"abc123").unwrap();
1691    ///
1692    /// let numbers = caps.name("numbers").map_or(&b""[..], |m| m.as_bytes());
1693    /// let letters = caps.name("letters").map_or(&b""[..], |m| m.as_bytes());
1694    /// assert_eq!(numbers, b"123");
1695    /// assert_eq!(letters, b"");
1696    /// ```
1697    #[inline]
1698    pub fn name(&self, name: &str) -> Option<Match<'h>> {
1699        self.caps
1700            .get_group_by_name(name)
1701            .map(|sp| Match::new(self.haystack, sp.start, sp.end))
1702    }
1703
1704    /// This is a convenience routine for extracting the substrings
1705    /// corresponding to matching capture groups.
1706    ///
1707    /// This returns a tuple where the first element corresponds to the full
1708    /// substring of the haystack that matched the regex. The second element is
1709    /// an array of substrings, with each corresponding to the substring that
1710    /// matched for a particular capture group.
1711    ///
1712    /// # Panics
1713    ///
1714    /// This panics if the number of possible matching groups in this
1715    /// `Captures` value is not fixed to `N` in all circumstances.
1716    /// More precisely, this routine only works when `N` is equivalent to
1717    /// [`Regex::static_captures_len`].
1718    ///
1719    /// Stated more plainly, if the number of matching capture groups in a
1720    /// regex can vary from match to match, then this function always panics.
1721    ///
1722    /// For example, `(a)(b)|(c)` could produce two matching capture groups
1723    /// or one matching capture group for any given match. Therefore, one
1724    /// cannot use `extract` with such a pattern.
1725    ///
1726    /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
1727    /// the number of capture groups in every match is always equivalent,
1728    /// even if the capture _indices_ in each match are not.
1729    ///
1730    /// # Example
1731    ///
1732    /// ```
1733    /// use regex::bytes::Regex;
1734    ///
1735    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
1736    /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
1737    /// let Some((full, [year, month, day])) =
1738    ///     re.captures(hay).map(|caps| caps.extract()) else { return };
1739    /// assert_eq!(b"2010-03-14", full);
1740    /// assert_eq!(b"2010", year);
1741    /// assert_eq!(b"03", month);
1742    /// assert_eq!(b"14", day);
1743    /// ```
1744    ///
1745    /// # Example: iteration
1746    ///
1747    /// This example shows how to use this method when iterating over all
1748    /// `Captures` matches in a haystack.
1749    ///
1750    /// ```
1751    /// use regex::bytes::Regex;
1752    ///
1753    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
1754    /// let hay = b"1973-01-05, 1975-08-25 and 1980-10-18";
1755    ///
1756    /// let mut dates: Vec<(&[u8], &[u8], &[u8])> = vec![];
1757    /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) {
1758    ///     dates.push((y, m, d));
1759    /// }
1760    /// assert_eq!(dates, vec![
1761    ///     (&b"1973"[..], &b"01"[..], &b"05"[..]),
1762    ///     (&b"1975"[..], &b"08"[..], &b"25"[..]),
1763    ///     (&b"1980"[..], &b"10"[..], &b"18"[..]),
1764    /// ]);
1765    /// ```
1766    ///
1767    /// # Example: parsing different formats
1768    ///
1769    /// This API is particularly useful when you need to extract a particular
1770    /// value that might occur in a different format. Consider, for example,
1771    /// an identifier that might be in double quotes or single quotes:
1772    ///
1773    /// ```
1774    /// use regex::bytes::Regex;
1775    ///
1776    /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
1777    /// let hay = br#"The first is id:"foo" and the second is id:'bar'."#;
1778    /// let mut ids = vec![];
1779    /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
1780    ///     ids.push(id);
1781    /// }
1782    /// assert_eq!(ids, vec![b"foo", b"bar"]);
1783    /// ```
1784    pub fn extract<const N: usize>(&self) -> (&'h [u8], [&'h [u8]; N]) {
1785        let len = self
1786            .static_captures_len
1787            .expect("number of capture groups can vary in a match")
1788            .checked_sub(1)
1789            .expect("number of groups is always greater than zero");
1790        assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len);
1791        // The regex-automata variant of extract is a bit more permissive.
1792        // It doesn't require the number of matching capturing groups to be
1793        // static, and you can even request fewer groups than what's there. So
1794        // this is guaranteed to never panic because we've asserted above that
1795        // the user has requested precisely the number of groups that must be
1796        // present in any match for this regex.
1797        self.caps.extract_bytes(self.haystack)
1798    }
1799
1800    /// Expands all instances of `$ref` in `replacement` to the corresponding
1801    /// capture group, and writes them to the `dst` buffer given. A `ref` can
1802    /// be a capture group index or a name. If `ref` doesn't refer to a capture
1803    /// group that participated in the match, then it is replaced with the
1804    /// empty string.
1805    ///
1806    /// # Format
1807    ///
1808    /// The format of the replacement string supports two different kinds of
1809    /// capture references: unbraced and braced.
1810    ///
    /// For the unbraced format, the format supported is `$ref` where `ref`
    /// may consist of any characters in the class `[0-9A-Za-z_]`. `ref` is
    /// always the longest possible parse. So for example, `$1a` corresponds to
    /// the capture group named `1a` and not the capture group at index `1`.
    /// If `ref` matches `^[0-9]+$`, then it is treated as a capture group
    /// index itself and not a name.
1817    ///
1818    /// For the braced format, the format supported is `${ref}` where `ref` can
1819    /// be any sequence of bytes except for `}`. If no closing brace occurs,
1820    /// then it is not considered a capture reference. As with the unbraced
1821    /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture
1822    /// group index and not a name.
1823    ///
1824    /// The braced format is useful for exerting precise control over the name
1825    /// of the capture reference. For example, `${1}a` corresponds to the
    /// capture group reference `1` followed by the letter `a`, whereas `$1a`
1827    /// (as mentioned above) corresponds to the capture group reference `1a`.
1828    /// The braced format is also useful for expressing capture group names
1829    /// that use characters not supported by the unbraced format. For example,
1830    /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`.
1831    ///
1832    /// If a capture group reference is found and it does not refer to a valid
1833    /// capture group, then it will be replaced with the empty string.
1834    ///
1835    /// To write a literal `$`, use `$$`.
1836    ///
1837    /// # Example
1838    ///
1839    /// ```
1840    /// use regex::bytes::Regex;
1841    ///
1842    /// let re = Regex::new(
1843    ///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
1844    /// ).unwrap();
1845    /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
1846    /// let caps = re.captures(hay).unwrap();
1847    ///
1848    /// let mut dst = vec![];
1849    /// caps.expand(b"year=$year, month=$month, day=$day", &mut dst);
1850    /// assert_eq!(dst, b"year=2010, month=03, day=14");
1851    /// ```
    #[inline]
    pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
        // All parsing of `$ref` references and their substitution is
        // delegated to regex-automata; the result is written to `dst`.
        self.caps.interpolate_bytes_into(self.haystack, replacement, dst);
    }
1856
1857    /// Returns an iterator over all capture groups. This includes both
1858    /// matching and non-matching groups.
1859    ///
1860    /// The iterator always yields at least one matching group: the first group
1861    /// (at index `0`) with no name. Subsequent groups are returned in the order
1862    /// of their opening parenthesis in the regex.
1863    ///
1864    /// The elements yielded have type `Option<Match<'h>>`, where a non-`None`
1865    /// value is present if the capture group matches.
1866    ///
1867    /// # Example
1868    ///
1869    /// ```
1870    /// use regex::bytes::Regex;
1871    ///
1872    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
1873    /// let caps = re.captures(b"AZ").unwrap();
1874    ///
1875    /// let mut it = caps.iter();
1876    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"AZ"[..]));
1877    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"A"[..]));
1878    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), None);
1879    /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"Z"[..]));
1880    /// assert_eq!(it.next(), None);
1881    /// ```
    #[inline]
    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> {
        // The iterator borrows the group spans from `self` (lifetime `'c`),
        // while the matches it yields borrow from the haystack (`'h`).
        SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() }
    }
1886
1887    /// Returns the total number of capture groups. This includes both
1888    /// matching and non-matching groups.
1889    ///
1890    /// The length returned is always equivalent to the number of elements
1891    /// yielded by [`Captures::iter`]. Consequently, the length is always
1892    /// greater than zero since every `Captures` value always includes the
1893    /// match for the entire regex.
1894    ///
1895    /// # Example
1896    ///
1897    /// ```
1898    /// use regex::bytes::Regex;
1899    ///
1900    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
1901    /// let caps = re.captures(b"AZ").unwrap();
1902    /// assert_eq!(caps.len(), 4);
1903    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        // The count comes straight from the underlying regex-automata
        // captures.
        self.caps.group_len()
    }
1908}
1909
1910impl<'h> core::fmt::Debug for Captures<'h> {
1911    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1912        /// A little helper type to provide a nice map-like debug
1913        /// representation for our capturing group spans.
1914        ///
1915        /// regex-automata has something similar, but it includes the pattern
1916        /// ID in its debug output, which is confusing. It also doesn't include
1917        /// that strings that match because a regex-automata `Captures` doesn't
1918        /// borrow the haystack.
1919        struct CapturesDebugMap<'a> {
1920            caps: &'a Captures<'a>,
1921        }
1922
1923        impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
1924            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1925                let mut map = f.debug_map();
1926                let names =
1927                    self.caps.caps.group_info().pattern_names(PatternID::ZERO);
1928                for (group_index, maybe_name) in names.enumerate() {
1929                    let key = Key(group_index, maybe_name);
1930                    match self.caps.get(group_index) {
1931                        None => map.entry(&key, &None::<()>),
1932                        Some(mat) => map.entry(&key, &Value(mat)),
1933                    };
1934                }
1935                map.finish()
1936            }
1937        }
1938
1939        struct Key<'a>(usize, Option<&'a str>);
1940
1941        impl<'a> core::fmt::Debug for Key<'a> {
1942            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1943                write!(f, "{}", self.0)?;
1944                if let Some(name) = self.1 {
1945                    write!(f, "/{:?}", name)?;
1946                }
1947                Ok(())
1948            }
1949        }
1950
1951        struct Value<'a>(Match<'a>);
1952
1953        impl<'a> core::fmt::Debug for Value<'a> {
1954            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1955                use regex_automata::util::escape::DebugHaystack;
1956
1957                write!(
1958                    f,
1959                    "{}..{}/{:?}",
1960                    self.0.start(),
1961                    self.0.end(),
1962                    DebugHaystack(self.0.as_bytes())
1963                )
1964            }
1965        }
1966
1967        f.debug_tuple("Captures")
1968            .field(&CapturesDebugMap { caps: self })
1969            .finish()
1970    }
1971}
1972
1973/// Get a matching capture group's haystack substring by index.
1974///
1975/// The haystack substring returned can't outlive the `Captures` object if this
1976/// method is used, because of how `Index` is defined (normally `a[i]` is part
1977/// of `a` and can't outlive it). To work around this limitation, do that, use
1978/// [`Captures::get`] instead.
1979///
1980/// `'h` is the lifetime of the matched haystack, but the lifetime of the
1981/// `&str` returned by this implementation is the lifetime of the `Captures`
1982/// value itself.
1983///
1984/// # Panics
1985///
1986/// If there is no matching group at the given index.
1987impl<'h> core::ops::Index<usize> for Captures<'h> {
1988    type Output = [u8];
1989
1990    // The lifetime is written out to make it clear that the &str returned
1991    // does NOT have a lifetime equivalent to 'h.
1992    fn index<'a>(&'a self, i: usize) -> &'a [u8] {
1993        self.get(i)
1994            .map(|m| m.as_bytes())
1995            .unwrap_or_else(|| panic!("no group at index '{}'", i))
1996    }
1997}
1998
1999/// Get a matching capture group's haystack substring by name.
2000///
2001/// The haystack substring returned can't outlive the `Captures` object if this
2002/// method is used, because of how `Index` is defined (normally `a[i]` is part
2003/// of `a` and can't outlive it). To work around this limitation, do that, use
2004/// [`Captures::name`] instead.
2005///
2006/// `'h` is the lifetime of the matched haystack, but the lifetime of the
2007/// `&str` returned by this implementation is the lifetime of the `Captures`
2008/// value itself.
2009///
2010/// `'n` is the lifetime of the group name used to index the `Captures` value.
2011///
2012/// # Panics
2013///
2014/// If there is no matching group at the given name.
2015impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
2016    type Output = [u8];
2017
2018    fn index<'a>(&'a self, name: &'n str) -> &'a [u8] {
2019        self.name(name)
2020            .map(|m| m.as_bytes())
2021            .unwrap_or_else(|| panic!("no group named '{}'", name))
2022    }
2023}
2024
2025/// A low level representation of the byte offsets of each capture group.
2026///
2027/// You can think of this as a lower level [`Captures`], where this type does
2028/// not support named capturing groups directly and it does not borrow the
2029/// haystack that these offsets were matched on.
2030///
2031/// Primarily, this type is useful when using the lower level `Regex` APIs such
2032/// as [`Regex::captures_read`], which permits amortizing the allocation in
2033/// which capture match offsets are stored.
2034///
2035/// In order to build a value of this type, you'll need to call the
2036/// [`Regex::capture_locations`] method. The value returned can then be reused
2037/// in subsequent searches for that regex. Using it for other regexes may
2038/// result in a panic or otherwise incorrect results.
2039///
2040/// # Example
2041///
2042/// This example shows how to create and use `CaptureLocations` in a search.
2043///
2044/// ```
2045/// use regex::bytes::Regex;
2046///
2047/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2048/// let mut locs = re.capture_locations();
2049/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2050/// assert_eq!(0..17, m.range());
2051/// assert_eq!(Some((0, 17)), locs.get(0));
2052/// assert_eq!(Some((0, 5)), locs.get(1));
2053/// assert_eq!(Some((6, 17)), locs.get(2));
2054///
2055/// // Asking for an invalid capture group always returns None.
2056/// assert_eq!(None, locs.get(3));
2057/// # // literals are too big for 32-bit usize: #1041
2058/// # #[cfg(target_pointer_width = "64")]
2059/// assert_eq!(None, locs.get(34973498648));
2060/// # #[cfg(target_pointer_width = "64")]
2061/// assert_eq!(None, locs.get(9944060567225171988));
2062/// ```
// A newtype around regex-automata's `Captures`. Unlike `Captures` in this
// module, no haystack is stored alongside it.
#[derive(Clone, Debug)]
pub struct CaptureLocations(captures::Captures);
2065
/// A type alias for `CaptureLocations` for backwards compatibility.
///
/// Previously, we exported `CaptureLocations` as `Locations` in an
/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
/// we continue re-exporting the same undocumented API.
///
/// This alias is hidden from the generated documentation. New code should
/// refer to [`CaptureLocations`] directly.
#[doc(hidden)]
pub type Locations = CaptureLocations;
2073
2074impl CaptureLocations {
2075    /// Returns the start and end byte offsets of the capture group at index
2076    /// `i`. This returns `None` if `i` is not a valid capture group or if the
2077    /// capture group did not match.
2078    ///
2079    /// # Example
2080    ///
2081    /// ```
2082    /// use regex::bytes::Regex;
2083    ///
2084    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2085    /// let mut locs = re.capture_locations();
2086    /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2087    /// assert_eq!(Some((0, 17)), locs.get(0));
2088    /// assert_eq!(Some((0, 5)), locs.get(1));
2089    /// assert_eq!(Some((6, 17)), locs.get(2));
2090    /// ```
2091    #[inline]
2092    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
2093        self.0.get_group(i).map(|sp| (sp.start, sp.end))
2094    }
2095
2096    /// Returns the total number of capture groups (even if they didn't match).
2097    /// That is, the length returned is unaffected by the result of a search.
2098    ///
2099    /// This is always at least `1` since every regex has at least `1`
2100    /// capturing group that corresponds to the entire match.
2101    ///
2102    /// # Example
2103    ///
2104    /// ```
2105    /// use regex::bytes::Regex;
2106    ///
2107    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
2108    /// let mut locs = re.capture_locations();
2109    /// assert_eq!(3, locs.len());
2110    /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
2111    /// assert_eq!(3, locs.len());
2112    /// ```
2113    ///
2114    /// Notice that the length is always at least `1`, regardless of the regex:
2115    ///
2116    /// ```
2117    /// use regex::bytes::Regex;
2118    ///
2119    /// let re = Regex::new(r"").unwrap();
2120    /// let locs = re.capture_locations();
2121    /// assert_eq!(1, locs.len());
2122    ///
2123    /// // [a&&b] is a regex that never matches anything.
2124    /// let re = Regex::new(r"[a&&b]").unwrap();
2125    /// let locs = re.capture_locations();
2126    /// assert_eq!(1, locs.len());
2127    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        // This goes through the group info instead of self.0.group_len()
        // on purpose: self.0.group_len() returns 0 if the underlying captures
        // doesn't represent a match, but the behavior guaranteed for this
        // method is that the length doesn't change based on a match or not.
        self.0.group_info().group_len(PatternID::ZERO)
    }
2135
    /// An alias for the `get` method for backwards compatibility.
    ///
    /// Previously, we exported `get` as `pos` in an undocumented API. To
    /// prevent breaking that code (e.g., in `regex-capi`), we continue
    /// re-exporting the same undocumented API.
    ///
    /// This method is hidden from the generated documentation. New code
    /// should call [`CaptureLocations::get`] directly.
    #[doc(hidden)]
    #[inline]
    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
        self.get(i)
    }
2146}
2147
2148/// An iterator over all non-overlapping matches in a haystack.
2149///
2150/// This iterator yields [`Match`] values. The iterator stops when no more
2151/// matches can be found.
2152///
2153/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2154/// lifetime of the haystack.
2155///
2156/// This iterator is created by [`Regex::find_iter`].
2157///
2158/// # Time complexity
2159///
2160/// Note that since an iterator runs potentially many searches on the haystack
2161/// and since each search has worst case `O(m * n)` time complexity, the
2162/// overall worst case time complexity for iteration is `O(m * n^2)`.
#[derive(Debug)]
pub struct Matches<'r, 'h> {
    // The haystack being searched. Every `Match` yielded borrows from it.
    haystack: &'h [u8],
    // The regex-automata iterator that this type wraps.
    it: meta::FindMatches<'r, 'h>,
}
2168
2169impl<'r, 'h> Iterator for Matches<'r, 'h> {
2170    type Item = Match<'h>;
2171
2172    #[inline]
2173    fn next(&mut self) -> Option<Match<'h>> {
2174        self.it
2175            .next()
2176            .map(|sp| Match::new(self.haystack, sp.start(), sp.end()))
2177    }
2178
2179    #[inline]
2180    fn count(self) -> usize {
2181        // This can actually be up to 2x faster than calling `next()` until
2182        // completion, because counting matches when using a DFA only requires
2183        // finding the end of each match. But returning a `Match` via `next()`
2184        // requires the start of each match which, with a DFA, requires a
2185        // reverse forward scan to find it.
2186        self.it.count()
2187    }
2188}
2189
// Marker impl with no methods of its own: it declares that once `next`
// returns `None`, every later call returns `None` as well.
impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}
2191
2192/// An iterator over all non-overlapping capture matches in a haystack.
2193///
2194/// This iterator yields [`Captures`] values. The iterator stops when no more
2195/// matches can be found.
2196///
2197/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2198/// lifetime of the matched string.
2199///
2200/// This iterator is created by [`Regex::captures_iter`].
2201///
2202/// # Time complexity
2203///
2204/// Note that since an iterator runs potentially many searches on the haystack
2205/// and since each search has worst case `O(m * n)` time complexity, the
2206/// overall worst case time complexity for iteration is `O(m * n^2)`.
#[derive(Debug)]
pub struct CaptureMatches<'r, 'h> {
    // The haystack being searched. Every `Captures` yielded borrows from it.
    haystack: &'h [u8],
    // The regex-automata iterator that this type wraps.
    it: meta::CapturesMatches<'r, 'h>,
}
2212
2213impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
2214    type Item = Captures<'h>;
2215
2216    #[inline]
2217    fn next(&mut self) -> Option<Captures<'h>> {
2218        let static_captures_len = self.it.regex().static_captures_len();
2219        self.it.next().map(|caps| Captures {
2220            haystack: self.haystack,
2221            caps,
2222            static_captures_len,
2223        })
2224    }
2225
2226    #[inline]
2227    fn count(self) -> usize {
2228        // This can actually be up to 2x faster than calling `next()` until
2229        // completion, because counting matches when using a DFA only requires
2230        // finding the end of each match. But returning a `Match` via `next()`
2231        // requires the start of each match which, with a DFA, requires a
2232        // reverse forward scan to find it.
2233        self.it.count()
2234    }
2235}
2236
// Marker impl with no methods of its own: it declares that once `next`
// returns `None`, every later call returns `None` as well.
impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}
2238
2239/// An iterator over all substrings delimited by a regex match.
2240///
2241/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2242/// lifetime of the byte string being split.
2243///
2244/// This iterator is created by [`Regex::split`].
2245///
2246/// # Time complexity
2247///
2248/// Note that since an iterator runs potentially many searches on the haystack
2249/// and since each search has worst case `O(m * n)` time complexity, the
2250/// overall worst case time complexity for iteration is `O(m * n^2)`.
#[derive(Debug)]
pub struct Split<'r, 'h> {
    // The byte string being split. Every substring yielded borrows from it.
    haystack: &'h [u8],
    // The regex-automata iterator that this type wraps. It yields the span
    // of each substring, which is then used to slice `haystack`.
    it: meta::Split<'r, 'h>,
}
2256
2257impl<'r, 'h> Iterator for Split<'r, 'h> {
2258    type Item = &'h [u8];
2259
2260    #[inline]
2261    fn next(&mut self) -> Option<&'h [u8]> {
2262        self.it.next().map(|span| &self.haystack[span])
2263    }
2264}
2265
// Marker impl with no methods of its own: it declares that once `next`
// returns `None`, every later call returns `None` as well.
impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
2267
2268/// An iterator over at most `N` substrings delimited by a regex match.
2269///
2270/// The last substring yielded by this iterator will be whatever remains after
2271/// `N-1` splits.
2272///
2273/// `'r` is the lifetime of the compiled regular expression and `'h` is the
2274/// lifetime of the byte string being split.
2275///
2276/// This iterator is created by [`Regex::splitn`].
2277///
2278/// # Time complexity
2279///
2280/// Note that since an iterator runs potentially many searches on the haystack
2281/// and since each search has worst case `O(m * n)` time complexity, the
2282/// overall worst case time complexity for iteration is `O(m * n^2)`.
2283///
2284/// Although note that the worst case time here has an upper bound given
2285/// by the `limit` parameter to [`Regex::splitn`].
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    // The byte string being split. Every substring yielded borrows from it.
    haystack: &'h [u8],
    // The regex-automata iterator that this type wraps. It yields the span
    // of each substring, which is then used to slice `haystack`.
    it: meta::SplitN<'r, 'h>,
}
2291
2292impl<'r, 'h> Iterator for SplitN<'r, 'h> {
2293    type Item = &'h [u8];
2294
2295    #[inline]
2296    fn next(&mut self) -> Option<&'h [u8]> {
2297        self.it.next().map(|span| &self.haystack[span])
2298    }
2299
2300    #[inline]
2301    fn size_hint(&self) -> (usize, Option<usize>) {
2302        self.it.size_hint()
2303    }
2304}
2305
// Marker impl with no methods of its own: it declares that once `next`
// returns `None`, every later call returns `None` as well.
impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
2307
2308/// An iterator over the names of all capture groups in a regex.
2309///
2310/// This iterator yields values of type `Option<&str>` in order of the opening
2311/// capture group parenthesis in the regex pattern. `None` is yielded for
2312/// groups with no name. The first element always corresponds to the implicit
2313/// and unnamed group for the overall match.
2314///
2315/// `'r` is the lifetime of the compiled regular expression.
2316///
2317/// This iterator is created by [`Regex::capture_names`].
// A newtype around the group name iterator provided by regex-automata.
#[derive(Clone, Debug)]
pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>);
2320
impl<'r> Iterator for CaptureNames<'r> {
    // A group without a name is yielded as `None`, which is why `next`
    // returns a nested `Option`.
    type Item = Option<&'r str>;

    // Every method below forwards directly to the wrapped regex-automata
    // iterator.

    #[inline]
    fn next(&mut self) -> Option<Option<&'r str>> {
        self.0.next()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }

    #[inline]
    fn count(self) -> usize {
        self.0.count()
    }
}
2339
// Marker impl with no methods of its own: it declares that `size_hint`
// reports the exact number of names remaining.
impl<'r> ExactSizeIterator for CaptureNames<'r> {}

// Marker impl with no methods of its own: it declares that once `next`
// returns `None`, every later call returns `None` as well.
impl<'r> core::iter::FusedIterator for CaptureNames<'r> {}
2343
2344/// An iterator over all group matches in a [`Captures`] value.
2345///
2346/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the
2347/// lifetime of the haystack that the matches are for. The order of elements
2348/// yielded corresponds to the order of the opening parenthesis for the group
2349/// in the regex pattern. `None` is yielded for groups that did not participate
2350/// in the match.
2351///
2352/// The first element always corresponds to the implicit group for the overall
2353/// match. Since this iterator is created by a [`Captures`] value, and a
2354/// `Captures` value is only created when a match occurs, it follows that the
2355/// first element yielded by this iterator is guaranteed to be non-`None`.
2356///
2357/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that
2358/// created this iterator, and the lifetime `'h` corresponds to the originally
2359/// matched haystack.
2360#[derive(Clone, Debug)]
2361pub struct SubCaptureMatches<'c, 'h> {
2362    haystack: &'h [u8],
2363    it: captures::CapturesPatternIter<'c>,
2364}
2365
2366impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> {
2367    type Item = Option<Match<'h>>;
2368
2369    #[inline]
2370    fn next(&mut self) -> Option<Option<Match<'h>>> {
2371        self.it.next().map(|group| {
2372            group.map(|sp| Match::new(self.haystack, sp.start, sp.end))
2373        })
2374    }
2375
2376    #[inline]
2377    fn size_hint(&self) -> (usize, Option<usize>) {
2378        self.it.size_hint()
2379    }
2380
2381    #[inline]
2382    fn count(self) -> usize {
2383        self.it.count()
2384    }
2385}
2386
2387impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {}
2388
2389impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {}
2390
/// A trait for types that can be used to replace matches in a haystack.
///
/// In general, users of this crate shouldn't need to implement this trait,
/// since implementations are already provided for `&[u8]` along with other
/// variants of byte string types, as well as `FnMut(&Captures) -> Vec<u8>` (or
/// any `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`). Those cover most use
/// cases, but callers can implement this trait directly if necessary.
///
/// Only [`Replacer::replace_append`] must be provided by an implementation.
/// Both [`Replacer::no_expansion`] and [`Replacer::by_ref`] come with default
/// implementations.
///
/// # Example
///
/// This example shows a basic implementation of the `Replacer` trait. This can
/// be done much more simply using the replacement byte string interpolation
/// support (e.g., `$first $last`), but this approach avoids needing to parse
/// the replacement byte string at all.
///
/// ```
/// use regex::bytes::{Captures, Regex, Replacer};
///
/// struct NameSwapper;
///
/// impl Replacer for NameSwapper {
///     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
///         dst.extend_from_slice(&caps["first"]);
///         dst.extend_from_slice(b" ");
///         dst.extend_from_slice(&caps["last"]);
///     }
/// }
///
/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
/// let result = re.replace(b"Springsteen, Bruce", NameSwapper);
/// assert_eq!(result, &b"Bruce Springsteen"[..]);
/// ```
pub trait Replacer {
    /// Appends possibly empty data to `dst` to replace the current match.
    ///
    /// The current match is represented by `caps`, which is guaranteed to have
    /// a match at capture group `0`.
    ///
    /// For example, a no-op replacement would be
    /// `dst.extend_from_slice(&caps[0])`.
    ///
    /// This is the only method of the trait without a default implementation.
    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);

    /// Return a fixed unchanging replacement byte string.
    ///
    /// When doing replacements, if access to [`Captures`] is not needed (e.g.,
    /// the replacement byte string does not need `$` expansion), then it can
    /// be beneficial to avoid finding sub-captures.
    ///
    /// In general, this is called once for every call to a replacement routine
    /// such as [`Regex::replace_all`].
    ///
    /// The default implementation returns `None`, i.e., it does not offer a
    /// fixed replacement. When a replacement is returned, it may borrow from
    /// `self` for the lifetime `'r`.
    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
        None
    }

    /// Returns a type that implements `Replacer`, but that borrows and wraps
    /// this `Replacer`.
    ///
    /// This is useful when you want to take a generic `Replacer` (which might
    /// not be cloneable) and use it without consuming it, so it can be used
    /// more than once.
    ///
    /// The returned [`ReplacerRef`] holds a mutable borrow of `self` for the
    /// lifetime `'r`.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::bytes::{Regex, Replacer};
    ///
    /// fn replace_all_twice<R: Replacer>(
    ///     re: Regex,
    ///     src: &[u8],
    ///     mut rep: R,
    /// ) -> Vec<u8> {
    ///     let dst = re.replace_all(src, rep.by_ref());
    ///     let dst = re.replace_all(&dst, rep.by_ref());
    ///     dst.into_owned()
    /// }
    /// ```
    fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
        ReplacerRef(self)
    }
}
2471
2472impl<'a, const N: usize> Replacer for &'a [u8; N] {
2473    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2474        caps.expand(&**self, dst);
2475    }
2476
2477    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2478        no_expansion(self)
2479    }
2480}
2481
2482impl<const N: usize> Replacer for [u8; N] {
2483    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2484        caps.expand(&*self, dst);
2485    }
2486
2487    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2488        no_expansion(self)
2489    }
2490}
2491
2492impl<'a> Replacer for &'a [u8] {
2493    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2494        caps.expand(*self, dst);
2495    }
2496
2497    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2498        no_expansion(self)
2499    }
2500}
2501
2502impl<'a> Replacer for &'a Vec<u8> {
2503    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2504        caps.expand(*self, dst);
2505    }
2506
2507    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2508        no_expansion(self)
2509    }
2510}
2511
2512impl Replacer for Vec<u8> {
2513    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2514        caps.expand(self, dst);
2515    }
2516
2517    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2518        no_expansion(self)
2519    }
2520}
2521
2522impl<'a> Replacer for Cow<'a, [u8]> {
2523    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2524        caps.expand(self.as_ref(), dst);
2525    }
2526
2527    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2528        no_expansion(self)
2529    }
2530}
2531
2532impl<'a> Replacer for &'a Cow<'a, [u8]> {
2533    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2534        caps.expand(self.as_ref(), dst);
2535    }
2536
2537    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2538        no_expansion(self)
2539    }
2540}
2541
2542impl<F, T> Replacer for F
2543where
2544    F: FnMut(&Captures<'_>) -> T,
2545    T: AsRef<[u8]>,
2546{
2547    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2548        dst.extend_from_slice((*self)(caps).as_ref());
2549    }
2550}
2551
2552/// A by-reference adaptor for a [`Replacer`].
2553///
2554/// This permits reusing the same `Replacer` value in multiple calls to a
2555/// replacement routine like [`Regex::replace_all`].
2556///
2557/// This type is created by [`Replacer::by_ref`].
2558#[derive(Debug)]
2559pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
2560
2561impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
2562    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
2563        self.0.replace_append(caps, dst)
2564    }
2565
2566    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
2567        self.0.no_expansion()
2568    }
2569}
2570
2571/// A helper type for forcing literal string replacement.
2572///
2573/// It can be used with routines like [`Regex::replace`] and
2574/// [`Regex::replace_all`] to do a literal string replacement without expanding
2575/// `$name` to their corresponding capture groups. This can be both convenient
2576/// (to avoid escaping `$`, for example) and faster (since capture groups
2577/// don't need to be found).
2578///
2579/// `'s` is the lifetime of the literal string to use.
2580///
2581/// # Example
2582///
2583/// ```
2584/// use regex::bytes::{NoExpand, Regex};
2585///
2586/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
2587/// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
2588/// assert_eq!(result, &b"$2 $last"[..]);
2589/// ```
2590#[derive(Clone, Debug)]
2591pub struct NoExpand<'s>(pub &'s [u8]);
2592
2593impl<'s> Replacer for NoExpand<'s> {
2594    fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
2595        dst.extend_from_slice(self.0);
2596    }
2597
2598    fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
2599        Some(Cow::Borrowed(self.0))
2600    }
2601}
2602
2603/// Quickly checks the given replacement string for whether interpolation
2604/// should be done on it. It returns `None` if a `$` was found anywhere in the
2605/// given string, which suggests interpolation needs to be done. But if there's
2606/// no `$` anywhere, then interpolation definitely does not need to be done. In
2607/// that case, the given string is returned as a borrowed `Cow`.
2608///
2609/// This is meant to be used to implement the `Replacer::no_expandsion` method
2610/// in its various trait impls.
2611fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
2612    let replacement = replacement.as_ref();
2613    match crate::find_byte::find_byte(b'$', replacement) {
2614        Some(_) => None,
2615        None => Some(Cow::Borrowed(replacement)),
2616    }
2617}
2618
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;

    // The accessors of a non-empty match report its offsets, length and bytes.
    #[test]
    fn test_match_properties() {
        let text = b"Hello, world!";
        let found = Match::new(text, 7, 12);

        assert_eq!(found.start(), 7);
        assert_eq!(found.end(), 12);
        assert!(!found.is_empty());
        assert_eq!(found.len(), 5);
        assert_eq!(found.as_bytes(), b"world");
    }

    // A zero-width match on an empty haystack is empty and has length zero.
    #[test]
    fn test_empty_match() {
        let text = b"";
        let found = Match::new(text, 0, 0);

        assert!(found.is_empty());
        assert_eq!(found.len(), 0);
    }

    // Valid UTF-8 is shown as ordinary text in the `Debug` output.
    #[test]
    fn test_debug_output_valid_utf8() {
        let text = b"Hello, world!";
        let found = Match::new(text, 7, 12);

        assert_eq!(
            format!("{:?}", found),
            r#"Match { start: 7, end: 12, bytes: "world" }"#
        );
    }

    // An invalid UTF-8 byte is shown as a hex escape.
    #[test]
    fn test_debug_output_invalid_utf8() {
        let text = b"Hello, \xFFworld!";
        let found = Match::new(text, 7, 13);

        assert_eq!(
            format!("{:?}", found),
            r#"Match { start: 7, end: 13, bytes: "\xffworld" }"#
        );
    }

    // Multi-byte characters from several scripts are shown unescaped.
    #[test]
    fn test_debug_output_various_unicode() {
        let text =
            "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes();
        let found = Match::new(text, 0, text.len());

        assert_eq!(
            format!("{:?}", found),
            r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"#
        );
    }

    // ASCII control bytes are shown using escapes.
    #[test]
    fn test_debug_output_ascii_escape() {
        let text = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m.";
        let found = Match::new(text, 0, text.len());

        assert_eq!(
            format!("{:?}", found),
            r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." }"#
        );
    }

    // Only the matched range of the haystack is shown, not the whole of it.
    #[test]
    fn test_debug_output_match_in_middle() {
        let text = b"The quick brown fox jumps over the lazy dog.";
        let found = Match::new(text, 16, 19);

        assert_eq!(
            format!("{:?}", found),
            r#"Match { start: 16, end: 19, bytes: "fox" }"#
        );
    }
}