onig/
find.rs

1use super::{Regex, Region, SearchOptions};
2use std::iter::FusedIterator;
3
4impl Regex {
5    /// Returns the capture groups corresponding to the leftmost-first match
6    /// in text. Capture group `0` always corresponds to the entire match.
7    /// If no match is found, then `None` is returned.
8    pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
9        let mut region = Region::new();
10        self.search_with_options(
11            text,
12            0,
13            text.len(),
14            SearchOptions::SEARCH_OPTION_NONE,
15            Some(&mut region),
16        )
17        .map(|pos| Captures {
18            text,
19            region,
20            offset: pos,
21        })
22    }
23
24    /// Returns an iterator for each successive non-overlapping match in `text`,
25    /// returning the start and end byte indices with respect to `text`.
26    ///
27    /// # Example
28    ///
29    /// Find the start and end location of every word with exactly 13
30    /// characters:
31    ///
32    /// ```rust
33    /// # use onig::Regex;
34    /// # fn main() {
35    /// let text = "Retroactively relinquishing remunerations is reprehensible.";
36    /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
37    ///     println!("{:?}", pos);
38    /// }
39    /// // Output:
40    /// // (0, 13)
41    /// // (14, 27)
42    /// // (28, 41)
43    /// // (45, 58)
44    /// # }
45    /// ```
46    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
47        FindMatches {
48            regex: self,
49            region: Region::new(),
50            text,
51            last_end: 0,
52            last_match_end: None,
53        }
54    }
55
56    /// Returns an iterator over all the non-overlapping capture groups matched
57    /// in `text`. This is operationally the same as `find_iter` (except it
58    /// yields information about submatches).
59    ///
60    /// # Example
61    ///
62    /// We can use this to find all movie titles and their release years in
63    /// some text, where the movie is formatted like "'Title' (xxxx)":
64    ///
65    /// ```rust
66    /// # use onig::Regex;
67    /// # fn main() {
68    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")
69    ///                .unwrap();
70    /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
71    /// for caps in re.captures_iter(text) {
72    ///     println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2));
73    /// }
74    /// // Output:
75    /// // Movie: Citizen Kane, Released: 1941
76    /// // Movie: The Wizard of Oz, Released: 1939
77    /// // Movie: M, Released: 1931
78    /// # }
79    /// ```
80    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> {
81        FindCaptures {
82            regex: self,
83            text,
84            last_end: 0,
85            last_match_end: None,
86        }
87    }
88
89    /// Returns an iterator of substrings of `text` delimited by a match
90    /// of the regular expression.
91    /// Namely, each element of the iterator corresponds to text that *isn't*
92    /// matched by the regular expression.
93    ///
94    /// This method will *not* copy the text given.
95    ///
96    /// # Example
97    ///
98    /// To split a string delimited by arbitrary amounts of spaces or tabs:
99    ///
100    /// ```rust
101    /// # use onig::Regex;
102    /// # fn main() {
103    /// let re = Regex::new(r"[ \t]+").unwrap();
104    /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
105    /// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
106    /// # }
107    /// ```
108    pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
109        RegexSplits {
110            finder: self.find_iter(text),
111            last: 0,
112        }
113    }
114
115    /// Returns an iterator of at most `limit` substrings of `text` delimited
116    /// by a match of the regular expression. (A `limit` of `0` will return no
117    /// substrings.)
118    /// Namely, each element of the iterator corresponds to text that *isn't*
119    /// matched by the regular expression.
120    /// The remainder of the string that is not split will be the last element
121    /// in the iterator.
122    ///
123    /// This method will *not* copy the text given.
124    ///
125    /// # Example
126    ///
127    /// Get the first two words in some text:
128    ///
129    /// ```rust
130    /// # use onig::Regex;
131    /// # fn main() {
132    /// let re = Regex::new(r"\W+").unwrap();
133    /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
134    /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
135    /// # }
136    /// ```
137    pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> {
138        RegexSplitsN {
139            splits: self.split(text),
140            n: limit,
141        }
142    }
143
    /// Scan the given slice, capturing into the given region and
    /// executing a callback for each match.
    ///
    /// This is a thin wrapper over `onig_scan`. `callback` is invoked with
    /// the two integers and the region that the C library hands to the scan
    /// callback; `Regex::scan` in this file treats the first integer as a
    /// match counter and the second as the match offset. Returning `true`
    /// from `callback` is reported to the library as `0`, returning `false`
    /// as `-1`.
    ///
    /// The return value is whatever `onig_scan` returns, passed through
    /// unchanged.
    // NOTE(review): presumably a non-zero callback result aborts the scan and
    // the return value is a match count or a negative error code — confirm
    // against the Oniguruma `onig_scan` documentation.
    pub fn scan_with_region<F>(
        &self,
        to_search: &str,
        region: &mut Region,
        options: SearchOptions,
        mut callback: F,
    ) -> i32
    where
        F: Fn(i32, i32, &Region) -> bool,
    {
        use onig_sys::{onig_scan, OnigRegion};
        use std::os::raw::{c_int, c_void};

        // Find the bounds of the string we're searching
        let start = to_search.as_ptr();
        // Slicing at `len()` yields an empty slice whose pointer is one past
        // the last byte, i.e. the end bound of the haystack.
        let end = to_search[to_search.len()..].as_ptr();

        // Trampoline with a C ABI: recovers the Rust closure from the opaque
        // user-data pointer and forwards the arguments to it.
        unsafe extern "C" fn scan_cb<F>(
            i: c_int,
            j: c_int,
            r: *mut OnigRegion,
            ud: *mut c_void,
        ) -> c_int
        where
            F: Fn(i32, i32, &Region) -> bool,
        {
            // The callback sees a copy of the raw region, not the raw
            // pointer itself.
            let region = Region::clone_from_raw(r);
            // SAFETY: `ud` is the `&mut callback` pointer passed to
            // `onig_scan` below, cast back to the same `F`; the closure
            // outlives the `onig_scan` call that invokes this function.
            let callback = &*(ud as *mut F);
            // Translate the boolean verdict into the integer the C side
            // expects.
            if callback(i, j, &region) {
                0
            } else {
                -1
            }
        }

        // SAFETY: `start`/`end` delimit the bytes of `to_search`, which is
        // borrowed for the whole call; `region.raw` and `callback` are
        // likewise borrowed/owned by this frame for the whole call.
        unsafe {
            onig_scan(
                self.raw,
                start,
                end,
                (&mut region.raw) as *mut ::onig_sys::OnigRegion,
                options.bits(),
                Some(scan_cb::<F>),
                &mut callback as *mut F as *mut c_void,
            )
        }
    }
193
194    /// Scan a Pattern and Observe Captures
195    ///
196    /// The scan function takes a haystack `to_search` and invokes the
197    /// given `callback` for each capture of this expression.
198    pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB)
199    where
200        CB: Fn(i32, Captures<'t>) -> bool,
201    {
202        let mut region = Region::new();
203        self.scan_with_region(
204            to_search,
205            &mut region,
206            SearchOptions::SEARCH_OPTION_NONE,
207            |n, s, region| {
208                let captures = Captures {
209                    text: to_search,
210                    region: region.clone(),
211                    offset: s as usize,
212                };
213                callback(n, captures)
214            },
215        );
216    }
217}
218
219/// Captures represents a group of captured strings for a single match.
220///
221/// The 0th capture always corresponds to the entire match. Each subsequent
222/// index corresponds to the next capture group in the regex. Positions
223/// returned from a capture group are always byte indices.
224///
225/// `'t` is the lifetime of the matched text.
226#[derive(Debug)]
227pub struct Captures<'t> {
228    text: &'t str,
229    region: Region,
230    offset: usize,
231}
232
233impl<'t> Captures<'t> {
234    /// Returns the start and end positions of the Nth capture group. Returns
235    /// `None` if i is not a valid capture group or if the capture group did
236    /// not match anything. The positions returned are always byte indices with
237    /// respect to the original string matched.
238    pub fn pos(&self, pos: usize) -> Option<(usize, usize)> {
239        self.region.pos(pos)
240    }
241
242    /// Returns the matched string for the capture group `i`. If `i` isn't
243    /// a valid capture group or didn't match anything, then `None` is returned.
244    pub fn at(&self, pos: usize) -> Option<&'t str> {
245        self.pos(pos).map(|(beg, end)| &self.text[beg..end])
246    }
247
248    /// Returns the number of captured groups.
249    pub fn len(&self) -> usize {
250        self.region.len()
251    }
252
253    /// Returns true if and only if there are no captured groups.
254    pub fn is_empty(&self) -> bool {
255        self.len() == 0
256    }
257
258    /// Creates an iterator of all the capture groups in order of appearance in
259    /// the regular expression.
260    pub fn iter(&'t self) -> SubCaptures<'t> {
261        SubCaptures { idx: 0, caps: self }
262    }
263
264    /// Creates an iterator of all the capture group positions in order of
265    /// appearance in the regular expression. Positions are byte indices in
266    /// terms of the original string matched.
267    pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
268        SubCapturesPos { idx: 0, caps: self }
269    }
270
271    /// Offset of the captures within the given string slice.
272    pub fn offset(&self) -> usize {
273        self.offset
274    }
275}
276
277/// An iterator over capture groups for a particular match of a regular
278/// expression.
279///
280/// `'t` is the lifetime of the matched text.
281pub struct SubCaptures<'t> {
282    idx: usize,
283    caps: &'t Captures<'t>,
284}
285
286impl<'t> Iterator for SubCaptures<'t> {
287    type Item = Option<&'t str>;
288
289    fn next(&mut self) -> Option<Option<&'t str>> {
290        if self.idx < self.caps.len() {
291            self.idx += 1;
292            Some(self.caps.at(self.idx - 1))
293        } else {
294            None
295        }
296    }
297
298    fn size_hint(&self) -> (usize, Option<usize>) {
299        let size = self.caps.len();
300        (size, Some(size))
301    }
302
303    fn count(self) -> usize {
304        self.caps.len()
305    }
306}
307
308impl<'t> FusedIterator for SubCaptures<'t> {}
309
310impl<'t> ExactSizeIterator for SubCaptures<'t> {}
311
312/// An iterator over capture group positions for a particular match of
313/// a regular expression.
314///
315/// Positions are byte indices in terms of the original
316/// string matched. `'t` is the lifetime of the matched text.
317pub struct SubCapturesPos<'t> {
318    idx: usize,
319    caps: &'t Captures<'t>,
320}
321
322impl<'t> Iterator for SubCapturesPos<'t> {
323    type Item = Option<(usize, usize)>;
324
325    fn next(&mut self) -> Option<Option<(usize, usize)>> {
326        if self.idx < self.caps.len() {
327            self.idx += 1;
328            Some(self.caps.pos(self.idx - 1))
329        } else {
330            None
331        }
332    }
333
334    fn size_hint(&self) -> (usize, Option<usize>) {
335        let size = self.caps.len();
336        (size, Some(size))
337    }
338
339    fn count(self) -> usize {
340        self.caps.len()
341    }
342}
343
344impl<'t> FusedIterator for SubCapturesPos<'t> {}
345
346impl<'t> ExactSizeIterator for SubCapturesPos<'t> {}
347
348/// An iterator over all non-overlapping matches for a particular string.
349///
350/// The iterator yields a tuple of integers corresponding to the start and end
351/// of the match. The indices are byte offsets. The iterator stops when no more
352/// matches can be found.
353///
354/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
355/// of the matched string.
356pub struct FindMatches<'r, 't> {
357    regex: &'r Regex,
358    region: Region,
359    text: &'t str,
360    last_end: usize,
361    last_match_end: Option<usize>,
362}
363
364impl<'r, 't> Iterator for FindMatches<'r, 't> {
365    type Item = (usize, usize);
366
367    fn next(&mut self) -> Option<(usize, usize)> {
368        if self.last_end > self.text.len() {
369            return None;
370        }
371        self.region.clear();
372        self.regex.search_with_options(
373            self.text,
374            self.last_end,
375            self.text.len(),
376            SearchOptions::SEARCH_OPTION_NONE,
377            Some(&mut self.region),
378        )?;
379        let (s, e) = self.region.pos(0).unwrap();
380
381        // Don't accept empty matches immediately following the last match.
382        // i.e., no infinite loops please.
383        if e == s && self.last_match_end.map_or(false, |l| l == e) {
384            self.last_end += self.text[self.last_end..]
385                .chars()
386                .next()
387                .map(|c| c.len_utf8())
388                .unwrap_or(1);
389            return self.next();
390        } else {
391            self.last_end = e;
392            self.last_match_end = Some(e);
393        }
394
395        Some((s, e))
396    }
397}
398
399impl<'r, 't> FusedIterator for FindMatches<'r, 't> {}
400
401/// An iterator that yields all non-overlapping capture groups matching a
402/// particular regular expression.
403///
404/// The iterator stops when no more matches can be found.
405///
406/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
407/// of the matched string.
408pub struct FindCaptures<'r, 't> {
409    regex: &'r Regex,
410    text: &'t str,
411    last_end: usize,
412    last_match_end: Option<usize>,
413}
414
415impl<'r, 't> Iterator for FindCaptures<'r, 't> {
416    type Item = Captures<'t>;
417
418    fn next(&mut self) -> Option<Captures<'t>> {
419        if self.last_end > self.text.len() {
420            return None;
421        }
422
423        let mut region = Region::new();
424        let r = self.regex.search_with_options(
425            self.text,
426            self.last_end,
427            self.text.len(),
428            SearchOptions::SEARCH_OPTION_NONE,
429            Some(&mut region),
430        )?;
431        let (s, e) = region.pos(0).unwrap();
432
433        // Don't accept empty matches immediately following the last match.
434        // i.e., no infinite loops please.
435        if e == s && self.last_match_end.map_or(false, |l| l == e) {
436            self.last_end += self.text[self.last_end..]
437                .chars()
438                .next()
439                .map(|c| c.len_utf8())
440                .unwrap_or(1);
441            return self.next();
442        } else {
443            self.last_end = e;
444            self.last_match_end = Some(e);
445        }
446        Some(Captures {
447            text: self.text,
448            region,
449            offset: r,
450        })
451    }
452}
453
454impl<'r, 't> FusedIterator for FindCaptures<'r, 't> {}
455
456/// Yields all substrings delimited by a regular expression match.
457///
458/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
459/// of the string being split.
460pub struct RegexSplits<'r, 't> {
461    finder: FindMatches<'r, 't>,
462    last: usize,
463}
464
465impl<'r, 't> Iterator for RegexSplits<'r, 't> {
466    type Item = &'t str;
467
468    fn next(&mut self) -> Option<&'t str> {
469        let text = self.finder.text;
470        match self.finder.next() {
471            None => {
472                if self.last >= text.len() {
473                    None
474                } else {
475                    let s = &text[self.last..];
476                    self.last = text.len();
477                    Some(s)
478                }
479            }
480            Some((s, e)) => {
481                let matched = &text[self.last..s];
482                self.last = e;
483                Some(matched)
484            }
485        }
486    }
487}
488
489impl<'r, 't> FusedIterator for RegexSplits<'r, 't> {}
490
491/// Yields at most `N` substrings delimited by a regular expression match.
492///
493/// The last substring will be whatever remains after splitting.
494///
495/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
496/// of the string being split.
497pub struct RegexSplitsN<'r, 't> {
498    splits: RegexSplits<'r, 't>,
499    n: usize,
500}
501
502impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
503    type Item = &'t str;
504
505    fn next(&mut self) -> Option<&'t str> {
506        if self.n == 0 {
507            return None;
508        }
509        self.n -= 1;
510        if self.n == 0 {
511            let text = self.splits.finder.text;
512            Some(&text[self.splits.last..])
513        } else {
514            self.splits.next()
515        }
516    }
517
518    fn size_hint(&self) -> (usize, Option<usize>) {
519        (0, Some(self.n))
520    }
521}
522
523impl<'r, 't> FusedIterator for RegexSplitsN<'r, 't> {}
524
// Unit tests for the search, capture and iteration helpers defined above.
#[cfg(test)]
mod tests {
    use super::super::*;

    // A two-group alternation where only the first branch matches: group 0
    // and group 1 have positions and text, group 2 reports `None`.
    #[test]
    fn test_regex_captures() {
        let regex = Regex::new("e(l+)|(r+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        assert_eq!(captures.len(), 3);
        assert_eq!(captures.is_empty(), false);
        let pos1 = captures.pos(0).unwrap();
        let pos2 = captures.pos(1).unwrap();
        let pos3 = captures.pos(2);
        assert_eq!(pos1, (1, 4));
        assert_eq!(pos2, (2, 4));
        assert_eq!(pos3, None);
        let str1 = captures.at(0).unwrap();
        let str2 = captures.at(1).unwrap();
        let str3 = captures.at(2);
        assert_eq!(str1, "ell");
        assert_eq!(str2, "ll");
        assert_eq!(str3, None);
    }

    // `Captures::iter` yields the text of every group, whole match first.
    #[test]
    fn test_regex_subcaptures() {
        let regex = Regex::new("e(l+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        let caps = captures.iter().collect::<Vec<_>>();
        assert_eq!(caps[0], Some("ell"));
        assert_eq!(caps[1], Some("ll"));
        assert_eq!(caps.len(), 2);
    }

    // `Captures::iter_pos` yields the byte range of every group, whole match
    // first.
    #[test]
    fn test_regex_subcapturespos() {
        let regex = Regex::new("e(l+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        let caps = captures.iter_pos().collect::<Vec<_>>();
        assert_eq!(caps[0], Some((1, 4)));
        assert_eq!(caps[1], Some((2, 4)));
        assert_eq!(caps.len(), 2);
    }

    // Plain non-empty matches are reported in order with their byte ranges.
    #[test]
    fn test_find_iter() {
        let re = Regex::new(r"\d+").unwrap();
        let ms = re.find_iter("a12b2").collect::<Vec<_>>();
        assert_eq!(ms, vec![(0 + 1, 3), (4, 5)]);
    }

    // A pattern that can match the empty string: the empty match at the
    // start is reported, but no empty match is reported directly after a
    // non-empty one.
    #[test]
    fn test_find_iter_one_zero_length() {
        let re = Regex::new(r"\d*").unwrap();
        let ms = re.find_iter("a1b2").collect::<Vec<_>>();
        assert_eq!(ms, vec![(0, 0), (1, 2), (3, 4)]);
    }

    // Several consecutive positions with only empty matches available are
    // each reported once.
    #[test]
    fn test_find_iter_many_zero_length() {
        let re = Regex::new(r"\d*").unwrap();
        let ms = re.find_iter("a1bbb2").collect::<Vec<_>>();
        assert_eq!(ms, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]);
    }

    // An empty (lookahead) match further along the text is still found after
    // a preceding non-empty match.
    #[test]
    fn test_find_iter_empty_after_match() {
        let re = Regex::new(r"b|(?=,)").unwrap();
        let ms = re.find_iter("ba,").collect::<Vec<_>>();
        assert_eq!(ms, vec![(0, 1), (2, 2)]);
    }

    // Word-boundary matches are all empty; each boundary is reported exactly
    // once, including the one at the very end of the text.
    #[test]
    fn test_zero_length_matches_jumps_past_match_location() {
        let re = Regex::new(r"\b").unwrap();
        let matches = re.find_iter("test string").collect::<Vec<_>>();
        assert_eq!(matches, [(0, 0), (4, 4), (5, 5), (11, 11)]);
    }

    // `captures_iter` visits the same matches as `find_iter`.
    #[test]
    fn test_captures_iter() {
        let re = Regex::new(r"\d+").unwrap();
        let ms = re.captures_iter("a12b2").collect::<Vec<_>>();
        assert_eq!(ms[0].pos(0).unwrap(), (1, 3));
        assert_eq!(ms[1].pos(0).unwrap(), (4, 5));
    }

    // `Captures::offset` holds the byte offset at which each match starts,
    // both for a single `captures` call and for `captures_iter`.
    #[test]
    fn test_captures_stores_match_offset() {
        let reg = Regex::new(r"\d+\.(\d+)").unwrap();
        let captures = reg.captures("100 - 3.1415 / 2.0").unwrap();
        assert_eq!(6, captures.offset());
        let all_caps = reg
            .captures_iter("1 - 3234.3 * 123.2 - 100")
            .map(|cap| cap.offset())
            .collect::<Vec<_>>();
        assert_eq!(vec![4, 13], all_caps);
    }
}