syntect/parsing/
regex.rs

1use once_cell::sync::OnceCell;
2use serde::de::{Deserialize, Deserializer};
3use serde::ser::{Serialize, Serializer};
4use std::error::Error;
5
6/// An abstraction for regex patterns.
7///
8/// * Allows swapping out the regex implementation because it's only in this module.
9/// * Makes regexes serializable and deserializable using just the pattern string.
10/// * Lazily compiles regexes on first use to improve initialization time.
11#[derive(Debug)]
12pub struct Regex {
13    regex_str: String,
14    regex: OnceCell<regex_impl::Regex>,
15}
16
17/// A region contains text positions for capture groups in a match result.
18#[derive(Clone, Debug, Eq, PartialEq)]
19pub struct Region {
20    region: regex_impl::Region,
21}
22
23impl Regex {
24    /// Create a new regex from the pattern string.
25    ///
26    /// Note that the regex compilation happens on first use, which is why this method does not
27    /// return a result.
28    pub fn new(regex_str: String) -> Self {
29        Self {
30            regex_str,
31            regex: OnceCell::new(),
32        }
33    }
34
35    /// Check whether the pattern compiles as a valid regex or not.
36    pub fn try_compile(regex_str: &str) -> Option<Box<dyn Error + Send + Sync + 'static>> {
37        regex_impl::Regex::new(regex_str).err()
38    }
39
40    /// Return the regex pattern.
41    pub fn regex_str(&self) -> &str {
42        &self.regex_str
43    }
44
45    /// Check if the regex matches the given text.
46    pub fn is_match(&self, text: &str) -> bool {
47        self.regex().is_match(text)
48    }
49
50    /// Search for the pattern in the given text from begin/end positions.
51    ///
52    /// If a region is passed, it is used for storing match group positions. The argument allows
53    /// the [`Region`] to be reused between searches, which makes a significant performance
54    /// difference.
55    ///
56    /// [`Region`]: struct.Region.html
57    pub fn search(
58        &self,
59        text: &str,
60        begin: usize,
61        end: usize,
62        region: Option<&mut Region>,
63    ) -> bool {
64        self.regex()
65            .search(text, begin, end, region.map(|r| &mut r.region))
66    }
67
68    fn regex(&self) -> &regex_impl::Regex {
69        self.regex.get_or_init(|| {
70            regex_impl::Regex::new(&self.regex_str).expect("regex string should be pre-tested")
71        })
72    }
73}
74
75impl Clone for Regex {
76    fn clone(&self) -> Self {
77        Regex {
78            regex_str: self.regex_str.clone(),
79            regex: OnceCell::new(),
80        }
81    }
82}
83
84impl PartialEq for Regex {
85    fn eq(&self, other: &Regex) -> bool {
86        self.regex_str == other.regex_str
87    }
88}
89
90impl Eq for Regex {}
91
92impl Serialize for Regex {
93    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
94    where
95        S: Serializer,
96    {
97        serializer.serialize_str(&self.regex_str)
98    }
99}
100
101impl<'de> Deserialize<'de> for Regex {
102    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
103    where
104        D: Deserializer<'de>,
105    {
106        let regex_str = String::deserialize(deserializer)?;
107        Ok(Regex::new(regex_str))
108    }
109}
110
111impl Region {
112    pub fn new() -> Self {
113        Self {
114            region: regex_impl::new_region(),
115        }
116    }
117
118    /// Get the start/end positions of the capture group with given index.
119    ///
120    /// If there is no match for that group or the index does not correspond to a group, `None` is
121    /// returned. The index 0 returns the whole match.
122    pub fn pos(&self, index: usize) -> Option<(usize, usize)> {
123        self.region.pos(index)
124    }
125}
126
127impl Default for Region {
128    fn default() -> Self {
129        Self::new()
130    }
131}
132
#[cfg(feature = "regex-onig")]
mod regex_impl {
    pub use onig::Region;
    use onig::{MatchParam, RegexOptions, SearchOptions, Syntax};
    use std::error::Error;

    /// Regex implementation backed by the `onig` crate.
    #[derive(Debug)]
    pub struct Regex {
        regex: onig::Regex,
    }

    /// Create an empty region with capacity for 8 capture groups reserved up front.
    pub fn new_region() -> Region {
        Region::with_capacity(8)
    }

    impl Regex {
        /// Compile `regex_str` with the capture-group option and the default syntax.
        ///
        /// # Errors
        ///
        /// Returns the boxed `onig` error when the pattern does not compile.
        pub fn new(regex_str: &str) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> {
            let regex = onig::Regex::with_options(
                regex_str,
                RegexOptions::REGEX_OPTION_CAPTURE_GROUP,
                Syntax::default(),
            )?;
            Ok(Regex { regex })
        }

        /// Attempt a match starting at byte offset 0 of `text`.
        ///
        /// An error during matching is treated as a non-match, the same policy
        /// `search` uses.
        pub fn is_match(&self, text: &str) -> bool {
            // Use the `_with_param` variant: it hands errors back as a `Result`
            // instead of panicking, so a failure such as catastrophic backtracking
            // hitting onig's retry limit degrades to "no match".
            let matched = self.regex.match_with_param(
                text,
                0,
                SearchOptions::SEARCH_OPTION_NONE,
                None,
                MatchParam::default(),
            );

            matches!(matched, Ok(Some(_)))
        }

        /// Search `text` between `begin` and `end`, storing capture group positions in
        /// `region` when one is supplied.
        ///
        /// Returns `true` only when a match was found.
        pub fn search(
            &self,
            text: &str,
            begin: usize,
            end: usize,
            region: Option<&mut Region>,
        ) -> bool {
            let matched = self.regex.search_with_param(
                text,
                begin,
                end,
                SearchOptions::SEARCH_OPTION_NONE,
                region,
                MatchParam::default(),
            );

            // If there's an error during search, treat it as non-matching.
            // For example, in case of catastrophic backtracking, onig should
            // fail with a "retry-limit-in-match over" error eventually.
            matches!(matched, Ok(Some(_)))
        }
    }
}
190
// If both regex-fancy and regex-onig are requested, this condition makes regex-onig win.
#[cfg(all(feature = "regex-fancy", not(feature = "regex-onig")))]
mod regex_impl {
    use std::error::Error;

    /// Regex implementation backed by the `fancy_regex` crate.
    #[derive(Debug)]
    pub struct Regex {
        regex: fancy_regex::Regex,
    }

    /// Capture group positions recorded from the most recent successful search.
    #[derive(Clone, Debug, Eq, PartialEq)]
    pub struct Region {
        /// One entry per capture group (index 0 is the whole match); `None` for a
        /// group that did not take part in the match.
        positions: Vec<Option<(usize, usize)>>,
    }

    /// Create an empty region with capacity for 8 capture groups reserved up front.
    pub fn new_region() -> Region {
        Region {
            positions: Vec::with_capacity(8),
        }
    }

    impl Regex {
        /// Compile `regex_str`.
        ///
        /// # Errors
        ///
        /// Returns the boxed `fancy_regex` error when the pattern does not compile.
        pub fn new(regex_str: &str) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> {
            let regex = fancy_regex::Regex::new(regex_str)?;
            Ok(Regex { regex })
        }

        /// Report whether the regex matches `text`.
        pub fn is_match(&self, text: &str) -> bool {
            // Errors are treated as non-matches
            self.regex.is_match(text).unwrap_or(false)
        }

        /// Search `text[..end]` starting at `begin`, storing capture group positions in
        /// `region` when one is supplied.
        ///
        /// Returns `true` only when a match was found. Invalid bounds (`end` past the
        /// end of `text` or not on a char boundary, or `begin` greater than `end`) are
        /// reported as a non-match instead of panicking.
        pub fn search(
            &self,
            text: &str,
            begin: usize,
            end: usize,
            region: Option<&mut Region>,
        ) -> bool {
            // Checked slicing: `&text[..end]` would panic on an out-of-range or
            // non-boundary `end`, which contradicts the "errors are non-matches"
            // policy used for everything else in this function.
            let haystack = match text.get(..end) {
                Some(haystack) if begin <= end => haystack,
                _ => return false,
            };

            // If there's an error during search, treat it as non-matching.
            // For example, in case of catastrophic backtracking, fancy-regex should
            // fail with an error eventually.
            match self.regex.captures_from_pos(haystack, begin) {
                Ok(Some(captures)) => {
                    if let Some(region) = region {
                        region.init_from_captures(&captures);
                    }
                    true
                }
                Ok(None) | Err(_) => false,
            }
        }
    }

    impl Region {
        /// Replace the stored positions with those of `captures`, reusing the
        /// existing allocation.
        fn init_from_captures(&mut self, captures: &fancy_regex::Captures) {
            self.positions.clear();
            self.positions.extend(
                (0..captures.len()).map(|i| captures.get(i).map(|m| (m.start(), m.end()))),
            );
        }

        /// Start/end positions of capture group `i`, or `None` when the group did not
        /// take part in the match or `i` is out of range.
        pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
            self.positions.get(i).copied().flatten()
        }
    }
}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// The compiled regex must be absent before first use and cached afterwards.
    #[test]
    fn caches_compiled_regex() {
        let pattern = Regex::new(r"\w+".to_owned());
        let is_compiled = |re: &Regex| re.regex.get().is_some();

        assert!(!is_compiled(&pattern));
        assert!(pattern.is_match("test"));
        assert!(is_compiled(&pattern));
    }

    /// A regex round-trips through serde as a bare JSON string.
    #[test]
    fn serde_as_string() {
        let json = "\"just a string\"";

        let pattern: Regex = serde_json::from_str(json).unwrap();
        assert_eq!(pattern.regex_str(), "just a string");

        let round_tripped = serde_json::to_string(&pattern).unwrap();
        assert_eq!(round_tripped, json);
    }
}
286}