syntect/parsing/
syntax_definition.rs

//! Data structures for representing syntax definitions
//!
//! Everything here is public because I want this library to be useful in super integrated cases
//! like text editors and I have no idea what kind of monkeying you might want to do with the data.
//! Perhaps parsing your own syntax format into this data structure?
6
7use std::collections::{BTreeMap, HashMap};
8use std::hash::Hash;
9use super::{scope::*, ParsingError};
10use super::regex::{Regex, Region};
11use regex_syntax::escape;
12use serde::ser::{Serialize, Serializer};
13use serde_derive::{Deserialize, Serialize};
14use crate::parsing::syntax_set::SyntaxSet;
15
/// A list of `(index, scopes)` pairs, as stored in [`MatchPattern::captures`].
///
/// NOTE(review): the `usize` is presumably a regex capture group number and the `Vec<Scope>` the
/// scopes to apply to that group's text -- confirm against the parser that consumes it.
pub type CaptureMapping = Vec<(usize, Vec<Scope>)>;
17
/// An opaque ID for a [`Context`].
///
/// This is the payload of [`ContextReference::Direct`]; [`ContextReference::resolve`] hands it to
/// `SyntaxSet::get_context` to look up the actual [`Context`].
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct ContextId {
    /// Index into [`SyntaxSet::syntaxes`]
    pub(crate) syntax_index: usize,

    /// Index into [`crate::parsing::LazyContexts::contexts`] for the [`Self::syntax_index`] syntax
    pub(crate) context_index: usize,
}
27
/// The main data structure representing a syntax definition loaded from a
/// `.sublime-syntax` file
///
/// You'll probably only need these as references to be passed around to parsing code.
///
/// Some useful public fields are the `name` field which is a human readable name to display in
/// syntax lists, and the `hidden` field which means hide this syntax from any lists because it is
/// for internal use.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct SyntaxDefinition {
    /// Human readable name to display in syntax lists.
    pub name: String,
    /// NOTE(review): presumably the file extensions this syntax applies to; whether they carry a
    /// leading dot is not visible here -- confirm against the loader.
    pub file_extensions: Vec<String>,
    /// NOTE(review): presumably the top-level scope of the syntax -- confirm.
    pub scope: Scope,
    /// NOTE(review): presumably a regex source string tested against a file's first line to
    /// detect this syntax; `None` when the definition has none -- confirm.
    pub first_line_match: Option<String>,
    /// When `true`, hide this syntax from any lists because it is for internal use.
    pub hidden: bool,
    /// Serialized through [`ordered_map`], i.e. in sorted key order, so dumps are deterministic.
    ///
    /// NOTE(review): presumably the `variables` section of the syntax file (name to value).
    #[serde(serialize_with = "ordered_map")]
    pub variables: HashMap<String, String>,
    /// The contexts of this syntax keyed by name.
    ///
    /// Serialized through [`ordered_map`], i.e. in sorted key order, so dumps are deterministic.
    #[serde(serialize_with = "ordered_map")]
    pub contexts: HashMap<String, Context>,
}
48
/// A single context of a syntax definition: an ordered list of [`Pattern`]s plus metadata.
///
/// Use [`Context::new`] to create an empty one.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct Context {
    /// NOTE(review): presumably the scopes applied to all text matched while this context is on
    /// the stack, including the text that pushed and pops it -- confirm against the parser.
    pub meta_scope: Vec<Scope>,
    /// NOTE(review): presumably like `meta_scope` but excluding the pushing/popping matches --
    /// confirm against the parser.
    pub meta_content_scope: Vec<Scope>,
    /// This being set false in the syntax file implies this field being set false,
    /// but it can also be set false for contexts that don't include the prototype for other reasons
    pub meta_include_prototype: bool,
    /// NOTE(review): presumably how many scopes to clear from the scope stack while this context
    /// is active; `None` when nothing is cleared -- confirm.
    pub clear_scopes: Option<ClearAmount>,
    /// This is filled in by the linker at link time
    /// for contexts that have `meta_include_prototype==true`
    /// and are not included from the prototype.
    pub prototype: Option<ContextId>,
    /// NOTE(review): presumably `true` when some pattern in this context needs backreference
    /// substitution (see [`MatchPattern::regex_with_refs`]) -- confirm where it is set.
    pub uses_backrefs: bool,

    /// The patterns of this context, in order. [`MatchIter`] walks them by index.
    pub patterns: Vec<Pattern>,
}
65
66impl Context {
67    pub fn new(meta_include_prototype: bool) -> Context {
68        Context {
69            meta_scope: Vec::new(),
70            meta_content_scope: Vec::new(),
71            meta_include_prototype,
72            clear_scopes: None,
73            uses_backrefs: false,
74            patterns: Vec::new(),
75            prototype: None,
76        }
77    }
78}
79
/// One entry in [`Context::patterns`].
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum Pattern {
    /// A regex match pattern. These are the entries [`MatchIter`] yields and the only ones
    /// [`Context::match_at`] accepts.
    Match(MatchPattern),
    /// An include of another context. [`MatchIter`] descends into the referenced context when the
    /// reference is [`ContextReference::Direct`] and skips the entry otherwise.
    Include(ContextReference),
}
85
/// Used to iterate over all the match patterns in a context
///
/// Basically walks the tree of patterns and include directives in the correct order.
///
/// Created by [`context_iter`]. Each item is the context that owns the match pattern together
/// with the pattern's index in that context's `patterns`.
#[derive(Debug)]
pub struct MatchIter<'a> {
    /// Used to look up the contexts behind [`ContextReference::Direct`] includes.
    syntax_set: &'a SyntaxSet,
    /// The contexts currently being walked; the last element is the innermost include.
    ctx_stack: Vec<&'a Context>,
    /// Parallel to `ctx_stack`: `index_stack[i]` is the index of the next pattern to look at in
    /// `ctx_stack[i]`. The two stacks are always pushed and popped together.
    index_stack: Vec<usize>,
}
95
/// A single regex match rule of a [`Context`], created with [`MatchPattern::new`].
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct MatchPattern {
    /// NOTE(review): presumably `true` when `regex` contains backreferences that must be filled
    /// in through [`MatchPattern::regex_with_refs`] before use -- confirm where it is set.
    pub has_captures: bool,
    /// The regex of this pattern, also available through [`MatchPattern::regex`].
    pub regex: Regex,
    /// NOTE(review): presumably the scopes applied to the whole matched text -- confirm.
    pub scope: Vec<Scope>,
    /// Optional per-capture scopes; see [`CaptureMapping`].
    pub captures: Option<CaptureMapping>,
    /// What to do with the context stack when this pattern matches; see [`MatchOperation`].
    pub operation: MatchOperation,
    /// NOTE(review): presumably the `with_prototype` context of the syntax file, if any --
    /// confirm how the parser applies it.
    pub with_prototype: Option<ContextReference>,
}
105
/// A reference to a [`Context`], in one of several forms.
///
/// Only the [`ContextReference::Direct`] form can be resolved by [`ContextReference::resolve`]
/// and [`ContextReference::id`]; every other form makes them return
/// `ParsingError::UnresolvedContextReference`.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ContextReference {
    /// A reference by name.
    ///
    /// NOTE(review): presumably the name of a context in the same syntax -- confirm.
    #[non_exhaustive]
    Named(String),
    /// A reference to a syntax by its scope, optionally narrowed to one of its contexts.
    #[non_exhaustive]
    ByScope {
        scope: Scope,
        sub_context: Option<String>,
        /// `true` if this reference by scope is part of an `embed` for which
        /// there is an `escape`. In other words a reference for a context for
        /// which there "always is a way out". Enables falling back to `Plain
        /// Text` syntax in case the referenced scope is missing.
        with_escape: bool,
    },
    /// A reference to a syntax by name, optionally narrowed to one of its contexts.
    ///
    /// NOTE(review): whether `name` is a file name or a syntax name is not visible here --
    /// confirm against the linker.
    #[non_exhaustive]
    File {
        name: String,
        sub_context: Option<String>,
        /// Same semantics as for [`Self::ByScope::with_escape`].
        with_escape: bool,
    },
    /// NOTE(review): presumably the generated name of a context that was defined inline in the
    /// syntax file -- confirm.
    #[non_exhaustive]
    Inline(String),
    /// An already resolved reference carrying the [`ContextId`] of its target.
    #[non_exhaustive]
    Direct(ContextId),
}
133
134
/// The operation attached to a [`MatchPattern`] (its `operation` field).
///
/// NOTE(review): the variant semantics are not visible in this file. By name they appear to
/// describe what happens to the parser's context stack after a match: push the referenced
/// contexts, replace the current context with them, pop the current context, or leave the stack
/// alone -- confirm against the parser.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum MatchOperation {
    Push(Vec<ContextReference>),
    Set(Vec<ContextReference>),
    Pop,
    None,
}
142
143impl<'a> Iterator for MatchIter<'a> {
144    type Item = (&'a Context, usize);
145
146    fn next(&mut self) -> Option<(&'a Context, usize)> {
147        loop {
148            if self.ctx_stack.is_empty() {
149                return None;
150            }
151            // uncomment for debugging infinite recursion
152            // println!("{:?}", self.index_stack);
153            // use std::thread::sleep_ms;
154            // sleep_ms(500);
155            let last_index = self.ctx_stack.len() - 1;
156            let context = self.ctx_stack[last_index];
157            let index = self.index_stack[last_index];
158            self.index_stack[last_index] = index + 1;
159            if index < context.patterns.len() {
160                match context.patterns[index] {
161                    Pattern::Match(_) => {
162                        return Some((context, index));
163                    },
164                    Pattern::Include(ref ctx_ref) => {
165                        let ctx_ptr = match *ctx_ref {
166                            ContextReference::Direct(ref context_id) => {
167                                self.syntax_set.get_context(context_id).unwrap()
168                            }
169                            _ => return self.next(), // skip this and move onto the next one
170                        };
171                        self.ctx_stack.push(ctx_ptr);
172                        self.index_stack.push(0);
173                    }
174                }
175            } else {
176                self.ctx_stack.pop();
177                self.index_stack.pop();
178            }
179        }
180    }
181}
182
183/// Returns an iterator over all the match patterns in this context.
184///
185/// It recursively follows include directives. Can only be run on contexts that have already been
186/// linked up.
187pub fn context_iter<'a>(syntax_set: &'a SyntaxSet, context: &'a Context) -> MatchIter<'a> {
188    MatchIter {
189        syntax_set,
190        ctx_stack: vec![context],
191        index_stack: vec![0],
192    }
193}
194
195impl Context {
196    /// Returns the match pattern at an index
197    pub fn match_at(&self, index: usize) -> Result<&MatchPattern, ParsingError> {
198        match self.patterns[index] {
199            Pattern::Match(ref match_pat) => Ok(match_pat),
200            _ => Err(ParsingError::BadMatchIndex(index)),
201        }
202    }
203}
204
205impl ContextReference {
206    /// find the pointed to context
207    pub fn resolve<'a>(&self, syntax_set: &'a SyntaxSet) -> Result<&'a Context, ParsingError> {
208        match *self {
209            ContextReference::Direct(ref context_id) => syntax_set.get_context(context_id),
210            _ => Err(ParsingError::UnresolvedContextReference(self.clone())),
211        }
212    }
213
214    /// get the context ID this reference points to
215    pub fn id(&self) -> Result<ContextId, ParsingError> {
216        match *self {
217            ContextReference::Direct(ref context_id) => Ok(*context_id),
218             _ => Err(ParsingError::UnresolvedContextReference(self.clone())),
219        }
220    }
221}
222
/// Replaces single-digit backreferences (`\0` to `\9`) in `regex_str`.
///
/// Each backslash is read together with the character that follows it:
///
/// * backslash + ASCII digit `N`: replaced by `substituter(N)`, or by nothing when the
///   substituter returns `None`;
/// * backslash + any other character (including a second backslash): copied through verbatim;
/// * a lone backslash at the very end of the input is dropped.
///
/// Only one digit is consumed, so `\12` is backreference 1 followed by a literal `2`.
pub(crate) fn substitute_backrefs_in_regex<F>(regex_str: &str, substituter: F) -> String
where
    F: Fn(usize) -> Option<String>,
{
    let mut output = String::with_capacity(regex_str.len());
    let mut chars = regex_str.chars();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            output.push(ch);
            continue;
        }

        // `ch` opens an escape; what it means depends on the character after it.
        let Some(escaped) = chars.next() else {
            break;
        };
        match escaped.to_digit(10) {
            Some(group) => {
                if let Some(replacement) = substituter(group as usize) {
                    output.push_str(&replacement);
                }
            }
            None => {
                output.push('\\');
                output.push(escaped);
            }
        }
    }

    output
}
246
247impl MatchPattern {
248
249    pub fn new(
250        has_captures: bool,
251        regex_str: String,
252        scope: Vec<Scope>,
253        captures: Option<CaptureMapping>,
254        operation: MatchOperation,
255        with_prototype: Option<ContextReference>,
256    ) -> MatchPattern {
257        MatchPattern {
258            has_captures,
259            regex: Regex::new(regex_str),
260            scope,
261            captures,
262            operation,
263            with_prototype,
264        }
265    }
266
267    /// Used by the parser to compile a regex which needs to reference
268    /// regions from another matched pattern.
269    pub fn regex_with_refs(&self, region: &Region, text: &str) -> Regex {
270        let new_regex = substitute_backrefs_in_regex(self.regex.regex_str(), |i| {
271            region.pos(i).map(|(start, end)| escape(&text[start..end]))
272        });
273
274        Regex::new(new_regex)
275    }
276
277    pub fn regex(&self) -> &Regex {
278        &self.regex
279    }
280}
281
282
283/// Serialize the provided map in natural key order, so that it's deterministic when dumping.
284pub(crate) fn ordered_map<K, V, S>(map: &HashMap<K, V>, serializer: S) -> Result<S::Ok, S::Error>
285    where S: Serializer, K: Eq + Hash + Ord + Serialize, V: Serialize
286{
287    let ordered: BTreeMap<_, _> = map.iter().collect();
288    ordered.serialize(serializer)
289}
290
291
#[cfg(test)]
mod tests {
    use super::*;

    /// Backreferences are replaced by the escaped text of the matching capture group, ordinary
    /// escapes are kept as they are, and a reference to a missing group expands to nothing.
    #[test]
    fn can_compile_refs() {
        let pattern = MatchPattern::new(
            true,
            String::from(r"lol \\ \2 \1 '\9' \wz"),
            Vec::new(),
            None,
            MatchOperation::None,
            None,
        );

        // Capture some text full of regex metacharacters so that escaping is exercised too.
        let haystack = r"\[]()bcde";
        let capturing = Regex::new(String::from(r"(\\\[\]\(\))(b)(c)(d)(e)"));
        let mut region = Region::new();
        assert!(capturing.search(haystack, 0, haystack.len(), Some(&mut region)));

        let expanded = pattern.regex_with_refs(&region, haystack);
        assert_eq!(expanded.regex_str(), r"lol \\ b \\\[\]\(\) '' \wz");
    }
}
315}