syntect/parsing/
yaml_load.rs

1use super::regex::{Regex, Region};
2use super::scope::*;
3use super::syntax_definition::*;
4use yaml_rust::{YamlLoader, Yaml, ScanError};
5use yaml_rust::yaml::Hash;
6use std::collections::HashMap;
7use std::error::Error;
8use std::path::Path;
9use std::ops::DerefMut;
10
/// Errors that can be reported while loading a syntax definition from YAML.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ParseSyntaxError {
    /// Invalid YAML file syntax, or at least something yaml_rust can't handle
    #[error("Invalid YAML file syntax: {0}")]
    InvalidYaml(#[from] ScanError),
    /// The file must contain at least one YAML document
    #[error("The file must contain at least one YAML document")]
    EmptyFile,
    /// Some keys are required for something to be a valid `.sublime-syntax`.
    /// The payload is the name of the missing key.
    #[error("Missing mandatory key in YAML file: {0}")]
    MissingMandatoryKey(&'static str),
    /// Invalid regex. Carries the regex text that failed to compile and the underlying error.
    #[error("Error while compiling regex '{0}': {1}")]
    RegexCompileError(String, #[source] Box<dyn Error + Send + Sync + 'static>),
    /// A scope that syntect's scope implementation can't handle
    #[error("Invalid scope: {0}")]
    InvalidScope(ParseScopeError),
    /// A reference to another file that is invalid
    #[error("Invalid file reference")]
    BadFileRef,
    /// Syntaxes must have a context named "main"
    #[error("Context 'main' is missing")]
    MainMissing,
    /// Some part of the YAML file is the wrong type (e.g. a string but should be a list).
    /// Sorry, this doesn't give you any way to narrow down where this is.
    /// Maybe use Sublime Text to figure it out.
    #[error("Type mismatch")]
    TypeMismatch,
}
41
42fn get_key<'a, R, F: FnOnce(&'a Yaml) -> Option<R>>(map: &'a Hash,
43                                                    key: &'static str,
44                                                    f: F)
45                                                    -> Result<R, ParseSyntaxError> {
46    map.get(&Yaml::String(key.to_owned()))
47        .ok_or(ParseSyntaxError::MissingMandatoryKey(key))
48        .and_then(|x| f(x).ok_or(ParseSyntaxError::TypeMismatch))
49}
50
51fn str_to_scopes(s: &str, repo: &mut ScopeRepository) -> Result<Vec<Scope>, ParseSyntaxError> {
52    s.split_whitespace()
53        .map(|scope| repo.build(scope).map_err(ParseSyntaxError::InvalidScope))
54        .collect()
55}
56
/// Mutable state threaded through all the parsing functions for one syntax definition.
struct ParserState<'a> {
    /// Repository used to build every `Scope` that appears in the definition.
    scope_repo: &'a mut ScopeRepository,
    /// String entries of the top-level `variables` key, substituted for `{{name}}` in regexes.
    variables: HashMap<String, String>,
    /// Matches a `{{name}}` variable reference; capture group 1 is the variable name.
    variable_regex: Regex,
    /// Matches a backslash followed by a digit, i.e. a backreference in a regex.
    backref_regex: Regex,
    /// Selects which rewrite is applied to regexes: the one for lines that end with `\n`,
    /// or the one for lines that have had the newline stripped.
    lines_include_newline: bool,
}
64
// YAML source of the two synthetic contexts (`__start` and `__main`) that
// `add_initial_contexts` parses and adds to every syntax definition.
//
// `__start` must not include prototypes from the actual syntax definition,
// otherwise it's possible that a prototype makes us pop out of `__start`.
static START_CONTEXT: &str = "
__start:
    - meta_include_prototype: false
    - match: ''
      push: __main
__main:
    - include: main
";
75
76impl SyntaxDefinition {
77    /// In case you want to create your own SyntaxDefinition's in memory from strings.
78    ///
79    /// Generally you should use a [`SyntaxSet`].
80    ///
81    /// `fallback_name` is an optional name to use when the YAML doesn't provide a `name` key.
82    ///
83    /// [`SyntaxSet`]: ../struct.SyntaxSet.html
84    pub fn load_from_str(
85        s: &str,
86        lines_include_newline: bool,
87        fallback_name: Option<&str>,
88    ) -> Result<SyntaxDefinition, ParseSyntaxError> {
89        let docs = match YamlLoader::load_from_str(s) {
90            Ok(x) => x,
91            Err(e) => return Err(ParseSyntaxError::InvalidYaml(e)),
92        };
93        if docs.is_empty() {
94            return Err(ParseSyntaxError::EmptyFile);
95        }
96        let doc = &docs[0];
97        let mut scope_repo = SCOPE_REPO.lock().unwrap();
98        SyntaxDefinition::parse_top_level(doc, scope_repo.deref_mut(), lines_include_newline, fallback_name)
99    }
100
101    fn parse_top_level(doc: &Yaml,
102                       scope_repo: &mut ScopeRepository,
103                       lines_include_newline: bool,
104                       fallback_name: Option<&str>)
105                       -> Result<SyntaxDefinition, ParseSyntaxError> {
106        let h = doc.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
107
108        let mut variables = HashMap::new();
109        if let Ok(map) = get_key(h, "variables", |x| x.as_hash()) {
110            for (key, value) in map.iter() {
111                if let (Some(key_str), Some(val_str)) = (key.as_str(), value.as_str()) {
112                    variables.insert(key_str.to_owned(), val_str.to_owned());
113                }
114            }
115        }
116        let contexts_hash = get_key(h, "contexts", |x| x.as_hash())?;
117        let top_level_scope = scope_repo.build(get_key(h, "scope", |x| x.as_str())?)
118            .map_err(ParseSyntaxError::InvalidScope)?;
119        let mut state = ParserState {
120            scope_repo,
121            variables,
122            variable_regex: Regex::new(r"\{\{([A-Za-z0-9_]+)\}\}".into()),
123            backref_regex: Regex::new(r"\\\d".into()),
124            lines_include_newline,
125        };
126
127        let mut contexts = SyntaxDefinition::parse_contexts(contexts_hash, &mut state)?;
128        if !contexts.contains_key("main") {
129            return Err(ParseSyntaxError::MainMissing);
130        }
131
132        SyntaxDefinition::add_initial_contexts(
133            &mut contexts,
134            &mut state,
135            top_level_scope,
136        );
137
138        let mut file_extensions = Vec::new();
139        for extension_key in &["file_extensions", "hidden_file_extensions"] {
140            if let Ok(v) = get_key(h, extension_key, |x| x.as_vec()) {
141                file_extensions.extend(v.iter().filter_map(|y| y.as_str().map(|s| s.to_owned())))
142            }
143        }
144
145        let defn = SyntaxDefinition {
146            name: get_key(h, "name", |x| x.as_str()).unwrap_or_else(|_| fallback_name.unwrap_or("Unnamed")).to_owned(),
147            scope: top_level_scope,
148            file_extensions,
149            // TODO maybe cache a compiled version of this Regex
150            first_line_match: get_key(h, "first_line_match", |x| x.as_str())
151                .ok()
152                .map(|s| s.to_owned()),
153            hidden: get_key(h, "hidden", |x| x.as_bool()).unwrap_or(false),
154
155            variables: state.variables,
156            contexts,
157        };
158        Ok(defn)
159    }
160
161    fn parse_contexts(map: &Hash,
162                      state: &mut ParserState<'_>)
163                      -> Result<HashMap<String, Context>, ParseSyntaxError> {
164        let mut contexts = HashMap::new();
165        for (key, value) in map.iter() {
166            if let (Some(name), Some(val_vec)) = (key.as_str(), value.as_vec()) {
167                let is_prototype = name == "prototype";
168                let mut namer = ContextNamer::new(name);
169                SyntaxDefinition::parse_context(val_vec, state, &mut contexts, is_prototype, &mut namer)?;
170            }
171        }
172
173        Ok(contexts)
174    }
175
176    fn parse_context(vec: &[Yaml],
177                     // TODO: Maybe just pass the scope repo if that's all that's needed?
178                     state: &mut ParserState<'_>,
179                     contexts: &mut HashMap<String, Context>,
180                     is_prototype: bool,
181                     namer: &mut ContextNamer)
182                     -> Result<String, ParseSyntaxError> {
183        let mut context = Context::new(!is_prototype);
184        let name = namer.next();
185
186        for y in vec.iter() {
187            let map = y.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
188
189            let mut is_special = false;
190            if let Ok(x) = get_key(map, "meta_scope", |x| x.as_str()) {
191                context.meta_scope = str_to_scopes(x, state.scope_repo)?;
192                is_special = true;
193            }
194            if let Ok(x) = get_key(map, "meta_content_scope", |x| x.as_str()) {
195                context.meta_content_scope = str_to_scopes(x, state.scope_repo)?;
196                is_special = true;
197            }
198            if let Ok(x) = get_key(map, "meta_include_prototype", |x| x.as_bool()) {
199                context.meta_include_prototype = x;
200                is_special = true;
201            }
202            if let Ok(true) = get_key(map, "clear_scopes", |x| x.as_bool()) {
203                context.clear_scopes = Some(ClearAmount::All);
204                is_special = true;
205            }
206            if let Ok(x) = get_key(map, "clear_scopes", |x| x.as_i64()) {
207                context.clear_scopes = Some(ClearAmount::TopN(x as usize));
208                is_special = true;
209            }
210            if !is_special {
211                if let Ok(x) = get_key(map, "include", Some) {
212                    let reference = SyntaxDefinition::parse_reference(
213                        x, state, contexts, namer, false)?;
214                    context.patterns.push(Pattern::Include(reference));
215                } else {
216                    let pattern = SyntaxDefinition::parse_match_pattern(
217                        map, state, contexts, namer)?;
218                    if pattern.has_captures {
219                        context.uses_backrefs = true;
220                    }
221                    context.patterns.push(Pattern::Match(pattern));
222                }
223            }
224
225        }
226
227        contexts.insert(name.clone(), context);
228        Ok(name)
229    }
230
231    fn parse_reference(y: &Yaml,
232                       state: &mut ParserState<'_>,
233                       contexts: &mut HashMap<String, Context>,
234                       namer: &mut ContextNamer,
235                       with_escape: bool)
236                       -> Result<ContextReference, ParseSyntaxError> {
237        if let Some(s) = y.as_str() {
238            let parts: Vec<&str> = s.split('#').collect();
239            let sub_context = if parts.len() > 1 {
240                Some(parts[1].to_owned())
241            } else {
242                None
243            };
244            if parts[0].starts_with("scope:") {
245                Ok(ContextReference::ByScope {
246                    scope: state.scope_repo
247                        .build(&parts[0][6..])
248                        .map_err(ParseSyntaxError::InvalidScope)?,
249                    sub_context,
250                    with_escape,
251                })
252            } else if parts[0].ends_with(".sublime-syntax") {
253                let stem = Path::new(parts[0])
254                    .file_stem()
255                    .and_then(|x| x.to_str())
256                    .ok_or(ParseSyntaxError::BadFileRef)?;
257                Ok(ContextReference::File {
258                    name: stem.to_owned(),
259                    sub_context,
260                    with_escape,
261                })
262            } else {
263                Ok(ContextReference::Named(parts[0].to_owned()))
264            }
265        } else if let Some(v) = y.as_vec() {
266            let subname = SyntaxDefinition::parse_context(v, state, contexts, false, namer)?;
267            Ok(ContextReference::Inline(subname))
268        } else {
269            Err(ParseSyntaxError::TypeMismatch)
270        }
271    }
272
273    fn parse_match_pattern(map: &Hash,
274                           state: &mut ParserState<'_>,
275                           contexts: &mut HashMap<String, Context>,
276                           namer: &mut ContextNamer)
277                           -> Result<MatchPattern, ParseSyntaxError> {
278        let raw_regex = get_key(map, "match", |x| x.as_str())?;
279        let regex_str = Self::parse_regex(raw_regex, state)?;
280        // println!("{:?}", regex_str);
281
282        let scope = get_key(map, "scope", |x| x.as_str())
283            .ok()
284            .map(|s| str_to_scopes(s, state.scope_repo))
285            .unwrap_or_else(|| Ok(vec![]))?;
286
287        let captures = if let Ok(map) = get_key(map, "captures", |x| x.as_hash()) {
288            Some(Self::parse_captures(map, &regex_str, state)?)
289        } else {
290            None
291        };
292
293        let mut has_captures = false;
294        let operation = if get_key(map, "pop", Some).is_ok() {
295            // Thanks @wbond for letting me know this is the correct way to check for captures
296            has_captures = state.backref_regex.search(&regex_str, 0, regex_str.len(), None);
297            MatchOperation::Pop
298        } else if let Ok(y) = get_key(map, "push", Some) {
299            MatchOperation::Push(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
300        } else if let Ok(y) = get_key(map, "set", Some) {
301            MatchOperation::Set(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
302        } else if let Ok(y) = get_key(map, "embed", Some) {
303            // Same as push so we translate it to what it would be
304            let mut embed_escape_context_yaml = vec!();
305            let mut commands = Hash::new();
306            commands.insert(Yaml::String("meta_include_prototype".to_string()), Yaml::Boolean(false));
307            embed_escape_context_yaml.push(Yaml::Hash(commands));
308            if let Ok(s) = get_key(map, "embed_scope", Some) {
309                commands = Hash::new();
310                commands.insert(Yaml::String("meta_content_scope".to_string()), s.clone());
311                embed_escape_context_yaml.push(Yaml::Hash(commands));
312            }
313            if let Ok(v) = get_key(map, "escape", Some) {
314                let mut match_map = Hash::new();
315                match_map.insert(Yaml::String("match".to_string()), v.clone());
316                match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
317                if let Ok(y) = get_key(map, "escape_captures", Some) {
318                    match_map.insert(Yaml::String("captures".to_string()), y.clone());
319                }
320                embed_escape_context_yaml.push(Yaml::Hash(match_map));
321                let escape_context = SyntaxDefinition::parse_context(
322                    &embed_escape_context_yaml,
323                    state,
324                    contexts,
325                    false,
326                    namer,
327                )?;
328                MatchOperation::Push(vec![ContextReference::Inline(escape_context),
329                                          SyntaxDefinition::parse_reference(y, state, contexts, namer, true)?])
330            } else {
331                return Err(ParseSyntaxError::MissingMandatoryKey("escape"));
332            }
333
334        } else {
335            MatchOperation::None
336        };
337
338        let with_prototype = if let Ok(v) = get_key(map, "with_prototype", |x| x.as_vec()) {
339            // should a with_prototype include the prototype? I don't think so.
340            let subname = Self::parse_context(v, state, contexts, true, namer)?;
341            Some(ContextReference::Inline(subname))
342        } else if let Ok(v) = get_key(map, "escape", Some) {
343            let subname = namer.next();
344
345            let mut context = Context::new(false);
346            let mut match_map = Hash::new();
347            match_map.insert(Yaml::String("match".to_string()), Yaml::String(format!("(?={})", v.as_str().unwrap())));
348            match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
349            let pattern = SyntaxDefinition::parse_match_pattern(&match_map, state, contexts, namer)?;
350            if pattern.has_captures {
351                context.uses_backrefs = true;
352            }
353            context.patterns.push(Pattern::Match(pattern));
354
355            contexts.insert(subname.clone(), context);
356            Some(ContextReference::Inline(subname))
357        } else {
358            None
359        };
360
361        let pattern = MatchPattern::new(
362            has_captures,
363            regex_str,
364            scope,
365            captures,
366            operation,
367            with_prototype,
368        );
369
370        Ok(pattern)
371    }
372
373    fn parse_pushargs(y: &Yaml,
374                      state: &mut ParserState<'_>,
375                      contexts: &mut HashMap<String, Context>,
376                      namer: &mut ContextNamer)
377                      -> Result<Vec<ContextReference>, ParseSyntaxError> {
378        // check for a push of multiple items
379        if y.as_vec().map_or(false, |v| !v.is_empty() && (v[0].as_str().is_some() || (v[0].as_vec().is_some() && v[0].as_vec().unwrap()[0].as_hash().is_some()))) {
380            // this works because Result implements FromIterator to handle the errors
381            y.as_vec()
382                .unwrap()
383                .iter()
384                .map(|x| SyntaxDefinition::parse_reference(x, state, contexts, namer, false))
385                .collect()
386        } else {
387            let reference = SyntaxDefinition::parse_reference(y, state, contexts, namer, false)?;
388            Ok(vec![reference])
389        }
390    }
391
392    fn parse_regex(raw_regex: &str, state: &ParserState<'_>) -> Result<String, ParseSyntaxError> {
393        let regex = Self::resolve_variables(raw_regex, state);
394        let regex = replace_posix_char_classes(regex);
395        let regex = if state.lines_include_newline {
396            regex_for_newlines(regex)
397        } else {
398            // If the passed in strings don't include newlines (unlike Sublime) we can't match on
399            // them using the original regex. So this tries to rewrite the regex in a way that
400            // allows matching against lines without newlines (essentially replacing `\n` with `$`).
401            regex_for_no_newlines(regex)
402        };
403        Self::try_compile_regex(&regex)?;
404        Ok(regex)
405    }
406
407    fn resolve_variables(raw_regex: &str, state: &ParserState<'_>) -> String {
408        let mut result = String::new();
409        let mut index = 0;
410        let mut region = Region::new();
411        while state.variable_regex.search(raw_regex, index, raw_regex.len(), Some(&mut region)) {
412            let (begin, end) = region.pos(0).unwrap();
413
414            result.push_str(&raw_regex[index..begin]);
415
416            let var_pos = region.pos(1).unwrap();
417            let var_name = &raw_regex[var_pos.0..var_pos.1];
418            let var_raw = state.variables.get(var_name).map(String::as_ref).unwrap_or("");
419            let var_resolved = Self::resolve_variables(var_raw, state);
420            result.push_str(&var_resolved);
421
422            index = end;
423        }
424        if index < raw_regex.len() {
425            result.push_str(&raw_regex[index..]);
426        }
427        result
428    }
429
430    fn try_compile_regex(regex_str: &str) -> Result<(), ParseSyntaxError> {
431        // Replace backreferences with a placeholder value that will also appear in errors
432        let regex_str = substitute_backrefs_in_regex(regex_str, |i| Some(format!("<placeholder_{}>", i)));
433
434        if let Some(error) = Regex::try_compile(&regex_str) {
435            Err(ParseSyntaxError::RegexCompileError(regex_str, error))
436        } else {
437            Ok(())
438        }
439    }
440
441    fn parse_captures(
442        map: &Hash,
443        regex_str: &str,
444        state: &mut ParserState<'_>,
445    ) -> Result<CaptureMapping, ParseSyntaxError> {
446        let valid_indexes = get_consuming_capture_indexes(regex_str);
447        let mut captures = Vec::new();
448        for (key, value) in map.iter() {
449            if let (Some(key_int), Some(val_str)) = (key.as_i64(), value.as_str()) {
450                if valid_indexes.contains(&(key_int as usize)) {
451                    captures.push((key_int as usize, str_to_scopes(val_str, state.scope_repo)?));
452                }
453            }
454        }
455        Ok(captures)
456    }
457
458    /// Sublime treats the top level context slightly differently from
459    /// including the main context from other syntaxes. When main is popped
460    /// it is immediately re-added and when it is `set` over the file level
461    /// scope remains. This behaviour is emulated through some added contexts
462    /// that are the actual top level contexts used in parsing.
463    /// See <https://github.com/trishume/syntect/issues/58> for more.
464    fn add_initial_contexts(
465        contexts: &mut HashMap<String, Context>,
466        state: &mut ParserState<'_>,
467        top_level_scope: Scope,
468    ) {
469        let yaml_docs = YamlLoader::load_from_str(START_CONTEXT).unwrap();
470        let yaml = &yaml_docs[0];
471
472        let start_yaml : &[Yaml] = yaml["__start"].as_vec().unwrap();
473        SyntaxDefinition::parse_context(start_yaml, state, contexts, false, &mut ContextNamer::new("__start")).unwrap();
474        if let Some(start) = contexts.get_mut("__start") {
475            start.meta_content_scope = vec![top_level_scope];
476        }
477
478        let main_yaml : &[Yaml] = yaml["__main"].as_vec().unwrap();
479        SyntaxDefinition::parse_context(main_yaml, state, contexts, false, &mut ContextNamer::new("__main")).unwrap();
480
481        let meta_include_prototype = contexts["main"].meta_include_prototype;
482        let meta_scope = contexts["main"].meta_scope.clone();
483        let meta_content_scope = contexts["main"].meta_content_scope.clone();
484
485        if let Some(outer_main) = contexts.get_mut("__main") {
486            outer_main.meta_include_prototype = meta_include_prototype;
487            outer_main.meta_scope = meta_scope;
488            outer_main.meta_content_scope = meta_content_scope;
489        }
490
491        // add the top_level_scope as a meta_content_scope to main so
492        // pushes from other syntaxes add the file scope
493        // TODO: this order is not quite correct if main also has a meta_scope
494        if let Some(main) = contexts.get_mut("main") {
495            main.meta_content_scope.insert(0, top_level_scope);
496        }
497    }
498}
499
/// Hands out names for a top-level context and the anonymous contexts nested inside it.
struct ContextNamer {
    /// Name of the top-level context.
    name: String,
    /// `None` until the top-level name has been handed out, afterwards the index that the
    /// next anonymous context will get.
    anonymous_index: Option<usize>,
}

impl ContextNamer {
    /// Creates a namer for the top-level context called `name`.
    fn new(name: &str) -> ContextNamer {
        ContextNamer { name: name.to_owned(), anonymous_index: None }
    }

    /// Returns the next name: the plain context name on the first call, then
    /// `#anon_<name>_0`, `#anon_<name>_1`, ... on the following calls.
    fn next(&mut self) -> String {
        match self.anonymous_index {
            None => {
                self.anonymous_index = Some(0);
                self.name.clone()
            }
            Some(index) => {
                self.anonymous_index = Some(index + 1);
                format!("#anon_{}_{}", self.name, index)
            }
        }
    }
}
524
/// Rewrites the POSIX character classes `[:alpha:]`, `[:alnum:]`, `[:lower:]`, `[:upper:]`
/// and `[:digit:]` to Unicode character classes.
///
/// In fancy-regex, POSIX character classes only match ASCII characters, whereas Sublime's
/// syntaxes expect them to match Unicode characters as well.
fn replace_posix_char_classes(regex: String) -> String {
    // Applied in this order, one full pass over the text per entry.
    const REPLACEMENTS: [(&str, &str); 5] = [
        ("[:alpha:]", r"\p{L}"),
        ("[:alnum:]", r"\p{L}\p{N}"),
        ("[:lower:]", r"\p{Ll}"),
        ("[:upper:]", r"\p{Lu}"),
        ("[:digit:]", r"\p{Nd}"),
    ];

    REPLACEMENTS
        .iter()
        .fold(regex, |text, &(posix, unicode)| text.replace(posix, unicode))
}
536
537
538/// Some of the regexes include `$` and expect it to match end of line,
539/// e.g. *before* the `\n` in `test\n`.
540///
541/// In fancy-regex, `$` means end of text by default, so that would
542/// match *after* `\n`. Using `(?m:$)` instead means it matches end of line.
543///
544/// Note that we don't want to add a `(?m)` in the beginning to change the
545/// whole regex because that would also change the meaning of `^`. In
546/// fancy-regex, that also matches at the end of e.g. `test\n` which is
547/// different from onig. It would also change `.` to match more.
548fn regex_for_newlines(regex: String) -> String {
549    if !regex.contains('$') {
550        return regex;
551    }
552
553    let rewriter = RegexRewriterForNewlines {
554        parser: Parser::new(regex.as_bytes()),
555    };
556    rewriter.rewrite()
557}
558
559struct RegexRewriterForNewlines<'a> {
560    parser: Parser<'a>,
561}
562
563impl<'a> RegexRewriterForNewlines<'a> {
564    fn rewrite(mut self) -> String {
565        let mut result = Vec::new();
566
567        while let Some(c) = self.parser.peek() {
568            match c {
569                b'$' => {
570                    self.parser.next();
571                    result.extend_from_slice(br"(?m:$)");
572                }
573                b'\\' => {
574                    self.parser.next();
575                    result.push(c);
576                    if let Some(c2) = self.parser.peek() {
577                        self.parser.next();
578                        result.push(c2);
579                    }
580                }
581                b'[' => {
582                    let (mut content, _) = self.parser.parse_character_class();
583                    result.append(&mut content);
584                }
585                _ => {
586                    self.parser.next();
587                    result.push(c);
588                }
589            }
590        }
591        String::from_utf8(result).unwrap()
592    }
593}
594
595/// Rewrite a regex that matches `\n` to one that matches `$` (end of line) instead.
596/// That allows the regex to be used to match lines that don't include a trailing newline character.
597///
598/// The reason we're doing this is because the regexes in the syntax definitions assume that the
599/// lines that are being matched on include a trailing newline.
600///
601/// Note that the rewrite is just an approximation and there's a couple of cases it can not handle,
602/// due to `$` being an anchor whereas `\n` matches a character.
603fn regex_for_no_newlines(regex: String) -> String {
604    if !regex.contains(r"\n") {
605        return regex;
606    }
607
608    // A special fix to rewrite a pattern from the `Rd` syntax that the RegexRewriter can not
609    // handle properly.
610    let regex = regex.replace("(?:\\n)?", "(?:$|)");
611
612    let rewriter = RegexRewriterForNoNewlines {
613        parser: Parser::new(regex.as_bytes()),
614    };
615    rewriter.rewrite()
616}
617
618struct RegexRewriterForNoNewlines<'a> {
619    parser: Parser<'a>,
620}
621
622impl<'a> RegexRewriterForNoNewlines<'a> {
623    fn rewrite(mut self) -> String {
624        let mut result = Vec::new();
625        while let Some(c) = self.parser.peek() {
626            match c {
627                b'\\' => {
628                    self.parser.next();
629                    if let Some(c2) = self.parser.peek() {
630                        self.parser.next();
631                        // Replacing `\n` with `$` in `\n?` or `\n+` would make parsing later fail
632                        // with "target of repeat operator is invalid"
633                        let c3 = self.parser.peek();
634                        if c2 == b'n' && c3 != Some(b'?') && c3 != Some(b'+') && c3 != Some(b'*') {
635                            result.extend_from_slice(b"$");
636                        } else {
637                            result.push(c);
638                            result.push(c2);
639                        }
640                    } else {
641                        result.push(c);
642                    }
643                }
644                b'[' => {
645                    let (mut content, matches_newline) = self.parser.parse_character_class();
646                    if matches_newline && self.parser.peek() != Some(b'?') {
647                        result.extend_from_slice(b"(?:");
648                        result.append(&mut content);
649                        result.extend_from_slice(br"|$)");
650                    } else {
651                        result.append(&mut content);
652                    }
653                }
654                _ => {
655                    self.parser.next();
656                    result.push(c);
657                }
658            }
659        }
660        String::from_utf8(result).unwrap()
661    }
662}
663
664fn get_consuming_capture_indexes(regex: &str) -> Vec<usize> {
665    let parser = ConsumingCaptureIndexParser {
666        parser: Parser::new(regex.as_bytes()),
667    };
668    parser.get_consuming_capture_indexes()
669}
670
/// Scans a regex to find which capture groups lie outside of lookarounds.
struct ConsumingCaptureIndexParser<'a> {
    parser: Parser<'a>,
}

impl<'a> ConsumingCaptureIndexParser<'a> {
    /// Find capture groups which are not inside lookarounds.
    ///
    /// If, in a YAML syntax definition, a scope stack is applied to a capture group inside a
    /// lookaround, (i.e. "captures:\n x: scope.stack goes.here", where "x" is the number of a
    /// capture group in a lookahead/behind), those scopes are not applied, so no need to
    /// even parse them.
    ///
    /// The returned list always starts with 0, followed by the numbers of the qualifying
    /// groups in the order their opening parentheses appear.
    fn get_consuming_capture_indexes(mut self) -> Vec<usize> {
        let mut result = Vec::new();
        // One entry per currently open parenthesis: the lookaround state to restore when it
        // closes.
        let mut stack = Vec::new();
        let mut cap_num = 0;
        let mut in_lookaround = false;
        stack.push(in_lookaround);
        // Index 0 is always included.
        result.push(cap_num);

        while let Some(c) = self.parser.peek() {
            match c {
                b'\\' => {
                    // Skip the backslash and the byte it escapes, so `\(` is not a group.
                    self.parser.next();
                    self.parser.next();
                }
                b'[' => {
                    // Skip the whole character class; parentheses in it are literals.
                    self.parser.parse_character_class();
                }
                b'(' => {
                    self.parser.next();
                    // add the current lookaround state to the stack so we can just pop at a closing paren
                    stack.push(in_lookaround);
                    if let Some(c2) = self.parser.peek() {
                        if c2 != b'?' {
                            // simple numbered capture group
                            cap_num += 1;
                            // if we are not currently in a lookaround,
                            // add this capture group number to the valid ones
                            if !in_lookaround {
                                result.push(cap_num);
                            }
                        } else {
                            self.parser.next();
                            if let Some(c3) = self.parser.peek() {
                                self.parser.next();
                                if c3 == b'=' || c3 == b'!' {
                                    // lookahead
                                    in_lookaround = true;
                                } else if c3 == b'<' {
                                    // NOTE(review): `(?<name>...)` is not counted as a capture
                                    // group here, unlike `(?P<name>...)` — confirm this is intended.
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'=' || c4 == b'!' {
                                            self.parser.next();
                                            // lookbehind
                                            in_lookaround = true;
                                        }
                                    }
                                } else if c3 == b'P' {
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'<' {
                                            // named capture group
                                            cap_num += 1;
                                            // if we are not currently in a lookaround,
                                            // add this capture group number to the valid ones
                                            if !in_lookaround {
                                                result.push(cap_num);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                b')' => {
                    // Restore the lookaround state from before the matching `(`.
                    if let Some(value) = stack.pop() {
                        in_lookaround = value;
                    }
                    self.parser.next();
                }
                _ => {
                    self.parser.next();
                }
            }
        }
        result
    }
}
758
/// Minimal byte-oriented cursor over a regex source string.
///
/// It stores the input as raw bytes together with the index of the next byte to look at.
struct Parser<'a> {
    /// The complete input being scanned.
    bytes: &'a [u8],
    /// Index of the byte returned by `peek`. It is not bounds-checked when advanced.
    index: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the first byte of `bytes`.
    ///
    /// The returned parser borrows `bytes` for its whole lifetime, which is spelled out
    /// explicitly here instead of being hidden behind an elided lifetime in the return type.
    fn new(bytes: &'a [u8]) -> Self {
        Parser { bytes, index: 0 }
    }

    /// Returns the byte at the current position without consuming it,
    /// or `None` when the position is at or beyond the end of the input.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.index).copied()
    }

    /// Advances the position by one byte. No bounds check is performed; once the position is
    /// past the end, `peek` simply returns `None`.
    fn next(&mut self) {
        self.index += 1;
    }

    /// Consumes a character class and returns its raw bytes plus a newline flag.
    ///
    /// The byte at the current position is skipped and recorded as `[` without being
    /// inspected, so this is meant to be called while positioned on the opening bracket.
    /// Scanning stops after the `]` that closes the outermost class, or at end of input if
    /// the class is never closed.
    ///
    /// Returns `(content, matches_newline)`:
    /// * `content` holds every consumed byte, including the surrounding brackets.
    /// * `matches_newline` is `true` only when an escaped `\n` appears directly in a
    ///   non-negated, non-nested class.
    fn parse_character_class(&mut self) -> (Vec<u8>, bool) {
        let mut content = Vec::new();
        let mut nesting = 0;
        let mut matches_newline = false;

        // Step over the opening bracket.
        self.next();
        content.push(b'[');

        let negated = self.peek() == Some(b'^');
        if negated {
            self.next();
            content.push(b'^');
        }

        // An unescaped `]` is allowed after `[` or `[^` and doesn't mean the end of the class.
        if self.peek() == Some(b']') {
            self.next();
            content.push(b']');
        }

        while let Some(c) = self.peek() {
            // Every byte is consumed and recorded; the match only handles the extra bookkeeping.
            self.next();
            content.push(c);
            match c {
                b'\\' => {
                    // Take the escaped byte together with the backslash so that an escaped
                    // bracket is never mistaken for structure.
                    if let Some(escaped) = self.peek() {
                        self.next();
                        content.push(escaped);
                        if escaped == b'n' && !negated && nesting == 0 {
                            matches_newline = true;
                        }
                    }
                }
                b'[' => nesting += 1,
                b']' => {
                    if nesting == 0 {
                        break;
                    }
                    nesting -= 1;
                }
                _ => {}
            }
        }

        (content, matches_newline)
    }
}
836
837
838#[cfg(test)]
839mod tests {
840    use crate::parsing::syntax_definition::*;
841    use crate::parsing::Scope;
842    use super::*;
843
844    #[test]
845    fn can_parse() {
846        let defn: SyntaxDefinition =
847            SyntaxDefinition::load_from_str("name: C\nscope: source.c\ncontexts: {main: []}",
848                                            false, None)
849                .unwrap();
850        assert_eq!(defn.name, "C");
851        assert_eq!(defn.scope, Scope::new("source.c").unwrap());
852        let exts_empty: Vec<String> = Vec::new();
853        assert_eq!(defn.file_extensions, exts_empty);
854        assert!(!defn.hidden);
855        assert!(defn.variables.is_empty());
856        let defn2: SyntaxDefinition =
857            SyntaxDefinition::load_from_str("
858        name: C
859        scope: source.c
860        file_extensions: [c, h]
861        hidden_file_extensions: [k, l]
862        hidden: true
863        variables:
864          ident: '[QY]+'
865        contexts:
866          prototype:
867            - match: lol
868              scope: source.php
869          main:
870            - match: \\b(if|else|for|while|{{ident}})\\b
871              scope: keyword.control.c keyword.looping.c
872              captures:
873                  1: meta.preprocessor.c++
874                  2: keyword.control.include.c++
875              push: [string, 'scope:source.c#main', 'CSS.sublime-syntax#rule-list-body']
876              with_prototype:
877                - match: wow
878                  pop: true
879            - match: '\"'
880              push: string
881          string:
882            - meta_scope: string.quoted.double.c
883            - meta_include_prototype: false
884            - match: \\\\.
885              scope: constant.character.escape.c
886            - match: '\"'
887              pop: true
888        ",
889                                            false, None)
890                .unwrap();
891        assert_eq!(defn2.name, "C");
892        let top_level_scope = Scope::new("source.c").unwrap();
893        assert_eq!(defn2.scope, top_level_scope);
894        let exts: Vec<String> = vec!["c", "h", "k", "l"].into_iter().map(String::from).collect();
895        assert_eq!(defn2.file_extensions, exts);
896        assert!(defn2.hidden);
897        assert_eq!(defn2.variables.get("ident").unwrap(), "[QY]+");
898
899        let n: Vec<Scope> = Vec::new();
900        println!("{:?}", defn2);
901        // unreachable!();
902        let main = &defn2.contexts["main"];
903        assert_eq!(main.meta_content_scope, vec![top_level_scope]);
904        assert_eq!(main.meta_scope, n);
905        assert!(main.meta_include_prototype);
906
907        assert_eq!(defn2.contexts["__main"].meta_content_scope, n);
908        assert_eq!(defn2.contexts["__start"].meta_content_scope, vec![top_level_scope]);
909
910        assert_eq!(defn2.contexts["string"].meta_scope,
911                   vec![Scope::new("string.quoted.double.c").unwrap()]);
912        let first_pattern: &Pattern = &main.patterns[0];
913        match *first_pattern {
914            Pattern::Match(ref match_pat) => {
915                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
916                assert_eq!(&m[0], &(1,vec![Scope::new("meta.preprocessor.c++").unwrap()]));
917                use crate::parsing::syntax_definition::ContextReference::*;
918
919                // this is sadly necessary because Context is not Eq because of the Regex
920                let expected = MatchOperation::Push(vec![
921                    Named("string".to_owned()),
922                    ByScope {
923                        scope: Scope::new("source.c").unwrap(),
924                        sub_context: Some("main".to_owned()),
925                        with_escape: false,
926                    },
927                    File {
928                        name: "CSS".to_owned(),
929                        sub_context: Some("rule-list-body".to_owned()),
930                        with_escape: false,
931                    },
932                ]);
933                assert_eq!(format!("{:?}", match_pat.operation),
934                           format!("{:?}", expected));
935
936                assert_eq!(match_pat.scope,
937                           vec![Scope::new("keyword.control.c").unwrap(),
938                                Scope::new("keyword.looping.c").unwrap()]);
939
940                assert!(match_pat.with_prototype.is_some());
941            }
942            _ => unreachable!(),
943        }
944    }
945
946    #[test]
947    fn can_parse_embed_as_with_prototypes() {
948        let old_def = SyntaxDefinition::load_from_str(r#"
949        name: C
950        scope: source.c
951        file_extensions: [c, h]
952        variables:
953          ident: '[QY]+'
954        contexts:
955          main:
956            - match: '(>)\s*'
957              captures:
958                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
959              push:
960                - [{ meta_include_prototype: false }, { meta_content_scope: 'source.css.embedded.html' }, { match: '(?i)(?=</style)', pop: true }]
961                - scope:source.css
962              with_prototype:
963                - match: (?=(?i)(?=</style))
964                  pop: true
965        "#,false, None).unwrap();
966
967        let mut def_with_embed = SyntaxDefinition::load_from_str(r#"
968        name: C
969        scope: source.c
970        file_extensions: [c, h]
971        variables:
972          ident: '[QY]+'
973        contexts:
974          main:
975            - match: '(>)\s*'
976              captures:
977                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
978              embed: scope:source.css
979              embed_scope: source.css.embedded.html
980              escape: (?i)(?=</style)
981        "#,false, None).unwrap();
982
983        // We will soon do an `assert_eq!()`. But there is one difference we must expect, namely
984        // that for `def_with_embed`, the value of `ContextReference::ByScope::with_escape` will be
985        // `true`, whereas for `old_def` it will be `false`. So manually adjust `with_escape` to
986        // `false` so that `assert_eq!()` will work.
987        let def_with_embed_context = def_with_embed.contexts.get_mut("main").unwrap();
988        if let Pattern::Match(ref mut match_pattern) = def_with_embed_context.patterns[0] {
989            if let MatchOperation::Push(ref mut context_references) = match_pattern.operation {
990                if let ContextReference::ByScope {
991                    ref mut with_escape,
992                    ..
993                } = context_references[1]
994                {
995                    *with_escape = false;
996                }
997            }
998        }
999
1000        assert_eq!(old_def.contexts["main"], def_with_embed.contexts["main"]);
1001    }
1002
1003    #[test]
1004    fn errors_on_embed_without_escape() {
1005        let def = SyntaxDefinition::load_from_str(r#"
1006        name: C
1007        scope: source.c
1008        file_extensions: [c, h]
1009        variables:
1010          ident: '[QY]+'
1011        contexts:
1012          main:
1013            - match: '(>)\s*'
1014              captures:
1015                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1016              embed: scope:source.css
1017              embed_scope: source.css.embedded.html
1018        "#,false, None);
1019        assert!(def.is_err());
1020        match def.unwrap_err() {
1021            ParseSyntaxError::MissingMandatoryKey(key) => assert_eq!(key, "escape"),
1022            _ => unreachable!("Got unexpected ParseSyntaxError"),
1023        }
1024    }
1025
1026    #[test]
1027    fn errors_on_regex_compile_error() {
1028        let def = SyntaxDefinition::load_from_str(r#"
1029        name: C
1030        scope: source.c
1031        file_extensions: [test]
1032        contexts:
1033          main:
1034            - match: '[a'
1035              scope: keyword.name
1036        "#,false, None);
1037        assert!(def.is_err());
1038        match def.unwrap_err() {
1039            ParseSyntaxError::RegexCompileError(ref regex, _) => assert_eq!("[a", regex),
1040            _ => unreachable!("Got unexpected ParseSyntaxError"),
1041        }
1042    }
1043
1044    #[test]
1045    fn can_parse_ugly_yaml() {
1046        let defn: SyntaxDefinition =
1047            SyntaxDefinition::load_from_str("
1048        name: LaTeX
1049        scope: text.tex.latex
1050        contexts:
1051          main:
1052            - match: '((\\\\)(?:framebox|makebox))\\b'
1053              captures:
1054                1: support.function.box.latex
1055                2: punctuation.definition.backslash.latex
1056              push:
1057                - [{meta_scope: meta.function.box.latex}, {match: '', pop: true}]
1058                - argument
1059                - optional-arguments
1060          argument:
1061            - match: '\\{'
1062              scope: punctuation.definition.group.brace.begin.latex
1063            - match: '(?=\\S)'
1064              pop: true
1065          optional-arguments:
1066            - match: '(?=\\S)'
1067              pop: true
1068        ",
1069                                            false, None)
1070                .unwrap();
1071        assert_eq!(defn.name, "LaTeX");
1072        let top_level_scope = Scope::new("text.tex.latex").unwrap();
1073        assert_eq!(defn.scope, top_level_scope);
1074
1075        let first_pattern: &Pattern = &defn.contexts["main"].patterns[0];
1076        match *first_pattern {
1077            Pattern::Match(ref match_pat) => {
1078                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
1079                assert_eq!(&m[0], &(1,vec![Scope::new("support.function.box.latex").unwrap()]));
1080
1081                //use parsing::syntax_definition::ContextReference::*;
1082                // TODO: check the first pushed reference is Inline(...) and has a meta_scope of meta.function.box.latex
1083                // TODO: check the second pushed reference is Named("argument".to_owned())
1084                // TODO: check the third pushed reference is Named("optional-arguments".to_owned())
1085
1086                assert!(match_pat.with_prototype.is_none());
1087            }
1088            _ => unreachable!(),
1089        }
1090    }
1091
1092    #[test]
1093    fn names_anonymous_contexts() {
1094        let def = SyntaxDefinition::load_from_str(
1095            r#"
1096            scope: source.c
1097            contexts:
1098              main:
1099                - match: a
1100                  push: a
1101              a:
1102                - meta_scope: a
1103                - match: x
1104                  push:
1105                    - meta_scope: anonymous_x
1106                    - match: anything
1107                      push:
1108                        - meta_scope: anonymous_x_2
1109                - match: y
1110                  push:
1111                    - meta_scope: anonymous_y
1112                - match: z
1113                  escape: 'test'
1114            "#,
1115            false,
1116            None
1117        ).unwrap();
1118
1119        assert_eq!(def.contexts["a"].meta_scope, vec![Scope::new("a").unwrap()]);
1120        assert_eq!(def.contexts["#anon_a_0"].meta_scope, vec![Scope::new("anonymous_x").unwrap()]);
1121        assert_eq!(def.contexts["#anon_a_1"].meta_scope, vec![Scope::new("anonymous_x_2").unwrap()]);
1122        assert_eq!(def.contexts["#anon_a_2"].meta_scope, vec![Scope::new("anonymous_y").unwrap()]);
1123        assert_eq!(def.contexts["#anon_a_3"].patterns.len(), 1); // escape
1124    }
1125
1126    #[test]
1127    fn can_use_fallback_name() {
1128        let def = SyntaxDefinition::load_from_str(r#"
1129        scope: source.c
1130        contexts:
1131          main:
1132            - match: ''
1133        "#,false, Some("C"));
1134        assert_eq!(def.unwrap().name, "C");
1135    }
1136
1137    #[test]
1138    fn can_rewrite_regex_for_newlines() {
1139        fn rewrite(s: &str) -> String {
1140            regex_for_newlines(s.to_string())
1141        }
1142
1143        assert_eq!(&rewrite(r"a"), r"a");
1144        assert_eq!(&rewrite(r"\b"), r"\b");
1145        assert_eq!(&rewrite(r"(a)"), r"(a)");
1146        assert_eq!(&rewrite(r"[a]"), r"[a]");
1147        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1148        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1149        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1150
1151        assert_eq!(&rewrite(r"^"), r"^");
1152        assert_eq!(&rewrite(r"$"), r"(?m:$)");
1153        assert_eq!(&rewrite(r"^ab$"), r"^ab(?m:$)");
1154        assert_eq!(&rewrite(r"\^ab\$"), r"\^ab\$");
1155        assert_eq!(&rewrite(r"(//).*$"), r"(//).*(?m:$)");
1156
1157        // Do not rewrite this `$` because it's in a char class and doesn't mean end of line
1158        assert_eq!(&rewrite(r"[a$]"), r"[a$]");
1159    }
1160
1161    #[test]
1162    fn can_rewrite_regex_for_no_newlines() {
1163        fn rewrite(s: &str) -> String {
1164            regex_for_no_newlines(s.to_string())
1165        }
1166
1167        assert_eq!(&rewrite(r"a"), r"a");
1168        assert_eq!(&rewrite(r"\b"), r"\b");
1169        assert_eq!(&rewrite(r"(a)"), r"(a)");
1170        assert_eq!(&rewrite(r"[a]"), r"[a]");
1171        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1172        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1173        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1174
1175        assert_eq!(&rewrite(r"\n"), r"$");
1176        assert_eq!(&rewrite(r"\[\n"), r"\[$");
1177        assert_eq!(&rewrite(r"a\n?"), r"a\n?");
1178        assert_eq!(&rewrite(r"a\n+"), r"a\n+");
1179        assert_eq!(&rewrite(r"a\n*"), r"a\n*");
1180        assert_eq!(&rewrite(r"[abc\n]"), r"(?:[abc\n]|$)");
1181        assert_eq!(&rewrite(r"[^\n]"), r"[^\n]");
1182        assert_eq!(&rewrite(r"[^]\n]"), r"[^]\n]");
1183        assert_eq!(&rewrite(r"[\n]?"), r"[\n]?");
1184        // Removing the `\n` might result in an empty character class, so we should leave it.
1185        assert_eq!(&rewrite(r"[\n]"), r"(?:[\n]|$)");
1186        assert_eq!(&rewrite(r"[]\n]"), r"(?:[]\n]|$)");
1187        // In order to properly understand nesting, we'd have to have a full parser, so ignore it.
1188        assert_eq!(&rewrite(r"[[a]&&[\n]]"), r"[[a]&&[\n]]");
1189
1190        assert_eq!(&rewrite(r"ab(?:\n)?"), r"ab(?:$|)");
1191        assert_eq!(&rewrite(r"(?<!\n)ab"), r"(?<!$)ab");
1192        assert_eq!(&rewrite(r"(?<=\n)ab"), r"(?<=$)ab");
1193    }
1194
1195    #[test]
1196    fn can_get_valid_captures_from_regex() {
1197        let regex = "hello(test)(?=(world))(foo(?P<named>bar))";
1198        println!("{:?}", regex);
1199        let valid_indexes = get_consuming_capture_indexes(regex);
1200        println!("{:?}", valid_indexes);
1201        assert_eq!(valid_indexes, [0, 1, 3, 4]);
1202    }
1203
1204    #[test]
1205    fn can_get_valid_captures_from_regex2() {
1206        let regex = "hello(test)[(?=tricked](foo(bar))";
1207        println!("{:?}", regex);
1208        let valid_indexes = get_consuming_capture_indexes(regex);
1209        println!("{:?}", valid_indexes);
1210        assert_eq!(valid_indexes, [0, 1, 2, 3]);
1211    }
1212
1213    #[test]
1214    fn can_get_valid_captures_from_nested_regex() {
1215        let regex = "hello(test)(?=(world(?!(te(?<=(st))))))(foo(bar))";
1216        println!("{:?}", regex);
1217        let valid_indexes = get_consuming_capture_indexes(regex);
1218        println!("{:?}", valid_indexes);
1219        assert_eq!(valid_indexes, [0, 1, 5, 6]);
1220    }
1221}