pest/
position.rs

1// pest. The Elegant Parser
2// Copyright (c) 2018 Dragoș Tiselice
3//
4// Licensed under the Apache License, Version 2.0
5// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. All files in the project carrying such notice may not be copied,
8// modified, or distributed except according to those terms.
9
10use core::cmp::Ordering;
11use core::fmt;
12use core::hash::{Hash, Hasher};
13use core::ops::Range;
14use core::ptr;
15use core::str;
16
17use crate::span;
18
19/// A cursor position in a `&str` which provides useful methods to manually parse that string.
20#[derive(Clone, Copy)]
21pub struct Position<'i> {
22    input: &'i str,
23    pos: usize,
24}
25
26impl<'i> Position<'i> {
27    /// Create a new `Position` without checking invariants. (Checked with `debug_assertions`.)
28    pub(crate) fn new_internal(input: &str, pos: usize) -> Position<'_> {
29        debug_assert!(input.get(pos..).is_some());
30        Position { input, pos }
31    }
32
33    /// Attempts to create a new `Position` at the given position. If the specified position is
34    /// an invalid index, or the specified position is not a valid UTF8 boundary, then None is
35    /// returned.
36    ///
37    /// # Examples
38    /// ```
39    /// # use pest::Position;
40    /// let cheart = '💖';
41    /// let heart = "💖";
42    /// assert_eq!(Position::new(heart, 1), None);
43    /// assert_ne!(Position::new(heart, cheart.len_utf8()), None);
44    /// ```
45    pub fn new(input: &str, pos: usize) -> Option<Position<'_>> {
46        input.get(pos..).map(|_| Position { input, pos })
47    }
48
49    /// Creates a `Position` at the start of a `&str`.
50    ///
51    /// # Examples
52    ///
53    /// ```
54    /// # use pest::Position;
55    /// let start = Position::from_start("");
56    /// assert_eq!(start.pos(), 0);
57    /// ```
58    #[inline]
59    pub fn from_start(input: &'i str) -> Position<'i> {
60        // Position 0 is always safe because it's always a valid UTF-8 border.
61        Position { input, pos: 0 }
62    }
63
64    /// Returns the byte position of this `Position` as a `usize`.
65    ///
66    /// # Examples
67    ///
68    /// ```
69    /// # use pest::Position;
70    /// let input = "ab";
71    /// let mut start = Position::from_start(input);
72    ///
73    /// assert_eq!(start.pos(), 0);
74    /// ```
75    #[inline]
76    pub fn pos(&self) -> usize {
77        self.pos
78    }
79
80    /// Creates a `Span` from two `Position`s.
81    ///
82    /// # Panics
83    ///
84    /// Panics if the positions come from different inputs.
85    ///
86    /// # Examples
87    ///
88    /// ```
89    /// # use pest::Position;
90    /// let input = "ab";
91    /// let start = Position::from_start(input);
92    /// let span = start.span(&start.clone());
93    ///
94    /// assert_eq!(span.start(), 0);
95    /// assert_eq!(span.end(), 0);
96    /// ```
97    #[inline]
98    pub fn span(&self, other: &Position<'i>) -> span::Span<'i> {
99        if ptr::eq(self.input, other.input)
100        /* && self.input.get(self.pos..other.pos).is_some() */
101        {
102            span::Span::new_internal(self.input, self.pos, other.pos)
103        } else {
104            // TODO: maybe a panic if self.pos < other.pos
105            panic!("span created from positions from different inputs")
106        }
107    }
108
109    /// Returns the line and column number of this `Position`.
110    ///
111    /// This is an O(n) operation, where n is the number of chars in the input.
112    /// You better use [`pair.line_col()`](struct.Pair.html#method.line_col) instead.
113    ///
114    /// # Examples
115    ///
116    /// ```
117    /// # use pest;
118    /// # #[allow(non_camel_case_types)]
119    /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
120    /// enum Rule {}
121    ///
122    /// let input = "\na";
123    /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
124    /// let mut result = state.match_string("\na");
125    /// assert!(result.is_ok());
126    /// assert_eq!(result.unwrap().position().line_col(), (2, 2));
127    /// ```
128    #[inline]
129    pub fn line_col(&self) -> (usize, usize) {
130        if self.pos > self.input.len() {
131            panic!("position out of bounds");
132        }
133        let mut pos = self.pos;
134        let slice = &self.input[..pos];
135        let mut chars = slice.chars().peekable();
136
137        let mut line_col = (1, 1);
138
139        while pos != 0 {
140            match chars.next() {
141                Some('\r') => {
142                    if let Some(&'\n') = chars.peek() {
143                        chars.next();
144
145                        if pos == 1 {
146                            pos -= 1;
147                        } else {
148                            pos -= 2;
149                        }
150
151                        line_col = (line_col.0 + 1, 1);
152                    } else {
153                        pos -= 1;
154                        line_col = (line_col.0, line_col.1 + 1);
155                    }
156                }
157                Some('\n') => {
158                    pos -= 1;
159                    line_col = (line_col.0 + 1, 1);
160                }
161                Some(c) => {
162                    pos -= c.len_utf8();
163                    line_col = (line_col.0, line_col.1 + 1);
164                }
165                None => unreachable!(),
166            }
167        }
168
169        line_col
170    }
171
172    /// Returns the entire line of the input that contains this `Position`.
173    ///
174    /// # Examples
175    ///
176    /// ```
177    /// # use pest;
178    /// # #[allow(non_camel_case_types)]
179    /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
180    /// enum Rule {}
181    ///
182    /// let input = "\na";
183    /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
184    /// let mut result = state.match_string("\na");
185    /// assert!(result.is_ok());
186    /// assert_eq!(result.unwrap().position().line_of(), "a");
187    /// ```
188    #[inline]
189    pub fn line_of(&self) -> &'i str {
190        if self.pos > self.input.len() {
191            panic!("position out of bounds");
192        };
193        // Safe since start and end can only be valid UTF-8 borders.
194        &self.input[self.find_line_start()..self.find_line_end()]
195    }
196
197    pub(crate) fn find_line_start(&self) -> usize {
198        if self.input.is_empty() {
199            return 0;
200        };
201        // Position's pos is always a UTF-8 border.
202        let start = self
203            .input
204            .char_indices()
205            .rev()
206            .skip_while(|&(i, _)| i >= self.pos)
207            .find(|&(_, c)| c == '\n');
208        match start {
209            Some((i, _)) => i + 1,
210            None => 0,
211        }
212    }
213
214    pub(crate) fn find_line_end(&self) -> usize {
215        if self.input.is_empty() {
216            0
217        } else if self.pos == self.input.len() - 1 {
218            self.input.len()
219        } else {
220            // Position's pos is always a UTF-8 border.
221            let end = self
222                .input
223                .char_indices()
224                .skip_while(|&(i, _)| i < self.pos)
225                .find(|&(_, c)| c == '\n');
226            match end {
227                Some((i, _)) => i + 1,
228                None => self.input.len(),
229            }
230        }
231    }
232
233    /// Returns `true` when the `Position` points to the start of the input `&str`.
234    #[inline]
235    pub(crate) fn at_start(&self) -> bool {
236        self.pos == 0
237    }
238
239    /// Returns `true` when the `Position` points to the end of the input `&str`.
240    #[inline]
241    pub(crate) fn at_end(&self) -> bool {
242        self.pos == self.input.len()
243    }
244
245    /// Skips `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
246    /// otherwise. If the return value is `false`, `pos` will not be updated.
247    #[inline]
248    pub(crate) fn skip(&mut self, n: usize) -> bool {
249        let skipped = {
250            let mut len = 0;
251            // Position's pos is always a UTF-8 border.
252            let mut chars = self.input[self.pos..].chars();
253            for _ in 0..n {
254                if let Some(c) = chars.next() {
255                    len += c.len_utf8();
256                } else {
257                    return false;
258                }
259            }
260            len
261        };
262
263        self.pos += skipped;
264        true
265    }
266
267    /// Goes back `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
268    /// otherwise. If the return value is `false`, `pos` will not be updated.
269    #[inline]
270    pub(crate) fn skip_back(&mut self, n: usize) -> bool {
271        let skipped = {
272            let mut len = 0;
273            // Position's pos is always a UTF-8 border.
274            let mut chars = self.input[..self.pos].chars().rev();
275            for _ in 0..n {
276                if let Some(c) = chars.next() {
277                    len += c.len_utf8();
278                } else {
279                    return false;
280                }
281            }
282            len
283        };
284
285        self.pos -= skipped;
286        true
287    }
288
289    /// Skips until one of the given `strings` is found. If none of the `strings` can be found,
290    /// this function will return `false` but its `pos` will *still* be updated.
291    #[inline]
292    pub(crate) fn skip_until(&mut self, strings: &[&str]) -> bool {
293        #[cfg(not(feature = "memchr"))]
294        {
295            self.skip_until_basic(strings)
296        }
297        #[cfg(feature = "memchr")]
298        {
299            match strings {
300                [] => (),
301                [s1] => {
302                    if let Some(from) =
303                        memchr::memmem::find(&self.input.as_bytes()[self.pos..], s1.as_bytes())
304                    {
305                        self.pos += from;
306                        return true;
307                    }
308                }
309                [s1, s2] if !s1.is_empty() && !s2.is_empty() => {
310                    let b1 = s1.as_bytes()[0];
311                    let b2 = s2.as_bytes()[0];
312                    let miter = memchr::memchr2_iter(b1, b2, &self.input.as_bytes()[self.pos..]);
313                    for from in miter {
314                        let start = &self.input[self.pos + from..];
315                        if start.starts_with(s1) || start.starts_with(s2) {
316                            self.pos += from;
317                            return true;
318                        }
319                    }
320                }
321                [s1, s2, s3] if !s1.is_empty() && !s2.is_empty() && s3.is_empty() => {
322                    let b1 = s1.as_bytes()[0];
323                    let b2 = s2.as_bytes()[0];
324                    let b3 = s2.as_bytes()[0];
325                    let miter =
326                        memchr::memchr3_iter(b1, b2, b3, &self.input.as_bytes()[self.pos..]);
327                    for from in miter {
328                        let start = &self.input[self.pos + from..];
329                        if start.starts_with(s1) || start.starts_with(s2) || start.starts_with(s3) {
330                            self.pos += from;
331                            return true;
332                        }
333                    }
334                }
335                _ => {
336                    return self.skip_until_basic(strings);
337                }
338            }
339            self.pos = self.input.len();
340            false
341        }
342    }
343
344    #[inline]
345    fn skip_until_basic(&mut self, strings: &[&str]) -> bool {
346        // TODO: optimize with Aho-Corasick, e.g. https://crates.io/crates/daachorse?
347        for from in self.pos..self.input.len() {
348            let bytes = if let Some(string) = self.input.get(from..) {
349                string.as_bytes()
350            } else {
351                continue;
352            };
353
354            for slice in strings.iter() {
355                let to = slice.len();
356                if Some(slice.as_bytes()) == bytes.get(0..to) {
357                    self.pos = from;
358                    return true;
359                }
360            }
361        }
362
363        self.pos = self.input.len();
364        false
365    }
366
367    /// Matches the char at the `Position` against a specified character and returns `true` if a match
368    /// was made. If no match was made, returns `false`.
369    /// `pos` will not be updated in either case.
370    #[inline]
371    pub(crate) fn match_char(&self, c: char) -> bool {
372        matches!(self.input[self.pos..].chars().next(), Some(cc) if c == cc)
373    }
374
375    /// Matches the char at the `Position` against a filter function and returns `true` if a match
376    /// was made. If no match was made, returns `false` and `pos` will not be updated.
377    #[inline]
378    pub(crate) fn match_char_by<F>(&mut self, f: F) -> bool
379    where
380        F: FnOnce(char) -> bool,
381    {
382        if let Some(c) = self.input[self.pos..].chars().next() {
383            if f(c) {
384                self.pos += c.len_utf8();
385                true
386            } else {
387                false
388            }
389        } else {
390            false
391        }
392    }
393
394    /// Matches `string` from the `Position` and returns `true` if a match was made or `false`
395    /// otherwise. If no match was made, `pos` will not be updated.
396    #[inline]
397    pub(crate) fn match_string(&mut self, string: &str) -> bool {
398        let to = self.pos + string.len();
399
400        if Some(string.as_bytes()) == self.input.as_bytes().get(self.pos..to) {
401            self.pos = to;
402            true
403        } else {
404            false
405        }
406    }
407
408    /// Case-insensitively matches `string` from the `Position` and returns `true` if a match was
409    /// made or `false` otherwise. If no match was made, `pos` will not be updated.
410    #[inline]
411    pub(crate) fn match_insensitive(&mut self, string: &str) -> bool {
412        let matched = {
413            let slice = &self.input[self.pos..];
414            if let Some(slice) = slice.get(0..string.len()) {
415                slice.eq_ignore_ascii_case(string)
416            } else {
417                false
418            }
419        };
420
421        if matched {
422            self.pos += string.len();
423            true
424        } else {
425            false
426        }
427    }
428
429    /// Matches `char` `range` from the `Position` and returns `true` if a match was made or `false`
430    /// otherwise. If no match was made, `pos` will not be updated.
431    #[inline]
432    pub(crate) fn match_range(&mut self, range: Range<char>) -> bool {
433        if let Some(c) = self.input[self.pos..].chars().next() {
434            if range.start <= c && c <= range.end {
435                self.pos += c.len_utf8();
436                return true;
437            }
438        }
439
440        false
441    }
442}
443
444impl<'i> fmt::Debug for Position<'i> {
445    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
446        f.debug_struct("Position").field("pos", &self.pos).finish()
447    }
448}
449
450impl<'i> PartialEq for Position<'i> {
451    fn eq(&self, other: &Position<'i>) -> bool {
452        ptr::eq(self.input, other.input) && self.pos == other.pos
453    }
454}
455
456impl<'i> Eq for Position<'i> {}
457
458#[allow(clippy::non_canonical_partial_ord_impl)]
459impl<'i> PartialOrd for Position<'i> {
460    fn partial_cmp(&self, other: &Position<'i>) -> Option<Ordering> {
461        if ptr::eq(self.input, other.input) {
462            self.pos.partial_cmp(&other.pos)
463        } else {
464            None
465        }
466    }
467}
468
469impl<'i> Ord for Position<'i> {
470    fn cmp(&self, other: &Position<'i>) -> Ordering {
471        self.partial_cmp(other)
472            .expect("cannot compare positions from different strs")
473    }
474}
475
476impl<'i> Hash for Position<'i> {
477    fn hash<H: Hasher>(&self, state: &mut H) {
478        (self.input as *const str).hash(state);
479        self.pos.hash(state);
480    }
481}
482
483#[cfg(test)]
484mod tests {
485    use super::*;
486
487    #[test]
488    fn empty() {
489        let input = "";
490        assert!(Position::new(input, 0).unwrap().match_string(""));
491        assert!(!Position::new(input, 0).unwrap().match_string("a"));
492    }
493
494    #[test]
495    fn parts() {
496        let input = "asdasdf";
497
498        assert!(Position::new(input, 0).unwrap().match_string("asd"));
499        assert!(Position::new(input, 3).unwrap().match_string("asdf"));
500    }
501
502    #[test]
503    fn line_col() {
504        let input = "a\rb\nc\r\nd嗨";
505
506        assert_eq!(Position::new(input, 0).unwrap().line_col(), (1, 1));
507        assert_eq!(Position::new(input, 1).unwrap().line_col(), (1, 2));
508        assert_eq!(Position::new(input, 2).unwrap().line_col(), (1, 3));
509        assert_eq!(Position::new(input, 3).unwrap().line_col(), (1, 4));
510        assert_eq!(Position::new(input, 4).unwrap().line_col(), (2, 1));
511        assert_eq!(Position::new(input, 5).unwrap().line_col(), (2, 2));
512        assert_eq!(Position::new(input, 6).unwrap().line_col(), (2, 3));
513        assert_eq!(Position::new(input, 7).unwrap().line_col(), (3, 1));
514        assert_eq!(Position::new(input, 8).unwrap().line_col(), (3, 2));
515        assert_eq!(Position::new(input, 11).unwrap().line_col(), (3, 3));
516        let input = "abcd嗨";
517        assert_eq!(Position::new(input, 7).unwrap().line_col(), (1, 6));
518    }
519
520    #[test]
521    fn line_of() {
522        let input = "a\rb\nc\r\nd嗨";
523
524        assert_eq!(Position::new(input, 0).unwrap().line_of(), "a\rb\n");
525        assert_eq!(Position::new(input, 1).unwrap().line_of(), "a\rb\n");
526        assert_eq!(Position::new(input, 2).unwrap().line_of(), "a\rb\n");
527        assert_eq!(Position::new(input, 3).unwrap().line_of(), "a\rb\n");
528        assert_eq!(Position::new(input, 4).unwrap().line_of(), "c\r\n");
529        assert_eq!(Position::new(input, 5).unwrap().line_of(), "c\r\n");
530        assert_eq!(Position::new(input, 6).unwrap().line_of(), "c\r\n");
531        assert_eq!(Position::new(input, 7).unwrap().line_of(), "d嗨");
532        assert_eq!(Position::new(input, 8).unwrap().line_of(), "d嗨");
533        assert_eq!(Position::new(input, 11).unwrap().line_of(), "d嗨");
534    }
535
536    #[test]
537    fn line_of_empty() {
538        let input = "";
539
540        assert_eq!(Position::new(input, 0).unwrap().line_of(), "");
541    }
542
543    #[test]
544    fn line_of_new_line() {
545        let input = "\n";
546
547        assert_eq!(Position::new(input, 0).unwrap().line_of(), "\n");
548    }
549
550    #[test]
551    fn line_of_between_new_line() {
552        let input = "\n\n";
553
554        assert_eq!(Position::new(input, 1).unwrap().line_of(), "\n");
555    }
556
557    fn measure_skip(input: &str, pos: usize, n: usize) -> Option<usize> {
558        let mut p = Position::new(input, pos).unwrap();
559        if p.skip(n) {
560            Some(p.pos - pos)
561        } else {
562            None
563        }
564    }
565
566    #[test]
567    fn skip_empty() {
568        let input = "";
569
570        assert_eq!(measure_skip(input, 0, 0), Some(0));
571        assert_eq!(measure_skip(input, 0, 1), None);
572    }
573
574    #[test]
575    fn skip() {
576        let input = "d嗨";
577
578        assert_eq!(measure_skip(input, 0, 0), Some(0));
579        assert_eq!(measure_skip(input, 0, 1), Some(1));
580        assert_eq!(measure_skip(input, 1, 1), Some(3));
581    }
582
583    #[test]
584    fn skip_until() {
585        let input = "ab ac";
586        let pos = Position::from_start(input);
587
588        let mut test_pos = pos;
589        test_pos.skip_until(&["a", "b"]);
590        assert_eq!(test_pos.pos(), 0);
591
592        test_pos = pos;
593        test_pos.skip_until(&["b"]);
594        assert_eq!(test_pos.pos(), 1);
595
596        test_pos = pos;
597        test_pos.skip_until(&["ab"]);
598        assert_eq!(test_pos.pos(), 0);
599
600        test_pos = pos;
601        test_pos.skip_until(&["ac", "z"]);
602        assert_eq!(test_pos.pos(), 3);
603
604        test_pos = pos;
605        assert!(!test_pos.skip_until(&["z"]));
606        assert_eq!(test_pos.pos(), 5);
607    }
608
609    #[test]
610    fn match_range() {
611        let input = "b";
612
613        assert!(Position::new(input, 0).unwrap().match_range('a'..'c'));
614        assert!(Position::new(input, 0).unwrap().match_range('b'..'b'));
615        assert!(!Position::new(input, 0).unwrap().match_range('a'..'a'));
616        assert!(!Position::new(input, 0).unwrap().match_range('c'..'c'));
617        assert!(Position::new(input, 0).unwrap().match_range('a'..'嗨'));
618    }
619
620    #[test]
621    fn match_insensitive() {
622        let input = "AsdASdF";
623
624        assert!(Position::new(input, 0).unwrap().match_insensitive("asd"));
625        assert!(Position::new(input, 3).unwrap().match_insensitive("asdf"));
626    }
627
628    #[test]
629    fn cmp() {
630        let input = "a";
631        let start = Position::from_start(input);
632        let mut end = start;
633
634        assert!(end.skip(1));
635        let result = start.cmp(&end);
636
637        assert_eq!(result, Ordering::Less);
638    }
639
640    #[test]
641    #[should_panic]
642    fn cmp_panic() {
643        let input1 = "a";
644        let input2 = "b";
645        let pos1 = Position::from_start(input1);
646        let pos2 = Position::from_start(input2);
647
648        let _ = pos1.cmp(&pos2);
649    }
650
651    #[test]
652    #[cfg(feature = "std")]
653    fn hash() {
654        use std::collections::HashSet;
655
656        let input = "a";
657        let start = Position::from_start(input);
658        let mut positions = HashSet::new();
659
660        positions.insert(start);
661    }
662}