pulldown_cmark/
scanners.rs

Help
1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Scanners for fragments of CommonMark syntax
22
23use std::char;
24
25use crate::parse::HtmlScanGuard;
26pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
27use crate::strings::CowStr;
28use crate::{entities, BlockQuoteKind, HeadingLevel};
29use crate::{Alignment, LinkType};
30
31use memchr::memchr;
32
// Lowercase HTML tag names, listed in ascending byte order so that the table
// can be binary-searched. New entries must keep that ordering, and the array
// length in the type must be kept in step with the number of entries.
// NOTE(review): presumably the tag set that opens a CommonMark "type 6" HTML
// block -- confirm against the code that consults this table.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "search",
    "section",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98
/// Analysis of the beginning of a line, including indentation and container
/// markers.
///
/// A `LineStart` is a cursor over a byte slice: the scanning methods advance
/// `ix` as they consume indentation and markers. It is `Clone` so that a
/// method can snapshot the state and restore it when a scan fails.
#[derive(Clone)]
pub(crate) struct LineStart<'a> {
    // The input being scanned.
    bytes: &'a [u8],
    // Index into `bytes` of the next byte to be scanned.
    ix: usize,

    // The index in `bytes` after the last tab we scanned; initially
    // zero.
    //
    // Thus, there are no tab characters between `ix` and here, and for
    // the purpose of defining block structure, this position can be
    // considered to fall on a tab stop.
    //
    // This is only valid while scanning the initial portion of the
    // line; methods that work with interior structure don't bother to
    // update it.
    tab_start: usize,

    // In contexts where spaces help to define block structure, tabs
    // behave as if they were replaced by spaces with a tab stop of 4
    // characters.
    //
    // If we have scanned past a tab character but not consumed all
    // the horizontal width it contributed, this is the number of
    // spaces logically remaining, before the character at `ix`.
    spaces_remaining: usize,

    // No thematic breaks can occur before this offset.
    // This prevents scanning the same bytes for a thematic break over
    // and over again.
    min_hrule_offset: usize,
}
131
132impl<'a> LineStart<'a> {
133    pub(crate) fn new(bytes: &[u8]) -> LineStart<'_> {
134        LineStart {
135            bytes,
136            tab_start: 0,
137            ix: 0,
138            spaces_remaining: 0,
139            min_hrule_offset: 0,
140        }
141    }
142
143    /// Try to scan a number of spaces.
144    ///
145    /// Returns true if all spaces were consumed.
146    ///
147    /// Note: consumes some spaces even if not successful.
148    pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
149        self.scan_space_inner(n_space) == 0
150    }
151
152    /// Scan a number of spaces up to a maximum.
153    ///
154    /// Returns number of spaces scanned.
155    pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
156        n_space - self.scan_space_inner(n_space)
157    }
158
159    /// Returns unused remainder of spaces.
160    fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
161        // Consume any common prefix between the number of spaces we
162        // want and the number of unscanned tab-introduced spaces.
163        let n_from_remaining = self.spaces_remaining.min(n_space);
164        self.spaces_remaining -= n_from_remaining;
165        n_space -= n_from_remaining;
166
167        while n_space > 0 && self.ix < self.bytes.len() {
168            match self.bytes[self.ix] {
169                b' ' => {
170                    self.ix += 1;
171                    n_space -= 1;
172                }
173                b'\t' => {
174                    let spaces = 4 - (self.ix - self.tab_start) % 4;
175                    self.ix += 1;
176                    self.tab_start = self.ix;
177                    let n = spaces.min(n_space);
178                    n_space -= n;
179
180                    // Record the unscanned portion of the tab.
181                    self.spaces_remaining = spaces - n;
182                }
183                _ => break,
184            }
185        }
186        n_space
187    }
188
189    /// Scan all available ASCII whitespace (not including eol).
190    pub(crate) fn scan_all_space(&mut self) {
191        self.spaces_remaining = 0;
192        self.ix += self.bytes[self.ix..]
193            .iter()
194            .take_while(|&&b| b == b' ' || b == b'\t')
195            .count();
196    }
197
198    /// Determine whether we're at end of line (includes end of file).
199    pub(crate) fn is_at_eol(&self) -> bool {
200        self.bytes
201            .get(self.ix)
202            .map(|&c| c == b'\r' || c == b'\n')
203            .unwrap_or(true)
204    }
205
206    fn scan_ch(&mut self, c: u8) -> bool {
207        if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
208            self.ix += 1;
209            true
210        } else {
211            false
212        }
213    }
214
215    fn scan_case_insensitive(&mut self, tag: &[u8]) -> bool {
216        if self.bytes.len() - self.ix < tag.len() {
217            return false;
218        }
219        let prefix = &self.bytes[self.ix..self.ix + tag.len()];
220        let ok = prefix.eq_ignore_ascii_case(tag);
221        if ok {
222            self.ix += tag.len();
223        }
224        ok
225    }
226
227    pub(crate) fn scan_blockquote_tag(&mut self) -> Option<BlockQuoteKind> {
228        let saved_ix = self.ix;
229        let tag = if self.scan_ch(b'[') && self.scan_ch(b'!') {
230            let tag = if self.scan_case_insensitive(b"note") {
231                Some(BlockQuoteKind::Note)
232            } else if self.scan_case_insensitive(b"tip") {
233                Some(BlockQuoteKind::Tip)
234            } else if self.scan_case_insensitive(b"important") {
235                Some(BlockQuoteKind::Important)
236            } else if self.scan_case_insensitive(b"warning") {
237                Some(BlockQuoteKind::Warning)
238            } else if self.scan_case_insensitive(b"caution") {
239                Some(BlockQuoteKind::Caution)
240            } else {
241                None
242            };
243            if tag.is_some() && self.scan_ch(b']') {
244                if let Some(nl) = scan_blank_line(&self.bytes[self.ix..]) {
245                    self.ix += nl;
246                    tag
247                } else {
248                    None
249                }
250            } else {
251                None
252            }
253        } else {
254            None
255        };
256        if tag.is_none() {
257            self.ix = saved_ix;
258        }
259        tag
260    }
261
262    pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
263        if self.scan_ch(b'>') {
264            let _ = self.scan_space(1);
265            true
266        } else {
267            false
268        }
269    }
270
271    /// Scan a definition marker.
272    ///
273    /// Definition markers are single colons, preceded by at most three spaces
274    /// and followed by at most three spaces. The indentation of following
275    /// lines is equal to the whole size of the marker, including the colon.
276    ///
277    /// If one is found, it will make the preceding paragraph into a definition
278    /// list title.
279    ///
280    /// Return value is the amount of indentation, or `None` if it's not a
281    /// definition list marker.
282    pub(crate) fn scan_definition_list_definition_marker_with_indent(
283        &mut self,
284        indent: usize,
285    ) -> Option<usize> {
286        let save = self.clone();
287        if self.scan_ch(b':') {
288            let remaining = 4 - (indent + 1);
289            Some(indent + 1 + self.scan_space_upto(remaining))
290        } else {
291            *self = save;
292            None
293        }
294    }
295
    /// Scan a list marker.
    ///
    /// `indent` is the number of columns already consumed in front of the
    /// marker; it is folded into the returned indentation.
    ///
    /// Return value is the marker character, the start number, and the indent
    /// in spaces. For ordered list markers, the character will be one of b'.'
    /// or b')' and the start number is the parsed decimal value. For bullet
    /// list markers, the character will be one of b'-', b'+', or b'*' and the
    /// start number is 0.
    ///
    /// Returns `None`, with the scanner restored to its state on entry, when
    /// no list marker is present or when the line is a thematic break.
    pub(crate) fn scan_list_marker_with_indent(
        &mut self,
        indent: usize,
    ) -> Option<(u8, u64, usize)> {
        // Snapshot used to undo partial progress on every failure path.
        let save = self.clone();
        if self.ix < self.bytes.len() {
            let c = self.bytes[self.ix];
            if c == b'-' || c == b'+' || c == b'*' {
                if self.ix >= self.min_hrule_offset {
                    // there could be an hrule here
                    if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
                        // NOTE(review): `min_offset` is relative to `self.ix`
                        // but is stored and later compared as if it were an
                        // absolute index. That only makes the shortcut less
                        // effective, not wrong -- confirm it is intended.
                        self.min_hrule_offset = min_offset;
                    } else {
                        // A thematic break takes precedence over a list item.
                        *self = save;
                        return None;
                    }
                }
                self.ix += 1;
                // The marker must be followed by whitespace or the line end.
                if self.scan_space(1) || self.is_at_eol() {
                    return self.finish_list_marker(c, 0, indent + 2);
                }
            } else if c.is_ascii_digit() {
                let start_ix = self.ix;
                let mut ix = self.ix + 1;
                let mut val = u64::from(c - b'0');
                // Look at no more than nine further bytes, so a delimiter is
                // only accepted after at most nine digits.
                while ix < self.bytes.len() && ix - start_ix < 10 {
                    let c = self.bytes[ix];
                    ix += 1;
                    if c.is_ascii_digit() {
                        val = val * 10 + u64::from(c - b'0');
                    } else if c == b')' || c == b'.' {
                        self.ix = ix;
                        if self.scan_space(1) || self.is_at_eol() {
                            // Width is the digits plus the delimiter
                            // (`ix - start_ix`) plus one column after it.
                            return self.finish_list_marker(c, val, indent + 1 + ix - start_ix);
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        // Not a list marker: undo any partial progress.
        *self = save;
        None
    }
347
348    fn finish_list_marker(
349        &mut self,
350        c: u8,
351        start: u64,
352        mut indent: usize,
353    ) -> Option<(u8, u64, usize)> {
354        let save = self.clone();
355
356        // skip the rest of the line if it's blank
357        if scan_blank_line(&self.bytes[self.ix..]).is_some() {
358            return Some((c, start, indent));
359        }
360
361        let post_indent = self.scan_space_upto(4);
362        if post_indent < 4 {
363            indent += post_indent;
364        } else {
365            *self = save;
366        }
367        Some((c, start, indent))
368    }
369
370    /// Returns Some(is_checked) when a task list marker was found. Resets itself
371    /// to original state otherwise.
372    pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
373        let save = self.clone();
374        self.scan_space_upto(3);
375
376        if !self.scan_ch(b'[') {
377            *self = save;
378            return None;
379        }
380        let is_checked = match self.bytes.get(self.ix) {
381            Some(&c) if is_ascii_whitespace_no_nl(c) => {
382                self.ix += 1;
383                false
384            }
385            Some(b'x') | Some(b'X') => {
386                self.ix += 1;
387                true
388            }
389            _ => {
390                *self = save;
391                return None;
392            }
393        };
394        if !self.scan_ch(b']') {
395            *self = save;
396            return None;
397        }
398        if !self
399            .bytes
400            .get(self.ix)
401            .map(|&b| is_ascii_whitespace_no_nl(b))
402            .unwrap_or(false)
403        {
404            *self = save;
405            return None;
406        }
407        Some(is_checked)
408    }
409
    /// Returns the current index into the input, i.e. the number of bytes
    /// consumed so far.
    pub(crate) fn bytes_scanned(&self) -> usize {
        self.ix
    }
413
    /// Returns the number of columns contributed by an already scanned tab
    /// that have not been consumed yet.
    pub(crate) fn remaining_space(&self) -> usize {
        self.spaces_remaining
    }
417}
418
/// Returns `true` for tab, line feed, vertical tab, form feed, carriage
/// return and space.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ')
}
422
/// Returns `true` for the ASCII whitespace bytes that do not end a line:
/// tab, vertical tab, form feed and space.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
426
/// Returns `true` for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z')
}
430
/// Returns `true` for ASCII letters and ASCII decimal digits.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
434
435fn is_ascii_letterdigitdash(c: u8) -> bool {
436    c == b'-' || is_ascii_alphanumeric(c)
437}
438
/// Returns `true` for the ASCII decimal digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9')
}
442
/// Returns `true` when `c` may appear in an unquoted attribute value, i.e.
/// for every byte except quotes, space, `=`, `>`, `<`, backtick, line feed
/// and carriage return.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    match c {
        b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
        _ => true,
    }
}
449
/// Scans a single byte: returns 1 when `data` starts with `c`, and 0
/// otherwise (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    usize::from(data.first() == Some(&c))
}
458
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`. `f` is called front to back and never again after it first
/// returns `false`.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().position(|&c| !f(c)).unwrap_or(data.len())
}
465
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`. `f` is called back to front and never again after it first
/// returns `false`.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    match data.iter().rposition(|&c| !f(c)) {
        // Everything after the last rejected byte belongs to the suffix.
        Some(last_rejected) => data.len() - 1 - last_rejected,
        None => data.len(),
    }
}
472
/// Returns how many times the byte `c` is repeated at the start of `data`.
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
    scan_while(data, |x| x == c)
}
476
/// Returns the number of leading bytes of `data` that are non-newline ASCII
/// whitespace (tab, vertical tab, form feed or space).
///
/// Note: this scans ASCII whitespace only, for Unicode whitespace use
/// a different function.
pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
    scan_while(data, is_ascii_whitespace_no_nl)
}
482
/// Returns the number of leading bytes of `data` that are allowed in an
/// unquoted attribute value.
fn scan_attr_value_chars(data: &[u8]) -> usize {
    scan_while(data, is_valid_unquoted_attr_value_char)
}
486
/// Scans a line ending at the start of `bytes`.
///
/// Returns the length of the line ending: 0 at the end of the input, 1 for
/// `\n` or a lone `\r`, 2 for `\r\n`. Returns `None` when `bytes` starts with
/// any other byte.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes {
        [] => Some(0),
        [b'\n', ..] => Some(1),
        [b'\r', b'\n', ..] => Some(2),
        [b'\r', ..] => Some(1),
        _ => None,
    }
}
497
498pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
499    let i = scan_whitespace_no_nl(bytes);
500    scan_eol(&bytes[i..]).map(|n| i + n)
501}
502
503pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
504    memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
505}
506
507// return: end byte for closing code fence, or None
508// if the line is not a closing code fence
509pub(crate) fn scan_closing_code_fence(
510    bytes: &[u8],
511    fence_char: u8,
512    n_fence_char: usize,
513) -> Option<usize> {
514    if bytes.is_empty() {
515        return Some(0);
516    }
517    let mut i = 0;
518    let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
519    if num_fence_chars_found < n_fence_char {
520        return None;
521    }
522    i += num_fence_chars_found;
523    let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
524    i += num_trailing_spaces;
525    scan_eol(&bytes[i..]).map(|_| i)
526}
527
528// return: end byte for closing metadata block, or None
529// if the line is not a closing metadata block
530pub(crate) fn scan_closing_metadata_block(bytes: &[u8], fence_char: u8) -> Option<usize> {
531    let mut i = 0;
532    let mut num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
533    if num_fence_chars_found != 3 {
534        // if YAML style metadata block the closing character can also be `.`
535        if fence_char == b'-' {
536            num_fence_chars_found = scan_ch_repeat(&bytes[i..], b'.');
537            if num_fence_chars_found != 3 {
538                return None;
539            }
540        } else {
541            return None;
542        }
543    }
544    i += num_fence_chars_found;
545    let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
546    i += num_trailing_spaces;
547    scan_eol(&bytes[i..]).map(|_| i)
548}
549
/// Measures the indentation at the start of `text`, stopping once `max`
/// columns have been counted.
///
/// A space counts as one column; a tab advances to the next multiple of four
/// and is only taken when that does not exceed `max`.
///
/// The returned pair is (byte offset, number of columns). The offset is the
/// index of the last byte that was examined, so it is one less than the
/// number of bytes consumed when the scan stops on the space that reaches
/// `max` or runs off the end of `text`.
pub(crate) fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut columns = 0;
    let mut last_examined = 0;

    for (pos, byte) in text.iter().copied().enumerate() {
        last_examined = pos;
        if byte == b' ' {
            columns += 1;
            if columns == max {
                break;
            }
        } else if byte == b'\t' {
            let next_tab_stop = columns + 4 - (columns & 3);
            if next_tab_stop > max {
                break;
            }
            columns = next_tab_stop;
        } else {
            break;
        }
    }

    (last_examined, columns)
}
577
578/// Scan hrule opening sequence.
579///
580/// Returns Ok(x) when it finds an hrule, where x is the
581/// size of line containing the hrule, including the trailing newline.
582///
583/// Returns Err(x) when it does not find an hrule and x is
584/// the offset in data before no hrule can appear.
585pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
586    if bytes.len() < 3 {
587        return Err(0);
588    }
589    let c = bytes[0];
590    if !(c == b'*' || c == b'-' || c == b'_') {
591        return Err(0);
592    }
593    let mut n = 0;
594    let mut i = 0;
595
596    while i < bytes.len() {
597        match bytes[i] {
598            b'\n' | b'\r' => {
599                i += scan_eol(&bytes[i..]).unwrap_or(0);
600                break;
601            }
602            c2 if c2 == c => {
603                n += 1;
604            }
605            b' ' | b'\t' => (),
606            _ => return Err(i),
607        }
608        i += 1;
609    }
610    if n >= 3 {
611        Ok(i)
612    } else {
613        Err(i)
614    }
615}
616
617/// Scan an ATX heading opening sequence.
618///
619/// Returns number of bytes in prefix and level.
620pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
621    let level = scan_ch_repeat(data, b'#');
622    if data.get(level).copied().map_or(true, is_ascii_whitespace) {
623        HeadingLevel::try_from(level).ok()
624    } else {
625        None
626    }
627}
628
629/// Scan a setext heading underline.
630///
631/// Returns number of bytes in line (including trailing newline) and level.
632pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
633    let c = *data.first()?;
634    let level = if c == b'=' {
635        HeadingLevel::H1
636    } else if c == b'-' {
637        HeadingLevel::H2
638    } else {
639        return None;
640    };
641    let mut i = 1 + scan_ch_repeat(&data[1..], c);
642    i += scan_blank_line(&data[i..])?;
643    Some((i, level))
644}
645
646// returns number of bytes in line (including trailing
647// newline) and column alignments
648pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
649    let (mut i, spaces) = calc_indent(data, 4);
650    if spaces > 3 || i == data.len() {
651        return (0, vec![]);
652    }
653    let mut cols = vec![];
654    let mut active_col = Alignment::None;
655    let mut start_col = true;
656    let mut found_pipe = false;
657    let mut found_hyphen = false;
658    let mut found_hyphen_in_col = false;
659    if data[i] == b'|' {
660        i += 1;
661        found_pipe = true;
662    }
663    for c in &data[i..] {
664        if let Some(n) = scan_eol(&data[i..]) {
665            i += n;
666            break;
667        }
668        match *c {
669            b' ' => (),
670            b':' => {
671                active_col = match (start_col, active_col) {
672                    (true, Alignment::None) => Alignment::Left,
673                    (false, Alignment::Left) => Alignment::Center,
674                    (false, Alignment::None) => Alignment::Right,
675                    _ => active_col,
676                };
677                start_col = false;
678            }
679            b'-' => {
680                start_col = false;
681                found_hyphen = true;
682                found_hyphen_in_col = true;
683            }
684            b'|' => {
685                start_col = true;
686                found_pipe = true;
687                cols.push(active_col);
688                active_col = Alignment::None;
689                if !found_hyphen_in_col {
690                    // It isn't a table head if it has back-to-back pipes.
691                    return (0, vec![]);
692                }
693                found_hyphen_in_col = false;
694            }
695            _ => {
696                // It isn't a table head if it has characters outside the allowed set.
697                return (0, vec![]);
698            }
699        }
700        i += 1;
701    }
702
703    if !start_col {
704        cols.push(active_col);
705    }
706    if !found_pipe || !found_hyphen {
707        // It isn't a table head if it doesn't have a least one pipe or hyphen.
708        // It's a list, a header, or a thematic break.
709        return (0, vec![]);
710    }
711
712    (i, cols)
713}
714
715/// Scan code fence.
716///
717/// Returns number of bytes scanned and the char that is repeated to make the code fence.
718pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
719    let c = *data.first()?;
720    if !(c == b'`' || c == b'~') {
721        return None;
722    }
723    let i = 1 + scan_ch_repeat(&data[1..], c);
724    if i >= 3 {
725        if c == b'`' {
726            let suffix = &data[i..];
727            let next_line = i + scan_nextline(suffix);
728            // FIXME: make sure this is correct
729            if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
730                return None;
731            }
732        }
733        Some((i, c))
734    } else {
735        None
736    }
737}
738
739/// Scan metadata block, returning the number of delimiter bytes
740/// (always 3 for now) and the delimiter character.
741///
742/// Differently to code blocks, metadata blocks must be closed with the closing
743/// sequence not being a valid terminator the end of the file.
744///
745/// In addition, they cannot be empty (closing sequence in the next line) and
746/// the next line cannot be an empty line.
747pub(crate) fn scan_metadata_block(
748    data: &[u8],
749    yaml_style_enabled: bool,
750    pluses_style_enabled: bool,
751) -> Option<(usize, u8)> {
752    // Only if metadata blocks are enabled
753    if yaml_style_enabled || pluses_style_enabled {
754        let c = *data.first()?;
755        if !((c == b'-' && yaml_style_enabled) || (c == b'+' && pluses_style_enabled)) {
756            return None;
757        }
758        let i = 1 + scan_ch_repeat(&data[1..], c);
759        // Only trailing spaces after the delimiters in the line
760        let next_line = scan_nextline(&data[i..]);
761        for c in &data[i..i + next_line] {
762            if !c.is_ascii_whitespace() {
763                return None;
764            }
765        }
766        if i == 3 {
767            // Search the closing sequence
768            let mut j = i;
769            let mut first_line = true;
770            while j < data.len() {
771                j += scan_nextline(&data[j..]);
772                let closed = scan_closing_metadata_block(&data[j..], c).is_some();
773                // The first line of the metadata block cannot be an empty line
774                // nor the end of the block
775                if first_line {
776                    if closed || scan_blank_line(&data[j..]).is_some() {
777                        return None;
778                    }
779                    first_line = false;
780                }
781                if closed {
782                    return Some((i, c));
783                }
784            }
785            None
786        } else {
787            None
788        }
789    } else {
790        None
791    }
792}
793
/// Scans a `>` block quote marker at the start of `data`, together with a
/// single following space if there is one.
///
/// Returns the number of bytes in the marker (1 or 2), or `None` when `data`
/// does not start with `>`.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match data {
        [b'>', b' ', ..] => Some(2),
        [b'>', ..] => Some(1),
        _ => None,
    }
}
806
807/// return number of bytes scanned, delimiter, start index, and indent
808pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
809    let mut c = *bytes.first()?;
810    let (w, start) = match c {
811        b'-' | b'+' | b'*' => (1, 0),
812        b'0'..=b'9' => {
813            let (length, start) = parse_decimal(bytes, 9);
814            c = *bytes.get(length)?;
815            if !(c == b'.' || c == b')') {
816                return None;
817            }
818            (length + 1, start)
819        }
820        _ => {
821            return None;
822        }
823    };
824    // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
825    let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
826    if postindent == 0 {
827        scan_eol(&bytes[w..])?;
828        postindent += 1;
829    } else if postindent > 4 {
830        postn = 1;
831        postindent = 1;
832    }
833    if scan_blank_line(&bytes[w..]).is_some() {
834        postn = 0;
835        postindent = 1;
836    }
837    Some((w + postn, c, start, w + postindent))
838}
839
840// returns (number of bytes, parsed decimal)
841fn parse_decimal(bytes: &[u8], limit: usize) -> (usize, usize) {
842    match bytes
843        .iter()
844        .take(limit)
845        .take_while(|&&b| is_digit(b))
846        .try_fold((0, 0usize), |(count, acc), c| {
847            let digit = usize::from(c - b'0');
848            match acc
849                .checked_mul(10)
850                .and_then(|ten_acc| ten_acc.checked_add(digit))
851            {
852                Some(number) => Ok((count + 1, number)),
853                // stop early on overflow
854                None => Err((count, acc)),
855            }
856        }) {
857        Ok(p) | Err(p) => p,
858    }
859}
860
/// Parses a run of ASCII hexadecimal digits (either case) at the start of
/// `bytes`, looking at no more than `limit` bytes.
///
/// Returns (number of bytes consumed, parsed value). Parsing stops at the
/// first non-hex byte, after `limit` bytes, or just before the digit that
/// would overflow `usize`.
fn parse_hex(bytes: &[u8], limit: usize) -> (usize, usize) {
    let mut consumed = 0;
    let mut value = 0usize;
    for &byte in bytes.iter().take(limit) {
        let digit = match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            _ => break,
        };
        match value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)))
        {
            Some(next) => {
                value = next;
                consumed += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (consumed, value)
}
891
/// Converts a numeric code point into a `char`.
///
/// Returns `None` for zero, for values that do not fit in a `u32`, and for
/// values that are not valid Unicode scalar values.
fn char_from_codepoint(input: usize) -> Option<char> {
    match u32::try_from(input) {
        Ok(0) | Err(_) => None,
        Ok(codepoint) => char::from_u32(codepoint),
    }
}
899
900// doesn't bother to check data[0] == '&'
901pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
902    let mut end = 1;
903    if scan_ch(&bytes[end..], b'#') == 1 {
904        end += 1;
905        let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
906            end += 1;
907            parse_hex(&bytes[end..], 6)
908        } else {
909            parse_decimal(&bytes[end..], 7)
910        };
911        end += bytecount;
912        return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
913            (0, None)
914        } else {
915            (
916                end + 1,
917                Some(char_from_codepoint(codepoint).unwrap_or('\u{FFFD}').into()),
918            )
919        };
920    }
921    end += scan_while(&bytes[end..], is_ascii_alphanumeric);
922    if scan_ch(&bytes[end..], b';') == 1 {
923        if let Some(value) = entities::get_entity(&bytes[1..end]) {
924            return (end + 1, Some(value.into()));
925        }
926    }
927    (0, None)
928}
929
930// note: dest returned is raw, still needs to be unescaped
931// TODO: check that nested parens are really not allowed for refdefs
932// TODO(performance): this func should probably its own unescaping
933pub(crate) fn scan_link_dest(
934    data: &str,
935    start_ix: usize,
936    max_next: usize,
937) -> Option<(usize, &str)> {
938    let bytes = &data.as_bytes()[start_ix..];
939    let mut i = scan_ch(bytes, b'<');
940
941    if i != 0 {
942        // pointy links
943        while i < bytes.len() {
944            match bytes[i] {
945                b'\n' | b'\r' | b'<' => return None,
946                b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
947                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
948                    i += 1;
949                }
950                _ => {}
951            }
952            i += 1;
953        }
954        None
955    } else {
956        // non-pointy links
957        let mut nest = 0;
958        while i < bytes.len() {
959            match bytes[i] {
960                0x0..=0x20 => {
961                    break;
962                }
963                b'(' => {
964                    if nest > max_next {
965                        return None;
966                    }
967                    nest += 1;
968                }
969                b')' => {
970                    if nest == 0 {
971                        break;
972                    }
973                    nest -= 1;
974                }
975                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
976                    i += 1;
977                }
978                _ => {}
979            }
980            i += 1;
981        }
982        if nest != 0 {
983            return None;
984        }
985        Some((i, &data[start_ix..(start_ix + i)]))
986    }
987}
988
989/// Returns bytes scanned
990fn scan_attribute_name(data: &[u8]) -> Option<usize> {
991    let (&c, tail) = data.split_first()?;
992    if is_ascii_alpha(c) || c == b'_' || c == b':' {
993        Some(
994            1 + scan_while(tail, |c| {
995                is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
996            }),
997        )
998    } else {
999        None
1000    }
1001}
1002
1003/// Returns the index immediately following the attribute on success.
1004/// The argument `buffer_ix` refers to the index into `data` from which we
1005/// should copy into `buffer` when we find bytes to skip.
1006fn scan_attribute(
1007    data: &[u8],
1008    mut ix: usize,
1009    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1010    buffer: &mut Vec<u8>,
1011    buffer_ix: &mut usize,
1012) -> Option<usize> {
1013    ix += scan_attribute_name(&data[ix..])?;
1014    let ix_after_attribute = ix;
1015    ix = scan_whitespace_with_newline_handler_without_buffer(data, ix, newline_handler)?;
1016    if scan_ch(&data[ix..], b'=') == 1 {
1017        ix = scan_whitespace_with_newline_handler(
1018            data,
1019            ix_after_attribute,
1020            newline_handler,
1021            buffer,
1022            buffer_ix,
1023        )?;
1024        ix += 1;
1025        ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
1026        ix = scan_attribute_value(data, ix, newline_handler, buffer, buffer_ix)?;
1027        Some(ix)
1028    } else {
1029        // Leave whitespace for next attribute.
1030        Some(ix_after_attribute)
1031    }
1032}
1033
1034/// Scans whitespace and possibly newlines according to the
1035/// behavior defined by the newline handler. When bytes are skipped,
1036/// all preceding non-skipped bytes are pushed to the buffer.
1037fn scan_whitespace_with_newline_handler(
1038    data: &[u8],
1039    mut i: usize,
1040    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1041    buffer: &mut Vec<u8>,
1042    buffer_ix: &mut usize,
1043) -> Option<usize> {
1044    while i < data.len() {
1045        if !is_ascii_whitespace(data[i]) {
1046            return Some(i);
1047        }
1048        if let Some(eol_bytes) = scan_eol(&data[i..]) {
1049            let handler = newline_handler?;
1050            i += eol_bytes;
1051            let skipped_bytes = handler(&data[i..]);
1052
1053            if skipped_bytes > 0 {
1054                buffer.extend(&data[*buffer_ix..i]);
1055                *buffer_ix = i + skipped_bytes;
1056            }
1057
1058            i += skipped_bytes;
1059        } else {
1060            i += 1;
1061        }
1062    }
1063
1064    Some(i)
1065}
1066
1067/// Scans whitespace and possible newlines according to the behavior defined
1068/// by the newline handler.
1069///
1070/// Unlike [`scan_whitespace_with_newline_handler`], this function doesn't
1071/// copy skipped data into a buffer. Typically, if this function
1072/// returns `Some`, a call to `scan_whitespace_with_newline_handler` will
1073/// soon follow.
1074fn scan_whitespace_with_newline_handler_without_buffer(
1075    data: &[u8],
1076    mut i: usize,
1077    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1078) -> Option<usize> {
1079    while i < data.len() {
1080        if !is_ascii_whitespace(data[i]) {
1081            return Some(i);
1082        }
1083        if let Some(eol_bytes) = scan_eol(&data[i..]) {
1084            let handler = newline_handler?;
1085            i += eol_bytes;
1086            let skipped_bytes = handler(&data[i..]);
1087            i += skipped_bytes;
1088        } else {
1089            i += 1;
1090        }
1091    }
1092
1093    Some(i)
1094}
1095
1096/// Returns the index immediately following the attribute value on success.
1097fn scan_attribute_value(
1098    data: &[u8],
1099    mut i: usize,
1100    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1101    buffer: &mut Vec<u8>,
1102    buffer_ix: &mut usize,
1103) -> Option<usize> {
1104    match *data.get(i)? {
1105        b @ b'"' | b @ b'\'' => {
1106            i += 1;
1107            while i < data.len() {
1108                if data[i] == b {
1109                    return Some(i + 1);
1110                }
1111                if let Some(eol_bytes) = scan_eol(&data[i..]) {
1112                    let handler = newline_handler?;
1113                    i += eol_bytes;
1114                    let skipped_bytes = handler(&data[i..]);
1115
1116                    if skipped_bytes > 0 {
1117                        buffer.extend(&data[*buffer_ix..i]);
1118                        *buffer_ix = i + skipped_bytes;
1119                    }
1120                    i += skipped_bytes;
1121                } else {
1122                    i += 1;
1123                }
1124            }
1125            return None;
1126        }
1127        b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
1128            return None;
1129        }
1130        _ => {
1131            // unquoted attribute value
1132            i += scan_attr_value_chars(&data[i..]);
1133        }
1134    }
1135
1136    Some(i)
1137}
1138
1139// Remove backslash escapes and resolve entities
1140pub(crate) fn unescape<'a, I: Into<CowStr<'a>>>(input: I, is_in_table: bool) -> CowStr<'a> {
1141    let input = input.into();
1142    let mut result = String::new();
1143    let mut mark = 0;
1144    let mut i = 0;
1145    let bytes = input.as_bytes();
1146    while i < bytes.len() {
1147        match bytes[i] {
1148            // Tables are special, because they're parsed as-if the tables
1149            // were parsed in a discrete pass, changing `\|` to `|`, and then
1150            // passing the changed string to the inline parser.
1151            b'\\'
1152                if is_in_table
1153                    && i + 2 < bytes.len()
1154                    && bytes[i + 1] == b'\\'
1155                    && bytes[i + 2] == b'|' =>
1156            {
1157                // even number of `\`s before pipe
1158                // odd number is handled in the normal way below
1159                result.push_str(&input[mark..i]);
1160                mark = i + 2;
1161                i += 3;
1162            }
1163            b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
1164                result.push_str(&input[mark..i]);
1165                mark = i + 1;
1166                i += 2;
1167            }
1168            b'&' => match scan_entity(&bytes[i..]) {
1169                (n, Some(value)) => {
1170                    result.push_str(&input[mark..i]);
1171                    result.push_str(&value);
1172                    i += n;
1173                    mark = i;
1174                }
1175                _ => i += 1,
1176            },
1177            b'\r' => {
1178                result.push_str(&input[mark..i]);
1179                i += 1;
1180                mark = i;
1181            }
1182            _ => i += 1,
1183        }
1184    }
1185    if mark == 0 {
1186        input
1187    } else {
1188        result.push_str(&input[mark..]);
1189        result.into()
1190    }
1191}
1192
1193/// Assumes `data` is preceded by `<`.
1194pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
1195    let i = scan_ch(data, b'/');
1196    let tail = &data[i..];
1197    let n = scan_while(tail, is_ascii_alphanumeric);
1198    if !is_html_tag(&tail[..n]) {
1199        return false;
1200    }
1201    // Starting condition says the next byte must be either a space, a tab,
1202    // the end of the line, the string >, or the string />
1203    let tail = &tail[n..];
1204    tail.is_empty()
1205        || tail[0] == b' '
1206        || tail[0] == b'\t'
1207        || tail[0] == b'\r'
1208        || tail[0] == b'\n'
1209        || tail[0] == b'>'
1210        || tail.len() >= 2 && &tail[..2] == b"/>"
1211}
1212
1213fn is_html_tag(tag: &[u8]) -> bool {
1214    HTML_TAGS
1215        .binary_search_by(|probe| {
1216            let probe_bytes_iter = probe.as_bytes().iter();
1217            let tag_bytes_iter = tag.iter();
1218
1219            probe_bytes_iter
1220                .zip(tag_bytes_iter)
1221                .find_map(|(&a, &b)| {
1222                    // We can compare case insensitively because the probes are
1223                    // all lower case alpha strings.
1224                    match a.cmp(&(b | 0x20)) {
1225                        std::cmp::Ordering::Equal => None,
1226                        inequality => Some(inequality),
1227                    }
1228                })
1229                .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1230        })
1231        .is_ok()
1232}
1233
1234/// Assumes that `data` starts with `<`.
1235/// Returns the index into data directly after the html tag on success.
1236pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1237    // Block type html does not allow for newlines, so we
1238    // do not pass a newline handler.
1239    let (_span, i) = scan_html_block_inner(data, None)?;
1240    scan_blank_line(&data[i..])?;
1241    Some(i)
1242}
1243
1244/// Assumes that `data` starts with `<`.
1245/// Returns the number of bytes scanned and the html in case of
1246/// success.
1247/// When some bytes were skipped, because the html was split over
1248/// multiple leafs (e.g. over multiple lines in a blockquote),
1249/// the html is returned as a vector of bytes.
1250/// If no bytes were skipped, the buffer will be empty.
1251pub(crate) fn scan_html_block_inner(
1252    data: &[u8],
1253    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1254) -> Option<(Vec<u8>, usize)> {
1255    let mut buffer = Vec::new();
1256    let mut last_buf_index = 0;
1257
1258    let close_tag_bytes = scan_ch(&data[1..], b'/');
1259    let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
1260    if l == 0 {
1261        return None;
1262    }
1263    let mut i = 1 + close_tag_bytes + l;
1264    i += scan_while(&data[i..], is_ascii_letterdigitdash);
1265
1266    if close_tag_bytes == 0 {
1267        loop {
1268            let old_i = i;
1269            loop {
1270                i += scan_whitespace_no_nl(&data[i..]);
1271                if let Some(eol_bytes) = scan_eol(&data[i..]) {
1272                    if eol_bytes == 0 {
1273                        return None;
1274                    }
1275                    let handler = newline_handler?;
1276                    i += eol_bytes;
1277                    let skipped_bytes = handler(&data[i..]);
1278
1279                    let data_len = data.len() - i;
1280
1281                    debug_assert!(
1282                        skipped_bytes <= data_len,
1283                        "Handler tried to skip too many bytes, fed {}, skipped {}",
1284                        data_len,
1285                        skipped_bytes
1286                    );
1287
1288                    if skipped_bytes > 0 {
1289                        buffer.extend(&data[last_buf_index..i]);
1290                        i += skipped_bytes;
1291                        last_buf_index = i;
1292                    }
1293                } else {
1294                    break;
1295                }
1296            }
1297            if let Some(b'/') | Some(b'>') = data.get(i) {
1298                break;
1299            }
1300            if old_i == i {
1301                // No whitespace, which is mandatory.
1302                return None;
1303            }
1304            i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1305        }
1306    }
1307
1308    i += scan_whitespace_no_nl(&data[i..]);
1309
1310    if close_tag_bytes == 0 {
1311        i += scan_ch(&data[i..], b'/');
1312    }
1313
1314    if scan_ch(&data[i..], b'>') == 0 {
1315        None
1316    } else {
1317        i += 1;
1318        if !buffer.is_empty() {
1319            buffer.extend(&data[last_buf_index..i]);
1320        }
1321        Some((buffer, i))
1322    }
1323}
1324
1325/// Returns (next_byte_offset, uri, type)
1326pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1327    scan_uri(text, start_ix)
1328        .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1329        .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1330}
1331
1332/// Returns (next_byte_offset, uri)
1333fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1334    let bytes = &text.as_bytes()[start_ix..];
1335
1336    // scheme's first byte must be an ascii letter
1337    if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1338        return None;
1339    }
1340
1341    let mut i = 1;
1342
1343    while i < bytes.len() {
1344        let c = bytes[i];
1345        i += 1;
1346        match c {
1347            c if is_ascii_alphanumeric(c) => (),
1348            b'.' | b'-' | b'+' => (),
1349            b':' => break,
1350            _ => return None,
1351        }
1352    }
1353
1354    // scheme length must be between 2 and 32 characters long. scheme
1355    // must be followed by colon
1356    if !(3..=33).contains(&i) {
1357        return None;
1358    }
1359
1360    while i < bytes.len() {
1361        match bytes[i] {
1362            b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1363            b'\0'..=b' ' | b'<' => return None,
1364            _ => (),
1365        }
1366        i += 1;
1367    }
1368
1369    None
1370}
1371
1372/// Returns (next_byte_offset, email)
1373fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1374    // using a regex library would be convenient, but doing it by hand is not too bad
1375    let bytes = &text.as_bytes()[start_ix..];
1376    let mut i = 0;
1377
1378    while i < bytes.len() {
1379        let c = bytes[i];
1380        i += 1;
1381        match c {
1382            c if is_ascii_alphanumeric(c) => (),
1383            b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1384            | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1385            b'@' if i > 1 => break,
1386            _ => return None,
1387        }
1388    }
1389
1390    loop {
1391        let label_start_ix = i;
1392        let mut fresh_label = true;
1393
1394        while i < bytes.len() {
1395            match bytes[i] {
1396                c if is_ascii_alphanumeric(c) => (),
1397                b'-' if fresh_label => {
1398                    return None;
1399                }
1400                b'-' => (),
1401                _ => break,
1402            }
1403            fresh_label = false;
1404            i += 1;
1405        }
1406
1407        if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1408            return None;
1409        }
1410
1411        if scan_ch(&bytes[i..], b'.') == 0 {
1412            break;
1413        }
1414        i += 1;
1415    }
1416
1417    if scan_ch(&bytes[i..], b'>') == 0 {
1418        return None;
1419    }
1420
1421    Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1422}
1423
1424/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1425/// Returns byte offset on match.
1426pub(crate) fn scan_inline_html_comment(
1427    bytes: &[u8],
1428    mut ix: usize,
1429    scan_guard: &mut HtmlScanGuard,
1430) -> Option<usize> {
1431    let c = *bytes.get(ix)?;
1432    ix += 1;
1433    match c {
1434        // An HTML comment consists of `<!-->`, `<!--->`, or  `<!--`, a string of characters not
1435        // including the string `-->`, and `-->`.
1436        b'-' if ix > scan_guard.comment => {
1437            // HTML comment needs two hyphens after the !.
1438            if *bytes.get(ix)? != b'-' {
1439                return None;
1440            }
1441            // Yes, we're intentionally going backwards.
1442            // We want the cursor to point here:
1443            //
1444            //     <!--
1445            //       ^
1446            //
1447            // This way, the `<!-->` case is covered by the loop below.
1448            ix -= 1;
1449
1450            while let Some(x) = memchr(b'-', &bytes[ix..]) {
1451                ix += x + 1;
1452                scan_guard.comment = ix;
1453                if scan_ch(&bytes[ix..], b'-') == 1 && scan_ch(&bytes[ix + 1..], b'>') == 1 {
1454                    return Some(ix + 2);
1455                }
1456            }
1457            None
1458        }
1459        // A CDATA section consists of the string `<![CDATA[`, a string of characters not
1460        // including the string `]]>`, and the string `]]>`.
1461        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1462            ix += b"CDATA[".len();
1463            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1464            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1465            ix += close_brackets;
1466
1467            if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1468                scan_guard.cdata = ix;
1469                None
1470            } else {
1471                Some(ix + 1)
1472            }
1473        }
1474        // A declaration consists of the string `<!`, an ASCII letter, zero or more characters not
1475        // including the character >, and the character >.
1476        _ if c.is_ascii_alphabetic() && ix > scan_guard.declaration => {
1477            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1478            if scan_ch(&bytes[ix..], b'>') == 0 {
1479                scan_guard.declaration = ix;
1480                None
1481            } else {
1482                Some(ix + 1)
1483            }
1484        }
1485        _ => None,
1486    }
1487}
1488
1489/// Scan processing directive, with initial "<?" already consumed.
1490/// Returns the next byte offset on success.
1491pub(crate) fn scan_inline_html_processing(
1492    bytes: &[u8],
1493    mut ix: usize,
1494    scan_guard: &mut HtmlScanGuard,
1495) -> Option<usize> {
1496    if ix <= scan_guard.processing {
1497        return None;
1498    }
1499    while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1500        ix += offset + 1;
1501        if scan_ch(&bytes[ix..], b'>') == 1 {
1502            return Some(ix + 1);
1503        }
1504    }
1505    scan_guard.processing = ix;
1506    None
1507}
1508
#[cfg(test)]
mod test {
    use super::*;

    /// A list marker number with far too many digits must be rejected.
    #[test]
    fn overflow_list() {
        let marker = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(marker).is_none());
    }

    /// A list marker number slightly too large must be rejected as well.
    #[test]
    fn overflow_by_addition() {
        let marker = b"1844674407370955161615!";
        assert!(scan_listitem(marker).is_none());
    }

    /// Each input is scanned from index 1, i.e. just after the `<`.
    #[test]
    fn good_emails() {
        const ACCEPTED: &[&str] = &[
            "<a@b.c>",
            "<a@b>",
            "<a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-@example.com>",
            "<a@sixty-three-letters-in-this-identifier-----------------------63>",
        ];
        for candidate in ACCEPTED.iter().copied() {
            assert!(scan_email(candidate, 1).is_some());
        }
    }

    /// Each input is scanned from index 1, i.e. just after the `<`.
    #[test]
    fn bad_emails() {
        const REJECTED: &[&str] = &[
            "<@b.c>",
            "<foo@-example.com>",
            "<foo@example-.com>",
            "<a@notrailingperiod.>",
            "<a(noparens)@example.com>",
            "<\"noquotes\"@example.com>",
            "<a@sixty-four-letters-in-this-identifier-------------------------64>",
        ];
        for candidate in REJECTED.iter().copied() {
            assert!(scan_email(candidate, 1).is_none());
        }
    }
}
pulldown_cmark/scanners.rs

pulldown_cmark/
scanners.rs